diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 9cf847bcb1..be65d8e63c 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -41,6 +41,14 @@ storage: ansible_host: i-051642d372c0a4f32 pageserver-3.us-west-2.aws.neon.tech: ansible_host: i-00c3844beb9ad1c6b + pageserver-4.us-west-2.aws.neon.tech: + ansible_host: i-013263dd1c239adcc + pageserver-5.us-west-2.aws.neon.tech: + ansible_host: i-00ca6417c7bf96820 + pageserver-6.us-west-2.aws.neon.tech: + ansible_host: i-01cdf7d2bc1433b6a + pageserver-7.us-west-2.aws.neon.tech: + ansible_host: i-02eec9b40617db5bc safekeepers: hosts: @@ -50,4 +58,15 @@ storage: ansible_host: i-074682f9d3c712e7c safekeeper-2.us-west-2.aws.neon.tech: ansible_host: i-042b7efb1729d7966 - + safekeeper-3.us-west-2.aws.neon.tech: + ansible_host: i-089f6b9ef426dff76 + safekeeper-4.us-west-2.aws.neon.tech: + ansible_host: i-0fe6bf912c4710c82 + safekeeper-5.us-west-2.aws.neon.tech: + ansible_host: i-0a83c1c46d2b4e409 + safekeeper-6.us-west-2.aws.neon.tech: + ansible_host: i-0fef5317b8fdc9f8d + safekeeper-7.us-west-2.aws.neon.tech: + ansible_host: i-0be739190d4289bf9 + safekeeper-8.us-west-2.aws.neon.tech: + ansible_host: i-00e851803669e5cfe diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index 39f5613935..a54ced7f3a 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -35,6 +35,8 @@ storage: hosts: pageserver-0.eu-west-1.aws.neon.build: ansible_host: i-01d496c5041c7f34c + pageserver-1.eu-west-1.aws.neon.build: + ansible_host: i-0e8013e239ce3928c safekeepers: hosts: @@ -44,3 +46,15 @@ storage: ansible_host: i-06969ee1bf2958bfc safekeeper-2.eu-west-1.aws.neon.build: ansible_host: i-087892e9625984a0b + safekeeper-3.eu-west-1.aws.neon.build: + ansible_host: i-0a6f91660e99e8891 + safekeeper-4.eu-west-1.aws.neon.build: + ansible_host: i-0012e309e28e7c249 + safekeeper-5.eu-west-1.aws.neon.build: + ansible_host: i-085a2b1193287b32e + safekeeper-6.eu-west-1.aws.neon.build: + ansible_host: i-0c713248465ed0fbd + safekeeper-7.eu-west-1.aws.neon.build: + ansible_host: i-02ad231aed2a80b7a + safekeeper-8.eu-west-1.aws.neon.build: + ansible_host: i-0dbbd8ffef66efda8 diff --git a/.github/helm-values/dev-eu-central-1-alpha.pg-sni-router.yaml b/.github/helm-values/dev-eu-central-1-alpha.pg-sni-router.yaml new file mode 100644 index 0000000000..a80423b12d --- /dev/null +++ b/.github/helm-values/dev-eu-central-1-alpha.pg-sni-router.yaml @@ -0,0 +1,19 @@ +useCertManager: true + +replicaCount: 3 + +exposedService: + # exposedService.port -- Exposed Service proxy port + port: 4432 + annotations: + external-dns.alpha.kubernetes.io/hostname: "*.snirouter.alpha.eu-central-1.internal.aws.neon.build" + +settings: + domain: "*.snirouter.alpha.eu-central-1.internal.aws.neon.build" + sentryEnvironment: "staging" + +imagePullSecrets: + - name: docker-hub-neon + +metrics: + enabled: false diff --git a/.github/helm-values/dev-eu-west-1-zeta.pg-sni-router.yaml b/.github/helm-values/dev-eu-west-1-zeta.pg-sni-router.yaml new file mode 100644 index 0000000000..c9c628af0c --- /dev/null +++ b/.github/helm-values/dev-eu-west-1-zeta.pg-sni-router.yaml @@ -0,0 +1,19 @@ +useCertManager: true + +replicaCount: 3 + +exposedService: + # exposedService.port -- Exposed Service proxy port + port: 4432 + annotations: + external-dns.alpha.kubernetes.io/hostname: "*.snirouter.zeta.eu-west-1.internal.aws.neon.build" + +settings: + domain: "*.snirouter.zeta.eu-west-1.internal.aws.neon.build" + sentryEnvironment: "staging" + +imagePullSecrets: + - name: docker-hub-neon + +metrics: + enabled: false diff --git a/.github/helm-values/dev-us-east-2-beta.pg-sni-router.yaml b/.github/helm-values/dev-us-east-2-beta.pg-sni-router.yaml new file mode 100644 index 0000000000..68ad096df7 --- /dev/null +++ b/.github/helm-values/dev-us-east-2-beta.pg-sni-router.yaml @@ -0,0 +1,19 @@ +useCertManager: true + +replicaCount: 3 + +exposedService: + # exposedService.port -- Exposed Service proxy port + port: 4432 + annotations: + external-dns.alpha.kubernetes.io/hostname: "*.snirouter.beta.us-east-2.internal.aws.neon.build" + +settings: + domain: "*.snirouter.beta.us-east-2.internal.aws.neon.build" + sentryEnvironment: "staging" + +imagePullSecrets: + - name: docker-hub-neon + +metrics: + enabled: false diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.pg-sni-router.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.pg-sni-router.yaml new file mode 100644 index 0000000000..478ad5631c --- /dev/null +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.pg-sni-router.yaml @@ -0,0 +1,19 @@ +useCertManager: true + +replicaCount: 3 + +exposedService: + # exposedService.port -- Exposed Service proxy port + port: 4432 + annotations: + external-dns.alpha.kubernetes.io/hostname: "*.snirouter.epsilon.ap-southeast-1.internal.aws.neon.tech" + +settings: + domain: "*.snirouter.epsilon.ap-southeast-1.internal.aws.neon.tech" + sentryEnvironment: "production" + +imagePullSecrets: + - name: docker-hub-neon + +metrics: + enabled: false diff --git a/.github/helm-values/prod-eu-central-1-gamma.pg-sni-router.yaml b/.github/helm-values/prod-eu-central-1-gamma.pg-sni-router.yaml new file mode 100644 index 0000000000..08a0a163bc --- /dev/null +++ b/.github/helm-values/prod-eu-central-1-gamma.pg-sni-router.yaml @@ -0,0 +1,19 @@ +useCertManager: true + +replicaCount: 3 + +exposedService: + # exposedService.port -- Exposed Service proxy port + port: 4432 + annotations: + external-dns.alpha.kubernetes.io/hostname: "*.snirouter.gamma.eu-central-1.internal.aws.neon.tech" + +settings: + domain: "*.snirouter.gamma.eu-central-1.internal.aws.neon.tech" + sentryEnvironment: "production" + +imagePullSecrets: + - name: docker-hub-neon + +metrics: + enabled: false diff --git a/.github/helm-values/prod-us-east-1-theta.pg-sni-router.yaml b/.github/helm-values/prod-us-east-1-theta.pg-sni-router.yaml new file mode 100644 index 0000000000..ab308131bc --- /dev/null +++ b/.github/helm-values/prod-us-east-1-theta.pg-sni-router.yaml @@ -0,0 +1,19 @@ +useCertManager: true + +replicaCount: 3 + +exposedService: + # exposedService.port -- Exposed Service proxy port + port: 4432 + annotations: + external-dns.alpha.kubernetes.io/hostname: "*.snirouter.theta.us-east-1.internal.aws.neon.tech" + +settings: + domain: "*.snirouter.theta.us-east-1.internal.aws.neon.tech" + sentryEnvironment: "production" + +imagePullSecrets: + - name: docker-hub-neon + +metrics: + enabled: false diff --git a/.github/helm-values/prod-us-east-2-delta.pg-sni-router.yaml b/.github/helm-values/prod-us-east-2-delta.pg-sni-router.yaml new file mode 100644 index 0000000000..ecb3f156ec --- /dev/null +++ b/.github/helm-values/prod-us-east-2-delta.pg-sni-router.yaml @@ -0,0 +1,19 @@ +useCertManager: true + +replicaCount: 3 + +exposedService: + # exposedService.port -- Exposed Service proxy port + port: 4432 + annotations: + external-dns.alpha.kubernetes.io/hostname: "*.snirouter.delta.us-east-2.internal.aws.neon.tech" + +settings: + domain: "*.snirouter.delta.us-east-2.internal.aws.neon.tech" + sentryEnvironment: "production" + +imagePullSecrets: + - name: docker-hub-neon + +metrics: + enabled: false diff --git a/.github/helm-values/prod-us-west-2-eta.pg-sni-router.yaml b/.github/helm-values/prod-us-west-2-eta.pg-sni-router.yaml new file mode 100644 index 0000000000..942250c419 --- /dev/null +++ b/.github/helm-values/prod-us-west-2-eta.pg-sni-router.yaml @@ -0,0 +1,19 @@ +useCertManager: true + +replicaCount: 3 + +exposedService: + # exposedService.port -- Exposed Service proxy port + port: 4432 + annotations: + external-dns.alpha.kubernetes.io/hostname: "*.snirouter.eta.us-west-2.internal.aws.neon.tech" + +settings: + domain: "*.snirouter.eta.us-west-2.internal.aws.neon.tech" + sentryEnvironment: "production" + +imagePullSecrets: + - name: docker-hub-neon + +metrics: + enabled: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e5ba7aa3eb..90f0395c7c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -418,10 +418,7 @@ jobs: - uses: actions/github-script@v6 if: > !cancelled() && - github.event_name == 'pull_request' && ( - steps.create-allure-report-debug.outputs.report-url || - steps.create-allure-report-release.outputs.report-url - ) + github.event_name == 'pull_request' with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml index 5d1c6e0e16..6e77a3b571 100644 --- a/.github/workflows/deploy-dev.yml +++ b/.github/workflows/deploy-dev.yml @@ -27,6 +27,11 @@ on: required: true type: boolean default: true + deployPgSniRouter: + description: 'Deploy pg-sni-router' + required: true + type: boolean + default: true env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} @@ -227,3 +232,49 @@ jobs: - name: Cleanup helm folder run: rm -rf ~/.cache + + deploy-pg-sni-router: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployPgSniRouter + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + - target_region: eu-west-1 + target_cluster: dev-eu-west-1-zeta + - target_region: eu-central-1 + target_cluster: dev-eu-central-1-alpha + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1-node16 + with: + role-to-assume: arn:aws:iam::369495373322:role/github-runner + aws-region: eu-central-1 + role-skip-session-tagging: true + role-duration-seconds: 1800 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy pg-sni-router + run: + helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --debug --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index 9fa31b3225..baa44d8094 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -27,6 +27,11 @@ on: required: true type: boolean default: true + deployPgSniRouter: + description: 'Deploy pg-sni-router' + required: true + type: boolean + default: true disclamerAcknowledged: description: 'I confirm that there is an emergency and I can not use regular release workflow' required: true @@ -171,3 +176,42 @@ jobs: - name: Deploy storage-broker run: helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + deploy-pg-sni-router: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployPgSniRouter && inputs.disclamerAcknowledged + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + - target_region: us-east-1 + target_cluster: prod-us-east-1-theta + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy pg-sni-router + run: + helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --debug --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s diff --git a/Cargo.lock b/Cargo.lock index c9af0d04a7..1cd96d278e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1574,6 +1574,21 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.1.0" @@ -2361,6 +2376,24 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "native-tls" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +dependencies = [ + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "nix" version = "0.26.2" @@ -2483,12 +2516,50 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "openssl" +version = "0.10.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.15", +] + [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "openssl-sys" +version = "0.9.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "opentelemetry" version = "0.18.0" @@ -2682,6 +2753,7 @@ dependencies = [ "tenant_size_model", "thiserror", "tokio", + "tokio-io-timeout", "tokio-postgres", "tokio-tar", "tokio-util", @@ -2816,6 +2888,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" + [[package]] name = "plotters" version = "0.3.4" @@ -2847,7 +2925,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" dependencies = [ "bytes", "fallible-iterator", @@ -2857,10 +2935,21 @@ dependencies = [ "tokio-postgres", ] +[[package]] +name = "postgres-native-tls" +version = "0.5.0" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +dependencies = [ + "native-tls", + "tokio", + "tokio-native-tls", + "tokio-postgres", +] + [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" dependencies = [ "base64 0.20.0", "byteorder", @@ -2878,7 +2967,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" dependencies = [ "bytes", "fallible-iterator", @@ -2959,7 +3048,6 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "rand", - "serde", "thiserror", "tokio", "tracing", @@ -3110,10 +3198,12 @@ dependencies = [ "itertools", "md5", "metrics", + "native-tls", "once_cell", "opentelemetry", "parking_lot", "pin-project-lite", + "postgres-native-tls", "postgres_backend", "pq_proto", "prometheus", @@ -3568,6 +3658,7 @@ dependencies = [ "const_format", "crc32c", "fs2", + "futures", "git-version", "hex", "humantime", @@ -3582,7 +3673,9 @@ dependencies = [ "pq_proto", "regex", "remote_storage", + "reqwest", "safekeeper_api", + "scopeguard", "serde", "serde_json", "serde_with", @@ -3869,8 +3962,7 @@ dependencies = [ [[package]] name = "sharded-slab" version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31" +source = "git+https://github.com/neondatabase/sharded-slab.git?rev=98d16753ab01c61f0a028de44167307a00efea00#98d16753ab01c61f0a028de44167307a00efea00" dependencies = [ "lazy_static", ] @@ -4326,10 +4418,20 @@ dependencies = [ "syn 2.0.15", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" dependencies = [ "async-trait", "byteorder", @@ -4871,6 +4973,7 @@ dependencies = [ "bincode", "byteorder", "bytes", + "chrono", "criterion", "futures", "heapless", @@ -4882,6 +4985,7 @@ dependencies = [ "nix", "once_cell", "pin-project-lite", + "pq_proto", "rand", "regex", "routerify", @@ -4919,6 +5023,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" @@ -5297,13 +5407,11 @@ name = "workspace_hack" version = "0.1.0" dependencies = [ "anyhow", - "byteorder", "bytes", "chrono", "clap 4.2.2", "clap_builder", "crossbeam-utils", - "digest", "either", "fail", "futures", diff --git a/Cargo.toml b/Cargo.toml index f4872433cd..b73e29ef6c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,6 +62,7 @@ jsonwebtoken = "8" libc = "0.2" md5 = "0.7.0" memoffset = "0.8" +native-tls = "0.2" nix = "0.26" notify = "5.0.0" num_cpus = "1.15" @@ -124,10 +125,11 @@ env_logger = "0.10" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" } ## Other git libraries @@ -159,10 +161,16 @@ rstest = "0.17" tempfile = "3.4" tonic-build = "0.9" +[patch.crates-io] + # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. -[patch.crates-io] -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } + +# Changes the MAX_THREADS limit from 4096 to 32768. +# This is a temporary workaround for using tracing from many threads in safekeepers code, +# until async safekeepers patch is merged to the main. +sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" } ################# Binary contents sections diff --git a/Dockerfile b/Dockerfile index 6f7d2c32a5..f83f3b1c21 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,15 @@ COPY --chown=nonroot . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ -&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin storage_broker --bin proxy --locked --release \ + && mold -run cargo build \ + --bin pg_sni_router \ + --bin pageserver \ + --bin pageserver_binutils \ + --bin draw_timeline_dir \ + --bin safekeeper \ + --bin storage_broker \ + --bin proxy \ + --locked --release \ && cachepot -s # Build final image @@ -63,6 +71,7 @@ RUN set -e \ && useradd -d /data neon \ && chown -R neon:neon /data +COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 09278e1726..0e0d71b3f1 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,7 +8,7 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; use control_plane::endpoint::ComputeControlPlane; -use control_plane::endpoint::Replication; +use control_plane::endpoint::ComputeMode; use control_plane::local_env::LocalEnv; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; @@ -481,7 +481,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - timeline_id, None, pg_version, - Replication::Primary, + ComputeMode::Primary, )?; println!("Done"); } @@ -568,8 +568,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .iter() .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id) { - let lsn_str = match endpoint.replication { - Replication::Static(lsn) => { + let lsn_str = match endpoint.mode { + ComputeMode::Static(lsn) => { // -> read-only endpoint // Use the node's LSN. lsn.to_string() @@ -632,21 +632,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .copied() .unwrap_or(false); - let replication = match (lsn, hot_standby) { - (Some(lsn), false) => Replication::Static(lsn), - (None, true) => Replication::Replica, - (None, false) => Replication::Primary, + let mode = match (lsn, hot_standby) { + (Some(lsn), false) => ComputeMode::Static(lsn), + (None, true) => ComputeMode::Replica, + (None, false) => ComputeMode::Primary, (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), }; - cplane.new_endpoint( - tenant_id, - &endpoint_id, - timeline_id, - port, - pg_version, - replication, - )?; + cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?; } "start" => { let port: Option = sub_args.get_one::("port").copied(); @@ -670,11 +663,11 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .unwrap_or(false); if let Some(endpoint) = endpoint { - match (&endpoint.replication, hot_standby) { - (Replication::Static(_), true) => { + match (&endpoint.mode, hot_standby) { + (ComputeMode::Static(_), true) => { bail!("Cannot start a node in hot standby mode when it is already configured as a static replica") } - (Replication::Primary, true) => { + (ComputeMode::Primary, true) => { bail!("Cannot start a node as a hot standby replica, it is already configured as primary node") } _ => {} @@ -701,10 +694,10 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .copied() .context("Failed to `pg-version` from the argument string")?; - let replication = match (lsn, hot_standby) { - (Some(lsn), false) => Replication::Static(lsn), - (None, true) => Replication::Replica, - (None, false) => Replication::Primary, + let mode = match (lsn, hot_standby) { + (Some(lsn), false) => ComputeMode::Static(lsn), + (None, true) => ComputeMode::Replica, + (None, false) => ComputeMode::Primary, (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), }; @@ -721,7 +714,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( timeline_id, port, pg_version, - replication, + mode, )?; ep.start(&auth_token)?; } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 7d3485518f..5a1f93dc99 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -11,15 +11,31 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, }; -use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION}; +use crate::local_env::LocalEnv; use crate::pageserver::PageServerNode; use crate::postgresql_conf::PostgresConf; +// contents of a endpoint.json file +#[serde_as] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +pub struct EndpointConf { + name: String, + #[serde_as(as = "DisplayFromStr")] + tenant_id: TenantId, + #[serde_as(as = "DisplayFromStr")] + timeline_id: TimelineId, + mode: ComputeMode, + port: u16, + pg_version: u32, +} + // // ComputeControlPlane // @@ -70,7 +86,7 @@ impl ComputeControlPlane { timeline_id: TimelineId, port: Option, pg_version: u32, - replication: Replication, + mode: ComputeMode, ) -> Result> { let port = port.unwrap_or_else(|| self.get_port()); @@ -80,12 +96,22 @@ impl ComputeControlPlane { env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), timeline_id, - replication, + mode, tenant_id, pg_version, }); - ep.create_pgdata()?; + std::fs::write( + ep.endpoint_path().join("endpoint.json"), + serde_json::to_string_pretty(&EndpointConf { + name: name.to_string(), + tenant_id, + timeline_id, + mode, + port, + pg_version, + })?, + )?; ep.setup_pg_conf()?; self.endpoints.insert(ep.name.clone(), Arc::clone(&ep)); @@ -96,12 +122,13 @@ impl ComputeControlPlane { /////////////////////////////////////////////////////////////////////////////// -#[derive(Debug, Clone, Eq, PartialEq)] -pub enum Replication { +#[serde_as] +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)] +pub enum ComputeMode { // Regular read-write node Primary, // if recovery_target_lsn is provided, and we want to pin the node to a specific LSN - Static(Lsn), + Static(#[serde_as(as = "DisplayFromStr")] Lsn), // Hot standby; read-only replica. // Future versions may want to distinguish between replicas with hot standby // feedback and other kinds of replication configurations. @@ -115,7 +142,7 @@ pub struct Endpoint { pub tenant_id: TenantId, pub timeline_id: TimelineId, // Some(lsn) if this is a read-only endpoint anchored at 'lsn'. None for the primary. - pub replication: Replication, + pub mode: ComputeMode, // port and address of the Postgres server pub address: SocketAddr, @@ -144,50 +171,20 @@ impl Endpoint { let fname = entry.file_name(); let name = fname.to_str().unwrap().to_string(); - // Read config file into memory - let cfg_path = entry.path().join("pgdata").join("postgresql.conf"); - let cfg_path_str = cfg_path.to_string_lossy(); - let mut conf_file = File::open(&cfg_path) - .with_context(|| format!("failed to open config file in {}", cfg_path_str))?; - let conf = PostgresConf::read(&mut conf_file) - .with_context(|| format!("failed to read config file in {}", cfg_path_str))?; - - // Read a few options from the config file - let context = format!("in config file {}", cfg_path_str); - let port: u16 = conf.parse_field("port", &context)?; - let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?; - let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; - - // Read postgres version from PG_VERSION file to determine which postgres version binary to use. - // If it doesn't exist, assume broken data directory and use default pg version. - let pg_version_path = entry.path().join("PG_VERSION"); - - let pg_version_str = - fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string()); - let pg_version = u32::from_str(&pg_version_str)?; - - // parse recovery_target_lsn and primary_conninfo into Recovery Target, if any - let replication = if let Some(lsn_str) = conf.get("recovery_target_lsn") { - Replication::Static(Lsn::from_str(lsn_str)?) - } else if let Some(slot_name) = conf.get("primary_slot_name") { - let slot_name = slot_name.to_string(); - let prefix = format!("repl_{}_", timeline_id); - assert!(slot_name.starts_with(&prefix)); - Replication::Replica - } else { - Replication::Primary - }; + // Read the endpoint.json file + let conf: EndpointConf = + serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; // ok now Ok(Endpoint { - address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), + address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port), name, env: env.clone(), pageserver: Arc::clone(pageserver), - timeline_id, - replication, - tenant_id, - pg_version, + timeline_id: conf.timeline_id, + mode: conf.mode, + tenant_id: conf.tenant_id, + pg_version: conf.pg_version, }) } @@ -323,8 +320,8 @@ impl Endpoint { conf.append_line(""); // Replication-related configurations, such as WAL sending - match &self.replication { - Replication::Primary => { + match &self.mode { + ComputeMode::Primary => { // Configure backpressure // - Replication write lag depends on how fast the walreceiver can process incoming WAL. // This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec, @@ -366,10 +363,10 @@ impl Endpoint { conf.append("synchronous_standby_names", "pageserver"); } } - Replication::Static(lsn) => { + ComputeMode::Static(lsn) => { conf.append("recovery_target_lsn", &lsn.to_string()); } - Replication::Replica => { + ComputeMode::Replica => { assert!(!self.env.safekeepers.is_empty()); // TODO: use future host field from safekeeper spec @@ -409,8 +406,8 @@ impl Endpoint { } fn load_basebackup(&self, auth_token: &Option) -> Result<()> { - let backup_lsn = match &self.replication { - Replication::Primary => { + let backup_lsn = match &self.mode { + ComputeMode::Primary => { if !self.env.safekeepers.is_empty() { // LSN 0 means that it is bootstrap and we need to download just // latest data from the pageserver. That is a bit clumsy but whole bootstrap @@ -426,8 +423,8 @@ impl Endpoint { None } } - Replication::Static(lsn) => Some(*lsn), - Replication::Replica => { + ComputeMode::Static(lsn) => Some(*lsn), + ComputeMode::Replica => { None // Take the latest snapshot available to start with } }; @@ -526,7 +523,7 @@ impl Endpoint { // 3. Load basebackup self.load_basebackup(auth_token)?; - if self.replication != Replication::Primary { + if self.mode != ComputeMode::Primary { File::create(self.pgdata().join("standby.signal"))?; } diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index f6bf7c6fc2..453c58431a 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -50,11 +50,14 @@ impl QueryError { } } +/// Returns true if the given error is a normal consequence of a network issue, +/// or the client closing the connection. These errors can happen during normal +/// operations, and don't indicate a bug in our code. pub fn is_expected_io_error(e: &io::Error) -> bool { use io::ErrorKind::*; matches!( e.kind(), - ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut + BrokenPipe | ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut ) } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 969befc8e7..9f3f4dc20d 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -1,15 +1,13 @@ -use anyhow::*; -use core::time::Duration; +use anyhow::{bail, ensure}; use log::*; use postgres::types::PgLsn; use postgres::Client; use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; use std::cmp::Ordering; -use std::fs; use std::path::{Path, PathBuf}; -use std::process::{Command, Stdio}; -use std::time::Instant; +use std::process::Command; +use std::time::{Duration, Instant}; use tempfile::{tempdir, TempDir}; #[derive(Debug, Clone, PartialEq, Eq)] @@ -56,7 +54,7 @@ impl Conf { self.datadir.join("pg_wal") } - fn new_pg_command(&self, command: impl AsRef) -> Result { + fn new_pg_command(&self, command: impl AsRef) -> anyhow::Result { let path = self.pg_bin_dir()?.join(command); ensure!(path.exists(), "Command {:?} does not exist", path); let mut cmd = Command::new(path); @@ -66,7 +64,7 @@ impl Conf { Ok(cmd) } - pub fn initdb(&self) -> Result<()> { + pub fn initdb(&self) -> anyhow::Result<()> { if let Some(parent) = self.datadir.parent() { info!("Pre-creating parent directory {:?}", parent); // Tests may be run concurrently and there may be a race to create `test_output/`. @@ -80,7 +78,7 @@ impl Conf { let output = self .new_pg_command("initdb")? .arg("-D") - .arg(self.datadir.as_os_str()) + .arg(&self.datadir) .args(["-U", "postgres", "--no-instructions", "--no-sync"]) .output()?; debug!("initdb output: {:?}", output); @@ -93,26 +91,18 @@ impl Conf { Ok(()) } - pub fn start_server(&self) -> Result { + pub fn start_server(&self) -> anyhow::Result { info!("Starting Postgres server in {:?}", self.datadir); - let log_file = fs::File::create(self.datadir.join("pg.log")).with_context(|| { - format!( - "Failed to create pg.log file in directory {}", - self.datadir.display() - ) - })?; let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols) let unix_socket_dir_path = unix_socket_dir.path().to_owned(); let server_process = self .new_pg_command("postgres")? .args(["-c", "listen_addresses="]) .arg("-k") - .arg(unix_socket_dir_path.as_os_str()) + .arg(&unix_socket_dir_path) .arg("-D") - .arg(self.datadir.as_os_str()) - .args(["-c", "logging_collector=on"]) // stderr will mess up with tests output + .arg(&self.datadir) .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg])) - .stderr(Stdio::from(log_file)) .spawn()?; let server = PostgresServer { process: server_process, @@ -121,7 +111,7 @@ impl Conf { let mut c = postgres::Config::new(); c.host_path(&unix_socket_dir_path); c.user("postgres"); - c.connect_timeout(Duration::from_millis(1000)); + c.connect_timeout(Duration::from_millis(10000)); c }, }; @@ -132,7 +122,7 @@ impl Conf { &self, first_segment_name: &str, last_segment_name: &str, - ) -> Result { + ) -> anyhow::Result { let first_segment_file = self.datadir.join(first_segment_name); let last_segment_file = self.datadir.join(last_segment_name); info!( @@ -142,10 +132,7 @@ impl Conf { ); let output = self .new_pg_command("pg_waldump")? - .args([ - &first_segment_file.as_os_str(), - &last_segment_file.as_os_str(), - ]) + .args([&first_segment_file, &last_segment_file]) .output()?; debug!("waldump output: {:?}", output); Ok(output) @@ -153,10 +140,9 @@ impl Conf { } impl PostgresServer { - pub fn connect_with_timeout(&self) -> Result { + pub fn connect_with_timeout(&self) -> anyhow::Result { let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap(); while Instant::now() < retry_until { - use std::result::Result::Ok; if let Ok(client) = self.client_config.connect(postgres::NoTls) { return Ok(client); } @@ -173,7 +159,6 @@ impl PostgresServer { impl Drop for PostgresServer { fn drop(&mut self) { - use std::result::Result::Ok; match self.process.try_wait() { Ok(Some(_)) => return, Ok(None) => { @@ -188,12 +173,12 @@ impl Drop for PostgresServer { } pub trait PostgresClientExt: postgres::GenericClient { - fn pg_current_wal_insert_lsn(&mut self) -> Result { + fn pg_current_wal_insert_lsn(&mut self) -> anyhow::Result { Ok(self .query_one("SELECT pg_current_wal_insert_lsn()", &[])? .get(0)) } - fn pg_current_wal_flush_lsn(&mut self) -> Result { + fn pg_current_wal_flush_lsn(&mut self) -> anyhow::Result { Ok(self .query_one("SELECT pg_current_wal_flush_lsn()", &[])? .get(0)) @@ -202,7 +187,7 @@ pub trait PostgresClientExt: postgres::GenericClient { impl PostgresClientExt for C {} -pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> { +pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow::Result<()> { client.execute("create extension if not exists neon_test_utils", &[])?; let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0); @@ -236,13 +221,13 @@ pub trait Crafter { /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from. /// May include or exclude Lsn(0) and the end-of-wal. /// * The expected end-of-wal LSN. - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)>; + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)>; } fn craft_internal( client: &mut C, - f: impl Fn(&mut C, PgLsn) -> Result<(Vec, Option)>, -) -> Result<(Vec, PgLsn)> { + f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec, Option)>, +) -> anyhow::Result<(Vec, PgLsn)> { ensure_server_config(client)?; let initial_lsn = client.pg_current_wal_insert_lsn()?; @@ -274,7 +259,7 @@ fn craft_internal( pub struct Simple; impl Crafter for Simple { const NAME: &'static str = "simple"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { craft_internal(client, |client, _| { client.execute("CREATE table t(x int)", &[])?; Ok((Vec::new(), None)) @@ -285,7 +270,7 @@ impl Crafter for Simple { pub struct LastWalRecordXlogSwitch; impl Crafter for LastWalRecordXlogSwitch { const NAME: &'static str = "last_wal_record_xlog_switch"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; @@ -307,7 +292,7 @@ impl Crafter for LastWalRecordXlogSwitch { pub struct LastWalRecordXlogSwitchEndsOnPageBoundary; impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; @@ -374,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { fn craft_single_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, -) -> Result<(Vec, PgLsn)> { +) -> anyhow::Result<(Vec, PgLsn)> { craft_internal(client, |client, initial_lsn| { ensure!( initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), @@ -416,7 +401,7 @@ fn craft_single_logical_message( pub struct WalRecordCrossingSegmentFollowedBySmallOne; impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { craft_single_logical_message(client, true) } } @@ -424,7 +409,7 @@ impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { pub struct LastWalRecordCrossingSegment; impl Crafter for LastWalRecordCrossingSegment { const NAME: &'static str = "last_wal_record_crossing_segment"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { craft_single_logical_message(client, false) } } diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index 76b71729ed..b286eb0358 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -10,7 +10,6 @@ byteorder.workspace = true pin-project-lite.workspace = true postgres-protocol.workspace = true rand.workspace = true -serde.workspace = true tokio.workspace = true tracing.workspace = true thiserror.workspace = true diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index ed0239072a..2143ad2530 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -6,15 +6,10 @@ pub mod framed; use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use postgres_protocol::PG_EPOCH; -use serde::{Deserialize, Serialize}; -use std::{ - borrow::Cow, - collections::HashMap, - fmt, io, str, - time::{Duration, SystemTime}, -}; -use tracing::{trace, warn}; +use std::{borrow::Cow, collections::HashMap, fmt, io, str}; + +// re-export for use in utils pageserver_feedback.rs +pub use postgres_protocol::PG_EPOCH; pub type Oid = u32; pub type SystemId = u64; @@ -664,7 +659,7 @@ fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), ProtocolErr } /// Read cstring from buf, advancing it. -fn read_cstr(buf: &mut Bytes) -> Result { +pub fn read_cstr(buf: &mut Bytes) -> Result { let pos = buf .iter() .position(|x| *x == 0) @@ -939,175 +934,10 @@ impl<'a> BeMessage<'a> { } } -/// Feedback pageserver sends to safekeeper and safekeeper resends to compute. -/// Serialized in custom flexible key/value format. In replication protocol, it -/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres -/// Standby status update / Hot standby feedback messages. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub struct PageserverFeedback { - /// Last known size of the timeline. Used to enforce timeline size limit. - pub current_timeline_size: u64, - /// LSN last received and ingested by the pageserver. - pub last_received_lsn: u64, - /// LSN up to which data is persisted by the pageserver to its local disc. - pub disk_consistent_lsn: u64, - /// LSN up to which data is persisted by the pageserver on s3; safekeepers - /// consider WAL before it can be removed. - pub remote_consistent_lsn: u64, - pub replytime: SystemTime, -} - -// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback. -// Do not remove previously available fields because this might be backwards incompatible. -pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5; - -impl PageserverFeedback { - pub fn empty() -> PageserverFeedback { - PageserverFeedback { - current_timeline_size: 0, - last_received_lsn: 0, - remote_consistent_lsn: 0, - disk_consistent_lsn: 0, - replytime: SystemTime::now(), - } - } - - // Serialize PageserverFeedback using custom format - // to support protocol extensibility. - // - // Following layout is used: - // char - number of key-value pairs that follow. - // - // key-value pairs: - // null-terminated string - key, - // uint32 - value length in bytes - // value itself - // - // TODO: change serialized fields names once all computes migrate to rename. - pub fn serialize(&self, buf: &mut BytesMut) { - buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys - buf.put_slice(b"current_timeline_size\0"); - buf.put_i32(8); - buf.put_u64(self.current_timeline_size); - - buf.put_slice(b"ps_writelsn\0"); - buf.put_i32(8); - buf.put_u64(self.last_received_lsn); - buf.put_slice(b"ps_flushlsn\0"); - buf.put_i32(8); - buf.put_u64(self.disk_consistent_lsn); - buf.put_slice(b"ps_applylsn\0"); - buf.put_i32(8); - buf.put_u64(self.remote_consistent_lsn); - - let timestamp = self - .replytime - .duration_since(*PG_EPOCH) - .expect("failed to serialize pg_replytime earlier than PG_EPOCH") - .as_micros() as i64; - - buf.put_slice(b"ps_replytime\0"); - buf.put_i32(8); - buf.put_i64(timestamp); - } - - // Deserialize PageserverFeedback message - // TODO: change serialized fields names once all computes migrate to rename. - pub fn parse(mut buf: Bytes) -> PageserverFeedback { - let mut rf = PageserverFeedback::empty(); - let nfields = buf.get_u8(); - for _ in 0..nfields { - let key = read_cstr(&mut buf).unwrap(); - match key.as_ref() { - b"current_timeline_size" => { - let len = buf.get_i32(); - assert_eq!(len, 8); - rf.current_timeline_size = buf.get_u64(); - } - b"ps_writelsn" => { - let len = buf.get_i32(); - assert_eq!(len, 8); - rf.last_received_lsn = buf.get_u64(); - } - b"ps_flushlsn" => { - let len = buf.get_i32(); - assert_eq!(len, 8); - rf.disk_consistent_lsn = buf.get_u64(); - } - b"ps_applylsn" => { - let len = buf.get_i32(); - assert_eq!(len, 8); - rf.remote_consistent_lsn = buf.get_u64(); - } - b"ps_replytime" => { - let len = buf.get_i32(); - assert_eq!(len, 8); - let raw_time = buf.get_i64(); - if raw_time > 0 { - rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); - } else { - rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); - } - } - _ => { - let len = buf.get_i32(); - warn!( - "PageserverFeedback parse. unknown key {} of len {len}. Skip it.", - String::from_utf8_lossy(key.as_ref()) - ); - buf.advance(len as usize); - } - } - } - trace!("PageserverFeedback parsed is {:?}", rf); - rf - } -} - #[cfg(test)] mod tests { use super::*; - #[test] - fn test_replication_feedback_serialization() { - let mut rf = PageserverFeedback::empty(); - // Fill rf with some values - rf.current_timeline_size = 12345678; - // Set rounded time to be able to compare it with deserialized value, - // because it is rounded up to microseconds during serialization. - rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); - let mut data = BytesMut::new(); - rf.serialize(&mut data); - - let rf_parsed = PageserverFeedback::parse(data.freeze()); - assert_eq!(rf, rf_parsed); - } - - #[test] - fn test_replication_feedback_unknown_key() { - let mut rf = PageserverFeedback::empty(); - // Fill rf with some values - rf.current_timeline_size = 12345678; - // Set rounded time to be able to compare it with deserialized value, - // because it is rounded up to microseconds during serialization. - rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); - let mut data = BytesMut::new(); - rf.serialize(&mut data); - - // Add an extra field to the buffer and adjust number of keys - if let Some(first) = data.first_mut() { - *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1; - } - - data.put_slice(b"new_field_one\0"); - data.put_i32(8); - data.put_u64(42); - - // Parse serialized data and check that new field is not parsed - let rf_parsed = PageserverFeedback::parse(data.freeze()); - assert_eq!(rf, rf_parsed); - } - #[test] fn test_startup_message_params_options_escaped() { fn split_options(params: &StartupMessageParams) -> Vec> { diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 2b04dfdef6..8239ffff57 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -11,6 +11,7 @@ async-trait.workspace = true anyhow.workspace = true bincode.workspace = true bytes.workspace = true +chrono.workspace = true heapless.workspace = true hex = { workspace = true, features = ["serde"] } hyper = { workspace = true, features = ["full"] } @@ -36,6 +37,7 @@ strum_macros.workspace = true url.workspace = true uuid.workspace = true +pq_proto.workspace = true metrics.workspace = true workspace_hack.workspace = true diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index b11aef9892..4bfb5bf994 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -131,7 +131,9 @@ impl RequestCancelled { impl Drop for RequestCancelled { fn drop(&mut self) { - if let Some(span) = self.warn.take() { + if std::thread::panicking() { + // we are unwinding due to panicking, assume we are not dropped for cancellation + } else if let Some(span) = self.warn.take() { // the span has all of the info already, but the outer `.instrument(span)` has already // been dropped, so we need to manually re-enter it for this message. // diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 40e61e3d0c..8981fdd1dd 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -1,9 +1,7 @@ -use std::fmt::Display; - use anyhow::Context; use bytes::Buf; use hyper::{header, Body, Request, Response, StatusCode}; -use serde::{Deserialize, Serialize, Serializer}; +use serde::{Deserialize, Serialize}; use super::error::ApiError; @@ -33,12 +31,3 @@ pub fn json_response( .map_err(|e| ApiError::InternalServerError(e.into()))?; Ok(response) } - -/// Serialize through Display trait. -pub fn display_serialize(z: &F, s: S) -> Result -where - S: Serializer, - F: Display, -{ - s.serialize_str(&format!("{}", z)) -} diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index b27c5cda35..20b601f68d 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -265,6 +265,26 @@ impl fmt::Display for TenantTimelineId { } } +impl FromStr for TenantTimelineId { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let mut parts = s.split('/'); + let tenant_id = parts + .next() + .ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain tenant_id"))? + .parse()?; + let timeline_id = parts + .next() + .ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain timeline_id"))? + .parse()?; + if parts.next().is_some() { + anyhow::bail!("TenantTimelineId must contain only tenant_id and timeline_id"); + } + Ok(TenantTimelineId::new(tenant_id, timeline_id)) + } +} + // Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued // by the console. #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)] diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 10862d1771..a946962342 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -54,6 +54,8 @@ pub mod measured_stream; pub mod serde_percent; pub mod serde_regex; +pub mod pageserver_feedback; + pub mod tracing_span_assert; /// use with fail::cfg("$name", "return(2000)") diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs new file mode 100644 index 0000000000..a3b53201d3 --- /dev/null +++ b/libs/utils/src/pageserver_feedback.rs @@ -0,0 +1,214 @@ +use std::time::{Duration, SystemTime}; + +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use pq_proto::{read_cstr, PG_EPOCH}; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; +use tracing::{trace, warn}; + +use crate::lsn::Lsn; + +/// Feedback pageserver sends to safekeeper and safekeeper resends to compute. +/// Serialized in custom flexible key/value format. In replication protocol, it +/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres +/// Standby status update / Hot standby feedback messages. +/// +/// serde Serialize is used only for human readable dump to json (e.g. in +/// safekeepers debug_dump). +#[serde_as] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct PageserverFeedback { + /// Last known size of the timeline. Used to enforce timeline size limit. + pub current_timeline_size: u64, + /// LSN last received and ingested by the pageserver. Controls backpressure. + #[serde_as(as = "DisplayFromStr")] + pub last_received_lsn: Lsn, + /// LSN up to which data is persisted by the pageserver to its local disc. + /// Controls backpressure. + #[serde_as(as = "DisplayFromStr")] + pub disk_consistent_lsn: Lsn, + /// LSN up to which data is persisted by the pageserver on s3; safekeepers + /// consider WAL before it can be removed. + #[serde_as(as = "DisplayFromStr")] + pub remote_consistent_lsn: Lsn, + // Serialize with RFC3339 format. + #[serde(with = "serde_systemtime")] + pub replytime: SystemTime, +} + +// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback. +// Do not remove previously available fields because this might be backwards incompatible. +pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5; + +impl PageserverFeedback { + pub fn empty() -> PageserverFeedback { + PageserverFeedback { + current_timeline_size: 0, + last_received_lsn: Lsn::INVALID, + remote_consistent_lsn: Lsn::INVALID, + disk_consistent_lsn: Lsn::INVALID, + replytime: *PG_EPOCH, + } + } + + // Serialize PageserverFeedback using custom format + // to support protocol extensibility. + // + // Following layout is used: + // char - number of key-value pairs that follow. + // + // key-value pairs: + // null-terminated string - key, + // uint32 - value length in bytes + // value itself + // + // TODO: change serialized fields names once all computes migrate to rename. + pub fn serialize(&self, buf: &mut BytesMut) { + buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys + buf.put_slice(b"current_timeline_size\0"); + buf.put_i32(8); + buf.put_u64(self.current_timeline_size); + + buf.put_slice(b"ps_writelsn\0"); + buf.put_i32(8); + buf.put_u64(self.last_received_lsn.0); + buf.put_slice(b"ps_flushlsn\0"); + buf.put_i32(8); + buf.put_u64(self.disk_consistent_lsn.0); + buf.put_slice(b"ps_applylsn\0"); + buf.put_i32(8); + buf.put_u64(self.remote_consistent_lsn.0); + + let timestamp = self + .replytime + .duration_since(*PG_EPOCH) + .expect("failed to serialize pg_replytime earlier than PG_EPOCH") + .as_micros() as i64; + + buf.put_slice(b"ps_replytime\0"); + buf.put_i32(8); + buf.put_i64(timestamp); + } + + // Deserialize PageserverFeedback message + // TODO: change serialized fields names once all computes migrate to rename. + pub fn parse(mut buf: Bytes) -> PageserverFeedback { + let mut rf = PageserverFeedback::empty(); + let nfields = buf.get_u8(); + for _ in 0..nfields { + let key = read_cstr(&mut buf).unwrap(); + match key.as_ref() { + b"current_timeline_size" => { + let len = buf.get_i32(); + assert_eq!(len, 8); + rf.current_timeline_size = buf.get_u64(); + } + b"ps_writelsn" => { + let len = buf.get_i32(); + assert_eq!(len, 8); + rf.last_received_lsn = Lsn(buf.get_u64()); + } + b"ps_flushlsn" => { + let len = buf.get_i32(); + assert_eq!(len, 8); + rf.disk_consistent_lsn = Lsn(buf.get_u64()); + } + b"ps_applylsn" => { + let len = buf.get_i32(); + assert_eq!(len, 8); + rf.remote_consistent_lsn = Lsn(buf.get_u64()); + } + b"ps_replytime" => { + let len = buf.get_i32(); + assert_eq!(len, 8); + let raw_time = buf.get_i64(); + if raw_time > 0 { + rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); + } else { + rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); + } + } + _ => { + let len = buf.get_i32(); + warn!( + "PageserverFeedback parse. unknown key {} of len {len}. Skip it.", + String::from_utf8_lossy(key.as_ref()) + ); + buf.advance(len as usize); + } + } + } + trace!("PageserverFeedback parsed is {:?}", rf); + rf + } +} + +mod serde_systemtime { + use std::time::SystemTime; + + use chrono::{DateTime, Utc}; + use serde::{Deserialize, Deserializer, Serializer}; + + pub fn serialize(ts: &SystemTime, serializer: S) -> Result + where + S: Serializer, + { + let chrono_dt: DateTime = (*ts).into(); + serializer.serialize_str(&chrono_dt.to_rfc3339()) + } + + pub fn deserialize<'de, D>(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let time: String = Deserialize::deserialize(deserializer)?; + Ok(DateTime::parse_from_rfc3339(&time) + .map_err(serde::de::Error::custom)? + .into()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_replication_feedback_serialization() { + let mut rf = PageserverFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; + // Set rounded time to be able to compare it with deserialized value, + // because it is rounded up to microseconds during serialization. + rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + let mut data = BytesMut::new(); + rf.serialize(&mut data); + + let rf_parsed = PageserverFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); + } + + #[test] + fn test_replication_feedback_unknown_key() { + let mut rf = PageserverFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; + // Set rounded time to be able to compare it with deserialized value, + // because it is rounded up to microseconds during serialization. + rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + let mut data = BytesMut::new(); + rf.serialize(&mut data); + + // Add an extra field to the buffer and adjust number of keys + if let Some(first) = data.first_mut() { + *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1; + } + + data.put_slice(b"new_field_one\0"); + data.put_i32(8); + data.put_u64(42); + + // Parse serialized data and check that new field is not parsed + let rf_parsed = PageserverFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); + } +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 62a8c377db..99abfb2fb5 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -52,6 +52,7 @@ sync_wrapper.workspace = true tokio-tar.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } +tokio-io-timeout.workspace = true tokio-postgres.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 8f139a6596..ee5980212e 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { min_lsn = min(min_lsn, lsn_range.start); max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1)); - updates.insert_historic(Arc::new(layer)).unwrap(); + updates.insert_historic(Arc::new(layer)); } println!("min: {min_lsn}, max: {max_lsn}"); @@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) { is_incremental: false, short_id: format!("Layer {}", i), }; - updates.insert_historic(Arc::new(layer)).unwrap(); + updates.insert_historic(Arc::new(layer)); } updates.flush(); println!("Finished layer map init in {:?}", now.elapsed()); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index deb20f21f8..ec2f49c85a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,9 +1,9 @@ use metrics::core::{AtomicU64, GenericCounter}; use metrics::{ register_counter_vec, register_histogram, register_histogram_vec, register_int_counter, - register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec, Counter, CounterVec, - Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, - UIntGaugeVec, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, + Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, + UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::models::TenantState; @@ -287,14 +287,33 @@ impl EvictionsWithLowResidenceDuration { let Some(_counter) = self.counter.take() else { return; }; - EVICTIONS_WITH_LOW_RESIDENCE_DURATION - .remove_label_values(&[ - tenant_id, - timeline_id, - self.data_source, - &Self::threshold_label_value(self.threshold), - ]) - .expect("we own the metric, no-one else should remove it"); + + let threshold = Self::threshold_label_value(self.threshold); + + let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[ + tenant_id, + timeline_id, + self.data_source, + &threshold, + ]); + + match removed { + Err(e) => { + // this has been hit in staging as + // , but we don't know how. + // because we can be in the drop path already, don't risk: + // - "double-panic => illegal instruction" or + // - future "drop panick => abort" + // + // so just nag: (the error has the labels) + tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}"); + } + Ok(()) => { + // to help identify cases where we double-remove the same values, let's log all + // deletions? + tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source); + } + } } } @@ -459,6 +478,56 @@ pub static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); +// walreceiver metrics + +pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_walreceiver_started_connections_total", + "Number of started walreceiver connections" + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy = Lazy::new(|| { + register_int_gauge!( + "pageserver_walreceiver_active_managers", + "Number of active walreceiver managers" + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_SWITCHES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_walreceiver_switches_total", + "Number of walreceiver manager change_connection calls", + &["reason"] + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_BROKER_UPDATES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_walreceiver_broker_updates_total", + "Number of received broker updates in walreceiver" + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_walreceiver_candidates_events_total", + "Number of walreceiver candidate events", + &["event"] + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_CANDIDATES_ADDED: Lazy = + Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"])); + +pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy = + Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"])); + // Metrics collected on WAL redo operations // // We collect the time spent in actual WAL redo ('redo'), and time waiting diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 3610704f2c..a7a0d1a22e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -250,6 +250,15 @@ async fn page_service_conn_main( let peer_addr = socket.peer_addr().context("get peer address")?; + // setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements: + // - long enough for most valid compute connections + // - less than infinite to stop us from "leaking" connections to long-gone computes + // + // no write timeout is used, because the kernel is assumed to error writes after some time. + let mut socket = tokio_io_timeout::TimeoutReader::new(socket); + socket.set_timeout(Some(std::time::Duration::from_secs(60 * 10))); + let socket = std::pin::pin!(socket); + // XXX: pgbackend.run() should take the connection_ctx, // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler @@ -343,7 +352,7 @@ impl PageServerHandler { tenant_id: TenantId, timeline_id: TimelineId, ctx: RequestContext, - ) -> anyhow::Result<()> + ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { @@ -389,7 +398,9 @@ impl PageServerHandler { Some(FeMessage::CopyData(bytes)) => bytes, Some(FeMessage::Terminate) => break, Some(m) => { - anyhow::bail!("unexpected message: {m:?} during COPY"); + return Err(QueryError::Other(anyhow::anyhow!( + "unexpected message: {m:?} during COPY" + ))); } None => break, // client disconnected }; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6e1629c41e..76f93b8cfd 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -272,10 +272,7 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - // Initialize without loading the layer map. We started with an empty layer map, and already - // updated it for the layers that we created during the import. - let mut timelines = self.owning_tenant.timelines.lock().unwrap(); - self.initialize_with_lock(ctx, &mut timelines, false, true) + self.initialize(ctx) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -2404,8 +2401,6 @@ impl Tenant { ) })?; - // Initialize the timeline without loading the layer map, because we already updated the layer - // map above, when we imported the datadir. let timeline = { let mut timelines = self.timelines.lock().unwrap(); raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)? diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 0ee0c6f77d..8d06ccd565 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,7 +51,7 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; -use anyhow::{bail, Result}; +use anyhow::Result; use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; @@ -125,7 +125,7 @@ where /// /// Insert an on-disk layer. /// - pub fn insert_historic(&mut self, layer: Arc) -> anyhow::Result<()> { + pub fn insert_historic(&mut self, layer: Arc) { self.layer_map.insert_historic_noflush(layer) } @@ -273,21 +273,16 @@ where /// /// Helper function for BatchedUpdates::insert_historic /// - pub(self) fn insert_historic_noflush(&mut self, layer: Arc) -> anyhow::Result<()> { - let key = historic_layer_coverage::LayerKey::from(&*layer); - if self.historic.contains(&key) { - bail!( - "Attempt to insert duplicate layer {} in layer map", - layer.short_id() - ); - } - self.historic.insert(key, Arc::clone(&layer)); + pub(self) fn insert_historic_noflush(&mut self, layer: Arc) { + // TODO: See #3869, resulting #4088, attempted fix and repro #4094 + self.historic.insert( + historic_layer_coverage::LayerKey::from(&*layer), + Arc::clone(&layer), + ); if Self::is_l0(&layer) { self.l0_delta_layers.push(layer); } - - Ok(()) } /// @@ -839,7 +834,7 @@ mod tests { let expected_in_counts = (1, usize::from(expected_l0)); - map.batch_update().insert_historic(remote.clone()).unwrap(); + map.batch_update().insert_historic(remote.clone()); assert_eq!(count_layer_in(&map, &remote), expected_in_counts); let replaced = map diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 1fdcd5e5a4..b63c361314 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -417,14 +417,6 @@ impl BufferedHistoricLayerCoverage { } } - pub fn contains(&self, layer_key: &LayerKey) -> bool { - match self.buffer.get(layer_key) { - Some(None) => false, // layer remove was buffered - Some(_) => true, // layer insert was buffered - None => self.layers.contains_key(layer_key), // no buffered ops for this layer - } - } - pub fn insert(&mut self, layer_key: LayerKey, value: Value) { self.buffer.insert(layer_key, Some(value)); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5c671ffd63..bc55c2091c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -588,15 +588,25 @@ impl Timeline { let _timer = self.metrics.wait_lsn_time_histo.start_timer(); - self.last_record_lsn.wait_for_timeout(lsn, self.conf.wait_lsn_timeout).await - .with_context(|| - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - )?; - - Ok(()) + match self + .last_record_lsn + .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) + .await + { + Ok(()) => Ok(()), + seqwait_error => { + drop(_timer); + let walreceiver_status = self.walreceiver.status().await; + seqwait_error.with_context(|| format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, {}", + lsn, + self.get_last_record_lsn(), + self.get_disk_consistent_lsn(), + walreceiver_status.map(|status| status.to_human_readable_string()) + .unwrap_or_else(|| "WalReceiver status: Not active".to_string()), + )) + } + } } /// Check that it is valid to request operations with that lsn. @@ -1484,7 +1494,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer))?; + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1516,7 +1526,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer))?; + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1590,7 +1600,7 @@ impl Timeline { // remote index file? // If so, rename_to_backup those files & replace their local layer with // a RemoteLayer in the layer map so that we re-download them on-demand. - if let Some(local_layer) = &local_layer { + if let Some(local_layer) = local_layer { let local_layer_path = local_layer .local_path() .expect("caller must ensure that local_layers only contains local layers"); @@ -1615,6 +1625,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); + updates.remove_historic(local_layer); // fall-through to adding the remote layer } } else { @@ -1650,11 +1661,7 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - if let Some(local_layer) = &local_layer { - updates.replace_historic(local_layer, remote_layer)?; - } else { - updates.insert_historic(remote_layer)?; - } + updates.insert_historic(remote_layer); } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1678,11 +1685,7 @@ impl Timeline { LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted), ); let remote_layer = Arc::new(remote_layer); - if let Some(local_layer) = &local_layer { - updates.replace_historic(local_layer, remote_layer)?; - } else { - updates.insert_historic(remote_layer)?; - } + updates.insert_historic(remote_layer); } } } @@ -2730,7 +2733,7 @@ impl Timeline { .write() .unwrap() .batch_update() - .insert_historic(Arc::new(new_delta))?; + .insert_historic(Arc::new(new_delta)); // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2935,7 +2938,7 @@ impl Timeline { self.metrics .resident_physical_size_gauge .add(metadata.len()); - updates.insert_historic(Arc::new(l))?; + updates.insert_historic(Arc::new(l)); } updates.flush(); drop(layers); @@ -3368,7 +3371,7 @@ impl Timeline { new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); - updates.insert_historic(x)?; + updates.insert_historic(x); } // Now that we have reshuffled the data to set of new delta layers, we can diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 00f446af38..91f7208194 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -38,12 +38,14 @@ use std::sync::{Arc, Weak}; use std::time::Duration; use storage_broker::BrokerClientChannel; use tokio::select; -use tokio::sync::watch; +use tokio::sync::{watch, RwLock}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::TenantTimelineId; +use self::connection_manager::ConnectionManagerStatus; + use super::Timeline; #[derive(Clone)] @@ -63,6 +65,7 @@ pub struct WalReceiver { timeline_ref: Weak, conf: WalReceiverConf, started: AtomicBool, + manager_status: Arc>>, } impl WalReceiver { @@ -76,6 +79,7 @@ impl WalReceiver { timeline_ref, conf, started: AtomicBool::new(false), + manager_status: Arc::new(RwLock::new(None)), } } @@ -96,8 +100,8 @@ impl WalReceiver { let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); - let wal_receiver_conf = self.conf.clone(); + let loop_status = Arc::clone(&self.manager_status); task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverManager, @@ -115,24 +119,28 @@ impl WalReceiver { select! { _ = task_mgr::shutdown_watcher() => { info!("WAL receiver shutdown requested, shutting down"); - connection_manager_state.shutdown().await; - return Ok(()); + break; }, loop_step_result = connection_manager_loop_step( &mut broker_client, &mut connection_manager_state, &walreceiver_ctx, + &loop_status, ) => match loop_step_result { ControlFlow::Continue(()) => continue, ControlFlow::Break(()) => { info!("Connection manager loop ended, shutting down"); - connection_manager_state.shutdown().await; - return Ok(()); + break; } }, } } - }.instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id)) + + connection_manager_state.shutdown().await; + *loop_status.write().await = None; + Ok(()) + } + .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id)) ); self.started.store(true, atomic::Ordering::Release); @@ -149,6 +157,10 @@ impl WalReceiver { .await; self.started.store(false, atomic::Ordering::Release); } + + pub(super) async fn status(&self) -> Option { + self.manager_status.read().await.clone() + } } /// A handle of an asynchronous task. diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 731c5c4644..9cb17ea799 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -13,6 +13,10 @@ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, ti use super::{TaskStateUpdate, WalReceiverConf}; use crate::context::{DownloadBehavior, RequestContext}; +use crate::metrics::{ + WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED, + WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, +}; use crate::task_mgr::TaskKind; use crate::tenant::Timeline; use anyhow::Context; @@ -24,6 +28,7 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use storage_broker::BrokerClientChannel; use storage_broker::Streaming; +use tokio::sync::RwLock; use tokio::{select, sync::watch}; use tracing::*; @@ -43,6 +48,7 @@ pub(super) async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, connection_manager_state: &mut ConnectionManagerState, ctx: &RequestContext, + manager_status: &RwLock>, ) -> ControlFlow<(), ()> { let mut timeline_state_updates = connection_manager_state .timeline @@ -56,6 +62,11 @@ pub(super) async fn connection_manager_loop_step( } } + WALRECEIVER_ACTIVE_MANAGERS.inc(); + scopeguard::defer! { + WALRECEIVER_ACTIVE_MANAGERS.dec(); + } + let id = TenantTimelineId { tenant_id: connection_manager_state.timeline.tenant_id, timeline_id: connection_manager_state.timeline.timeline_id, @@ -180,6 +191,7 @@ pub(super) async fn connection_manager_loop_step( .change_connection(new_candidate, ctx) .await } + *manager_status.write().await = Some(connection_manager_state.manager_status()); } } @@ -267,6 +279,78 @@ pub(super) struct ConnectionManagerState { wal_stream_candidates: HashMap, } +/// An information about connection manager's current connection and connection candidates. +#[derive(Debug, Clone)] +pub struct ConnectionManagerStatus { + existing_connection: Option, + wal_stream_candidates: HashMap, +} + +impl ConnectionManagerStatus { + /// Generates a string, describing current connection status in a form, suitable for logging. + pub fn to_human_readable_string(&self) -> String { + let mut resulting_string = "WalReceiver status".to_string(); + match &self.existing_connection { + Some(connection) => { + if connection.has_processed_wal { + resulting_string.push_str(&format!( + " (update {}): streaming WAL from node {}, ", + connection.latest_wal_update.format("%Y-%m-%d %H:%M:%S"), + connection.node, + )); + + match (connection.streaming_lsn, connection.commit_lsn) { + (None, None) => resulting_string.push_str("no streaming data"), + (None, Some(commit_lsn)) => { + resulting_string.push_str(&format!("commit Lsn: {commit_lsn}")) + } + (Some(streaming_lsn), None) => { + resulting_string.push_str(&format!("streaming Lsn: {streaming_lsn}")) + } + (Some(streaming_lsn), Some(commit_lsn)) => resulting_string.push_str( + &format!("commit|streaming Lsn: {commit_lsn}|{streaming_lsn}"), + ), + } + } else if connection.is_connected { + resulting_string.push_str(&format!( + " (update {}): connecting to node {}", + connection + .latest_connection_update + .format("%Y-%m-%d %H:%M:%S"), + connection.node, + )); + } else { + resulting_string.push_str(&format!( + " (update {}): initializing node {} connection", + connection + .latest_connection_update + .format("%Y-%m-%d %H:%M:%S"), + connection.node, + )); + } + } + None => resulting_string.push_str(": disconnected"), + } + + resulting_string.push_str(", safekeeper candidates (id|update_time|commit_lsn): ["); + let mut candidates = self.wal_stream_candidates.iter().peekable(); + while let Some((node_id, candidate_info)) = candidates.next() { + resulting_string.push_str(&format!( + "({}|{}|{})", + node_id, + candidate_info.latest_update.format("%H:%M:%S"), + Lsn(candidate_info.timeline.commit_lsn) + )); + if candidates.peek().is_some() { + resulting_string.push_str(", "); + } + } + resulting_string.push(']'); + + resulting_string + } +} + /// Current connection data. #[derive(Debug)] struct WalConnection { @@ -293,14 +377,14 @@ struct NewCommittedWAL { discovered_at: NaiveDateTime, } -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] struct RetryInfo { next_retry_at: Option, retry_duration_seconds: f64, } /// Data about the timeline to connect to, received from the broker. -#[derive(Debug)] +#[derive(Debug, Clone)] struct BrokerSkTimeline { timeline: SafekeeperTimelineInfo, /// Time at which the data was fetched from the broker last time, to track the stale data. @@ -325,9 +409,14 @@ impl ConnectionManagerState { /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { + WALRECEIVER_SWITCHES + .with_label_values(&[new_sk.reason.name()]) + .inc(); + self.drop_old_connection(true).await; let id = self.id; + let node_id = new_sk.safekeeper_id; let connect_timeout = self.conf.wal_connect_timeout; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( @@ -343,12 +432,13 @@ impl ConnectionManagerState { cancellation, connect_timeout, ctx, + node_id, ) .await .context("walreceiver connection handling failure") } .instrument( - info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, node_id = %new_sk.safekeeper_id), + info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id), ) }); @@ -364,6 +454,7 @@ impl ConnectionManagerState { latest_wal_update: now, streaming_lsn: None, commit_lsn: None, + node: node_id, }, connection_task: connection_handle, discovered_new_wal: None, @@ -437,6 +528,8 @@ impl ConnectionManagerState { /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { + WALRECEIVER_BROKER_UPDATES.inc(); + let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); let old_entry = self.wal_stream_candidates.insert( new_safekeeper_id, @@ -448,6 +541,7 @@ impl ConnectionManagerState { if old_entry.is_none() { info!("New SK node was added: {new_safekeeper_id}"); + WALRECEIVER_CANDIDATES_ADDED.inc(); } } @@ -716,6 +810,7 @@ impl ConnectionManagerState { for node_id in node_ids_to_remove { info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"); self.wal_connection_retries.remove(&node_id); + WALRECEIVER_CANDIDATES_REMOVED.inc(); } } } @@ -725,6 +820,13 @@ impl ConnectionManagerState { wal_connection.connection_task.shutdown().await; } } + + fn manager_status(&self) -> ConnectionManagerStatus { + ConnectionManagerStatus { + existing_connection: self.wal_connection.as_ref().map(|conn| conn.status), + wal_stream_candidates: self.wal_stream_candidates.clone(), + } + } } #[derive(Debug)] @@ -732,8 +834,6 @@ struct NewWalConnectionCandidate { safekeeper_id: NodeId, wal_source_connconf: PgConnectionConfig, availability_zone: Option, - // This field is used in `derive(Debug)` only. - #[allow(dead_code)] reason: ReconnectReason, } @@ -762,6 +862,18 @@ enum ReconnectReason { }, } +impl ReconnectReason { + fn name(&self) -> &str { + match self { + ReconnectReason::NoExistingConnection => "NoExistingConnection", + ReconnectReason::LaggingWal { .. } => "LaggingWal", + ReconnectReason::SwitchAvailabilityZone => "SwitchAvailabilityZone", + ReconnectReason::NoWalTimeout { .. } => "NoWalTimeout", + ReconnectReason::NoKeepAlives { .. } => "NoKeepAlives", + } + } +} + fn wal_stream_connection_config( TenantTimelineId { tenant_id, @@ -867,6 +979,7 @@ mod tests { latest_wal_update: now, commit_lsn: Some(Lsn(current_lsn)), streaming_lsn: Some(Lsn(current_lsn)), + node: NodeId(1), }; state.conf.max_lsn_wal_lag = NonZeroU64::new(100).unwrap(); @@ -1035,6 +1148,7 @@ mod tests { latest_wal_update: now, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: connected_sk_id, }; state.wal_connection = Some(WalConnection { @@ -1101,6 +1215,7 @@ mod tests { latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: NodeId(1), }; state.wal_connection = Some(WalConnection { @@ -1164,6 +1279,7 @@ mod tests { latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: NodeId(1), }; state.wal_connection = Some(WalConnection { @@ -1261,6 +1377,7 @@ mod tests { latest_wal_update: now, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: connected_sk_id, }; state.wal_connection = Some(WalConnection { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d5099dc2a5..1cbed3416c 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -24,8 +24,8 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; use super::TaskStateUpdate; -use crate::context::RequestContext; use crate::metrics::LIVE_CONNECTIONS_COUNT; +use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS}; use crate::{ task_mgr, task_mgr::TaskKind, @@ -37,8 +37,8 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use pq_proto::PageserverFeedback; -use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; +use utils::{id::NodeId, lsn::Lsn}; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -56,6 +56,8 @@ pub(super) struct WalConnectionStatus { pub streaming_lsn: Option, /// Latest commit_lsn received from the safekeeper. Can be zero if no message has been received yet. pub commit_lsn: Option, + /// The node it is connected to + pub node: NodeId, } /// Open a connection to the given safekeeper and receive WAL, sending back progress @@ -67,7 +69,10 @@ pub(super) async fn handle_walreceiver_connection( cancellation: CancellationToken, connect_timeout: Duration, ctx: RequestContext, + node: NodeId, ) -> anyhow::Result<()> { + WALRECEIVER_STARTED_CONNECTIONS.inc(); + // Connect to the database in replication mode. info!("connecting to {wal_source_connconf:?}"); @@ -100,6 +105,7 @@ pub(super) async fn handle_walreceiver_connection( latest_wal_update: Utc::now().naive_utc(), streaming_lsn: None, commit_lsn: None, + node, }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); @@ -122,7 +128,7 @@ pub(super) async fn handle_walreceiver_connection( false, async move { select! { - connection_result = connection => match connection_result{ + connection_result = connection => match connection_result { Ok(()) => info!("Walreceiver db connection closed"), Err(connection_error) => { if let Err(e) = ignore_expected_errors(connection_error) { @@ -319,12 +325,12 @@ pub(super) async fn handle_walreceiver_connection( timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); // The last LSN we processed. It is not guaranteed to survive pageserver crash. - let last_received_lsn = u64::from(last_lsn); + let last_received_lsn = last_lsn; // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let disk_consistent_lsn = u64::from(timeline.get_disk_consistent_lsn()); + let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. - let remote_consistent_lsn = u64::from(timeline_remote_consistent_lsn); + let remote_consistent_lsn = timeline_remote_consistent_lsn; let ts = SystemTime::now(); // Update the status about what we just received. This is shown in the mgmt API. diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index cc46fb5a25..1ab2ae668a 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -96,6 +96,8 @@ static shmem_request_hook_type prev_shmem_request_hook; #endif static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */ +void FileCacheMonitorMain(Datum main_arg); + static void lfc_shmem_startup(void) { @@ -378,7 +380,6 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) { BufferTag tag; FileCacheEntry* entry; - ssize_t rc; bool found; int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); uint32 hash; diff --git a/poetry.lock b/poetry.lock index 7b368cd3b4..141371c925 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "aiohttp" @@ -968,14 +968,14 @@ testing = ["pre-commit"] [[package]] name = "flask" -version = "2.1.3" +version = "2.2.5" description = "A simple framework for building complex web applications." category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "Flask-2.1.3-py3-none-any.whl", hash = "sha256:9013281a7402ad527f8fd56375164f3aa021ecfaff89bfe3825346c24f87e04c"}, - {file = "Flask-2.1.3.tar.gz", hash = "sha256:15972e5017df0575c3d6c090ba168b6db90259e620ac8d7ea813a396bad5b6cb"}, + {file = "Flask-2.2.5-py3-none-any.whl", hash = "sha256:58107ed83443e86067e41eff4631b058178191a355886f8e479e347fa1285fdf"}, + {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"}, ] [package.dependencies] @@ -983,7 +983,7 @@ click = ">=8.0" importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""} itsdangerous = ">=2.0" Jinja2 = ">=3.0" -Werkzeug = ">=2.0" +Werkzeug = ">=2.2.2" [package.extras] async = ["asgiref (>=3.2)"] diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 9d702b29c3..e7a4fd236e 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -62,6 +62,8 @@ utils.workspace = true uuid.workspace = true webpki-roots.workspace = true x509-parser.workspace = true +native-tls.workspace = true +postgres-native-tls.workspace = true workspace_hack.workspace = true tokio-util.workspace = true diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 7175a23dc1..da43cf11c4 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -9,6 +9,7 @@ use crate::{ use pq_proto::BeMessage as Be; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_postgres::config::SslMode; use tracing::{info, info_span}; #[derive(Debug, Error)] @@ -87,6 +88,16 @@ pub(super) async fn authenticate( .dbname(&db_info.dbname) .user(&db_info.user); + // Backwards compatibility. pg_sni_proxy uses "--" in domain names + // while direct connections do not. Once we migrate to pg_sni_proxy + // everywhere, we can remove this. + if db_info.host.contains("--") { + // we need TLS connection with SNI info to properly route it + config.ssl_mode(SslMode::Require); + } else { + config.ssl_mode(SslMode::Disable); + } + if let Some(password) = db_info.password { config.password(password.as_ref()); } @@ -96,6 +107,7 @@ pub(super) async fn authenticate( value: NodeInfo { config, aux: db_info.aux.into(), + allow_self_signed_compute: false, // caller may override }, }) } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs new file mode 100644 index 0000000000..bba2d51caf --- /dev/null +++ b/proxy/src/bin/pg_sni_router.rs @@ -0,0 +1,250 @@ +/// A stand-alone program that routes connections, e.g. from +/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. +/// +/// This allows connecting to pods/services running in the same Kubernetes cluster from +/// the outside. Similar to an ingress controller for HTTPS. +use std::{net::SocketAddr, sync::Arc}; + +use tokio::net::TcpListener; + +use anyhow::{anyhow, bail, ensure, Context}; +use clap::{self, Arg}; +use futures::TryFutureExt; +use proxy::console::messages::MetricsAuxInfo; +use proxy::stream::{PqStream, Stream}; + +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_util::sync::CancellationToken; +use utils::{project_git_version, sentry_init::init_sentry}; + +use tracing::{error, info, warn}; + +project_git_version!(GIT_VERSION); + +fn cli() -> clap::Command { + clap::Command::new("Neon proxy/router") + .version(GIT_VERSION) + .arg( + Arg::new("listen") + .short('l') + .long("listen") + .help("listen for incoming client connections on ip:port") + .default_value("127.0.0.1:4432"), + ) + .arg( + Arg::new("tls-key") + .short('k') + .long("tls-key") + .help("path to TLS key for client postgres connections") + .required(true), + ) + .arg( + Arg::new("tls-cert") + .short('c') + .long("tls-cert") + .help("path to TLS cert for client postgres connections") + .required(true), + ) + .arg( + Arg::new("dest") + .short('d') + .long("destination") + .help("append this domain zone to the SNI hostname to get the destination address") + .required(true), + ) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let _logging_guard = proxy::logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + let args = cli().get_matches(); + let destination: String = args.get_one::("dest").unwrap().parse()?; + + // Configure TLS + let tls_config: Arc = match ( + args.get_one::("tls-key"), + args.get_one::("tls-cert"), + ) { + (Some(key_path), Some(cert_path)) => { + let key = { + let key_bytes = std::fs::read(key_path).context("TLS key file")?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) + .context(format!("Failed to read TLS keys at '{key_path}'"))?; + + ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); + keys.pop().map(rustls::PrivateKey).unwrap() + }; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .context(format!( + "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + ))? + .into_iter() + .map(rustls::Certificate) + .collect() + }; + + rustls::ServerConfig::builder() + .with_safe_default_cipher_suites() + .with_safe_default_kx_groups() + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into() + } + _ => bail!("tls-key and tls-cert must be specified"), + }; + + // Start listening for incoming client connections + let proxy_address: SocketAddr = args.get_one::("listen").unwrap().parse()?; + info!("Starting sni router on {proxy_address}"); + let proxy_listener = TcpListener::bind(proxy_address).await?; + + let cancellation_token = CancellationToken::new(); + + let main = proxy::flatten_err(tokio::spawn(task_main( + Arc::new(destination), + tls_config, + proxy_listener, + cancellation_token.clone(), + ))); + let signals_task = proxy::flatten_err(tokio::spawn(proxy::handle_signals(cancellation_token))); + + tokio::select! { + res = main => { res?; }, + res = signals_task => { res?; }, + } + + Ok(()) +} + +async fn task_main( + dest_suffix: Arc, + tls_config: Arc, + listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. + socket2::SockRef::from(&listener).set_keepalive(true)?; + + let mut connections = tokio::task::JoinSet::new(); + + loop { + tokio::select! { + accept_result = listener.accept() => { + let (socket, peer_addr) = accept_result?; + info!("accepted postgres client connection from {peer_addr}"); + + let session_id = uuid::Uuid::new_v4(); + let tls_config = Arc::clone(&tls_config); + let dest_suffix = Arc::clone(&dest_suffix); + + connections.spawn( + async move { + info!("spawned a task for {peer_addr}"); + + socket + .set_nodelay(true) + .context("failed to set socket option")?; + + handle_client(dest_suffix, tls_config, session_id, socket).await + } + .unwrap_or_else(|e| { + // Acknowledge that the task has finished with an error. + error!("per-client task finished with an error: {e:#}"); + }), + ); + } + _ = cancellation_token.cancelled() => { + drop(listener); + break; + } + } + } + + // Drain connections + info!("waiting for all client connections to finish"); + while let Some(res) = connections.join_next().await { + if let Err(e) = res { + if !e.is_panic() && !e.is_cancelled() { + warn!("unexpected error from joined connection task: {e:?}"); + } + } + } + info!("all client connections have finished"); + Ok(()) +} + +const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; + +async fn ssl_handshake( + raw_stream: S, + tls_config: Arc, +) -> anyhow::Result> { + let mut stream = PqStream::new(Stream::from_raw(raw_stream)); + + let msg = stream.read_startup_packet().await?; + info!("received {msg:?}"); + use pq_proto::FeStartupPacket::*; + + match msg { + SslRequest => { + stream + .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) + .await?; + // Upgrade raw stream into a secure TLS-backed stream. + // NOTE: We've consumed `tls`; this fact will be used later. + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empy. + // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + bail!("data is sent before server replied with EncryptionResponse"); + } + Ok(raw.upgrade(tls_config).await?) + } + _ => stream.throw_error_str(ERR_INSECURE_CONNECTION).await?, + } +} + +#[tracing::instrument(fields(session_id = ?session_id), skip_all)] +async fn handle_client( + dest_suffix: Arc, + tls_config: Arc, + session_id: uuid::Uuid, + stream: impl AsyncRead + AsyncWrite + Unpin, +) -> anyhow::Result<()> { + let tls_stream = ssl_handshake(stream, tls_config).await?; + + // Cut off first part of the SNI domain + // We receive required destination details in the format of + // `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain` + let sni = tls_stream.sni_hostname().ok_or(anyhow!("SNI missing"))?; + let dest: Vec<&str> = sni + .split_once('.') + .context("invalid SNI")? + .0 + .splitn(3, "--") + .collect(); + let port = dest[2].parse::().context("invalid port")?; + let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port); + + info!("destination: {}", destination); + + let client = tokio::net::TcpStream::connect(destination).await?; + + let metrics_aux: MetricsAuxInfo = Default::default(); + proxy::proxy::proxy_pass(tls_stream, client, &metrics_aux).await +} diff --git a/proxy/src/main.rs b/proxy/src/bin/proxy.rs similarity index 79% rename from proxy/src/main.rs rename to proxy/src/bin/proxy.rs index 1fd13c9f68..28e6e25317 100644 --- a/proxy/src/main.rs +++ b/proxy/src/bin/proxy.rs @@ -1,49 +1,23 @@ -//! Postgres protocol proxy/router. -//! -//! This service listens psql port and can check auth via external service -//! (control plane API in our case) and can create new databases and accounts -//! in somewhat transparent manner (again via communication with control plane API). +use proxy::auth; +use proxy::console; +use proxy::http; +use proxy::metrics; -mod auth; -mod cache; -mod cancellation; -mod compute; -mod config; -mod console; -mod error; -mod http; -mod logging; -mod metrics; -mod parse; -mod proxy; -mod sasl; -mod scram; -mod stream; -mod url; -mod waiters; - -use anyhow::{bail, Context}; +use anyhow::bail; use clap::{self, Arg}; -use config::ProxyConfig; -use futures::FutureExt; -use std::{borrow::Cow, future::Future, net::SocketAddr}; -use tokio::{net::TcpListener, task::JoinError}; +use proxy::config::{self, ProxyConfig}; +use std::{borrow::Cow, net::SocketAddr}; +use tokio::net::TcpListener; use tokio_util::sync::CancellationToken; -use tracing::{info, warn}; +use tracing::info; +use tracing::warn; use utils::{project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); -/// Flattens `Result>` into `Result`. -async fn flatten_err( - f: impl Future, JoinError>>, -) -> anyhow::Result<()> { - f.map(|r| r.context("join error").and_then(|x| x)).await -} - #[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = logging::init().await?; + let _logging_guard = proxy::logging::init().await?; let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); @@ -69,7 +43,7 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = TcpListener::bind(proxy_address).await?; let cancellation_token = CancellationToken::new(); - let mut client_tasks = vec![tokio::spawn(proxy::task_main( + let mut client_tasks = vec![tokio::spawn(proxy::proxy::task_main( config, proxy_listener, cancellation_token.clone(), @@ -88,7 +62,7 @@ async fn main() -> anyhow::Result<()> { } let mut tasks = vec![ - tokio::spawn(handle_signals(cancellation_token)), + tokio::spawn(proxy::handle_signals(cancellation_token)), tokio::spawn(http::server::task_main(http_listener)), tokio::spawn(console::mgmt::task_main(mgmt_listener)), ]; @@ -97,8 +71,9 @@ async fn main() -> anyhow::Result<()> { tasks.push(tokio::spawn(metrics::task_main(metrics_config))); } - let tasks = futures::future::try_join_all(tasks.into_iter().map(flatten_err)); - let client_tasks = futures::future::try_join_all(client_tasks.into_iter().map(flatten_err)); + let tasks = futures::future::try_join_all(tasks.into_iter().map(proxy::flatten_err)); + let client_tasks = + futures::future::try_join_all(client_tasks.into_iter().map(proxy::flatten_err)); tokio::select! { // We are only expecting an error from these forever tasks res = tasks => { res?; }, @@ -107,33 +82,6 @@ async fn main() -> anyhow::Result<()> { Ok(()) } -/// Handle unix signals appropriately. -async fn handle_signals(token: CancellationToken) -> anyhow::Result<()> { - use tokio::signal::unix::{signal, SignalKind}; - - let mut hangup = signal(SignalKind::hangup())?; - let mut interrupt = signal(SignalKind::interrupt())?; - let mut terminate = signal(SignalKind::terminate())?; - - loop { - tokio::select! { - // Hangup is commonly used for config reload. - _ = hangup.recv() => { - warn!("received SIGHUP; config reload is not supported"); - } - // Shut down the whole application. - _ = interrupt.recv() => { - warn!("received SIGINT, exiting immediately"); - bail!("interrupted"); - } - _ = terminate.recv() => { - warn!("received SIGTERM, shutting down once all existing connections have closed"); - token.cancel(); - } - } - } -} - /// ProxyConfig is created at proxy startup, and lives forever. fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> { let tls_config = match ( @@ -149,6 +97,14 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> _ => bail!("either both or neither tls-key and tls-cert must be specified"), }; + let allow_self_signed_compute: bool = args + .get_one::("allow-self-signed-compute") + .unwrap() + .parse()?; + if allow_self_signed_compute { + warn!("allowing self-signed compute certificates"); + } + let metric_collection = match ( args.get_one::("metric-collection-endpoint"), args.get_one::("metric-collection-interval"), @@ -198,6 +154,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> tls_config, auth_backend, metric_collection, + allow_self_signed_compute, })); Ok(config) @@ -288,6 +245,12 @@ fn cli() -> clap::Command { .help("cache for `wake_compute` api method (use `size=0` to disable)") .default_value(config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO), ) + .arg( + Arg::new("allow-self-signed-compute") + .long("allow-self-signed-compute") + .help("Allow self-signed certificates for compute nodes (for testing)") + .default_value("false"), + ) } #[cfg(test)] diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 0465703ae6..d277940a12 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -5,7 +5,7 @@ use pq_proto::StartupMessageParams; use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; use tokio::net::TcpStream; -use tokio_postgres::NoTls; +use tokio_postgres::tls::MakeTlsConnect; use tracing::{error, info, warn}; const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; @@ -19,6 +19,9 @@ pub enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] CouldNotConnect(#[from] io::Error), + + #[error("{COULD_NOT_CONNECT}: {0}")] + TlsError(#[from] native_tls::Error), } impl UserFacingError for ConnectionError { @@ -125,9 +128,15 @@ impl std::ops::DerefMut for ConnCfg { } } +impl Default for ConnCfg { + fn default() -> Self { + Self::new() + } +} + impl ConnCfg { /// Establish a raw TCP connection to the compute node. - async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { + async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream, &str)> { use tokio_postgres::config::Host; // wrap TcpStream::connect with timeout @@ -180,7 +189,7 @@ impl ConnCfg { }; match connect_once(host, *port).await { - Ok(socket) => return Ok(socket), + Ok((sockaddr, stream)) => return Ok((sockaddr, stream, host)), Err(err) => { // We can't throw an error here, as there might be more hosts to try. warn!("couldn't connect to compute node at {host}:{port}: {err}"); @@ -200,7 +209,10 @@ impl ConnCfg { pub struct PostgresConnection { /// Socket connected to a compute node. - pub stream: TcpStream, + pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream< + tokio::net::TcpStream, + postgres_native_tls::TlsStream, + >, /// PostgreSQL connection parameters. pub params: std::collections::HashMap, /// Query cancellation token. @@ -208,11 +220,27 @@ pub struct PostgresConnection { } impl ConnCfg { - async fn do_connect(&self) -> Result { - // TODO: establish a secure connection to the DB. - let (socket_addr, mut stream) = self.connect_raw().await?; - let (client, connection) = self.0.connect_raw(&mut stream, NoTls).await?; - info!("connected to compute node at {socket_addr}"); + async fn do_connect( + &self, + allow_self_signed_compute: bool, + ) -> Result { + let (socket_addr, stream, host) = self.connect_raw().await?; + + let tls_connector = native_tls::TlsConnector::builder() + .danger_accept_invalid_certs(allow_self_signed_compute) + .build() + .unwrap(); + let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector); + let tls = MakeTlsConnect::::make_tls_connect(&mut mk_tls, host)?; + + // connect_raw() will not use TLS if sslmode is "disable" + let (client, connection) = self.0.connect_raw(stream, tls).await?; + let stream = connection.stream.into_inner(); + + info!( + "connected to compute node at {host} ({socket_addr}) sslmode={:?}", + self.0.get_ssl_mode() + ); // This is very ugly but as of now there's no better way to // extract the connection parameters from tokio-postgres' connection. @@ -233,8 +261,11 @@ impl ConnCfg { } /// Connect to a corresponding compute node. - pub async fn connect(&self) -> Result { - self.do_connect() + pub async fn connect( + &self, + allow_self_signed_compute: bool, + ) -> Result { + self.do_connect(allow_self_signed_compute) .inspect_err(|err| { // Immediately log the error we have at our disposal. error!("couldn't connect to compute node: {err}"); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 0ceb556ca1..530229b3fd 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -12,6 +12,7 @@ pub struct ProxyConfig { pub tls_config: Option, pub auth_backend: auth::BackendType<'static, ()>, pub metric_collection: Option, + pub allow_self_signed_compute: bool, } #[derive(Debug)] diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 80cd94d483..44e23e0adf 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -170,6 +170,9 @@ pub struct NodeInfo { /// Labels for proxy's metrics. pub aux: Arc, + + /// Whether we should accept self-signed certificates (for testing) + pub allow_self_signed_compute: bool, } pub type NodeInfoCache = TimedLru, NodeInfo>; diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index eaac9c06d9..3b42c73a34 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -8,6 +8,7 @@ use crate::{auth::ClientCredentials, compute, error::io_error, scram, url::ApiUr use async_trait::async_trait; use futures::TryFutureExt; use thiserror::Error; +use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; #[derive(Debug, Error)] @@ -86,11 +87,13 @@ impl Api { let mut config = compute::ConnCfg::new(); config .host(self.endpoint.host_str().unwrap_or("localhost")) - .port(self.endpoint.port().unwrap_or(5432)); + .port(self.endpoint.port().unwrap_or(5432)) + .ssl_mode(SslMode::Disable); let node = NodeInfo { config, aux: Default::default(), + allow_self_signed_compute: false, }; Ok(node) diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 3644db17f7..a8e855b2c8 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,6 +8,7 @@ use super::{ use crate::{auth::ClientCredentials, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; +use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; #[derive(Clone)] @@ -100,11 +101,12 @@ impl Api { // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). let mut config = compute::ConnCfg::new(); - config.host(host).port(port); + config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. let node = NodeInfo { config, aux: body.aux.into(), + allow_self_signed_compute: false, }; Ok(node) diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs new file mode 100644 index 0000000000..148ee67d90 --- /dev/null +++ b/proxy/src/lib.rs @@ -0,0 +1,57 @@ +use anyhow::{bail, Context}; +use futures::{Future, FutureExt}; +use tokio::task::JoinError; +use tokio_util::sync::CancellationToken; +use tracing::warn; + +pub mod auth; +pub mod cache; +pub mod cancellation; +pub mod compute; +pub mod config; +pub mod console; +pub mod error; +pub mod http; +pub mod logging; +pub mod metrics; +pub mod parse; +pub mod proxy; +pub mod sasl; +pub mod scram; +pub mod stream; +pub mod url; +pub mod waiters; + +/// Handle unix signals appropriately. +pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<()> { + use tokio::signal::unix::{signal, SignalKind}; + + let mut hangup = signal(SignalKind::hangup())?; + let mut interrupt = signal(SignalKind::interrupt())?; + let mut terminate = signal(SignalKind::terminate())?; + + loop { + tokio::select! { + // Hangup is commonly used for config reload. + _ = hangup.recv() => { + warn!("received SIGHUP; config reload is not supported"); + } + // Shut down the whole application. + _ = interrupt.recv() => { + warn!("received SIGINT, exiting immediately"); + bail!("interrupted"); + } + _ = terminate.recv() => { + warn!("received SIGTERM, shutting down once all existing connections have closed"); + token.cancel(); + } + } + } +} + +/// Flattens `Result>` into `Result`. +pub async fn flatten_err( + f: impl Future, JoinError>>, +) -> anyhow::Result<()> { + f.map(|r| r.context("join error").and_then(|x| x)).await +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 1169d76160..f3d3524d30 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -155,7 +155,7 @@ pub async fn handle_ws_client( async { result }.or_else(|e| stream.throw_error(e)).await? }; - let client = Client::new(stream, creds, ¶ms, session_id); + let client = Client::new(stream, creds, ¶ms, session_id, false); cancel_map .with_session(|session| client.connect_to_db(session, true)) .await @@ -194,7 +194,15 @@ async fn handle_client( async { result }.or_else(|e| stream.throw_error(e)).await? }; - let client = Client::new(stream, creds, ¶ms, session_id); + let allow_self_signed_compute = config.allow_self_signed_compute; + + let client = Client::new( + stream, + creds, + ¶ms, + session_id, + allow_self_signed_compute, + ); cancel_map .with_session(|session| client.connect_to_db(session, false)) .await @@ -297,9 +305,11 @@ async fn connect_to_compute_once( NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); }; + let allow_self_signed_compute = node_info.allow_self_signed_compute; + node_info .config - .connect() + .connect(allow_self_signed_compute) .inspect_err(invalidate_cache) .await } @@ -378,7 +388,7 @@ async fn prepare_client_connection( /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] -async fn proxy_pass( +pub async fn proxy_pass( client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, aux: &MetricsAuxInfo, @@ -420,6 +430,8 @@ struct Client<'a, S> { params: &'a StartupMessageParams, /// Unique connection ID. session_id: uuid::Uuid, + /// Allow self-signed certificates (for testing). + allow_self_signed_compute: bool, } impl<'a, S> Client<'a, S> { @@ -429,12 +441,14 @@ impl<'a, S> Client<'a, S> { creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, params: &'a StartupMessageParams, session_id: uuid::Uuid, + allow_self_signed_compute: bool, ) -> Self { Self { stream, creds, params, session_id, + allow_self_signed_compute, } } } @@ -451,6 +465,7 @@ impl Client<'_, S> { mut creds, params, session_id, + allow_self_signed_compute, } = self; let extra = console::ConsoleReqExtra { @@ -473,6 +488,8 @@ impl Client<'_, S> { value: mut node_info, } = auth_result; + node_info.allow_self_signed_compute = allow_self_signed_compute; + let mut node = connect_to_compute(&mut node_info, params, &extra, &creds) .or_else(|e| stream.throw_error(e)) .await?; diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 00cd111da5..393570df6a 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -19,11 +19,14 @@ git-version.workspace = true hex.workspace = true humantime.workspace = true hyper.workspace = true +futures.workspace = true once_cell.workspace = true parking_lot.workspace = true postgres.workspace = true postgres-protocol.workspace = true regex.workspace = true +scopeguard.workspace = true +reqwest = { workspace = true, features = ["json"] } serde.workspace = true serde_json.workspace = true serde_with.workspace = true @@ -33,6 +36,7 @@ tokio = { workspace = true, features = ["fs"] } tokio-io-timeout.workspace = true tokio-postgres.workspace = true toml_edit.workspace = true +tempfile.workspace = true tracing.workspace = true url.workspace = true metrics.workspace = true @@ -45,6 +49,3 @@ storage_broker.workspace = true utils.workspace = true workspace_hack.workspace = true - -[dev-dependencies] -tempfile.workspace = true diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 92f35bf51f..5e25d22ec1 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -14,10 +14,13 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest; use storage_broker::Request; use std::time::Duration; +use std::time::Instant; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; +use crate::metrics::BROKER_PULLED_UPDATES; +use crate::metrics::BROKER_PUSHED_UPDATES; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -49,12 +52,17 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // is under plain mutex. That's ok, all this code is not performance // sensitive and there is no risk of deadlock as we don't await while // lock is held. + let now = Instant::now(); let mut active_tlis = GlobalTimelines::get_all(); active_tlis.retain(|tli| tli.is_active()); for tli in &active_tlis { let sk_info = tli.get_safekeeper_info(&conf); yield sk_info; + BROKER_PUSHED_UPDATES.inc(); } + let elapsed = now.elapsed(); + // Log duration every second. Should be about 10MB of logs per day. + info!("pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed); sleep(push_interval).await; } }; @@ -79,6 +87,10 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { .context("subscribe_safekeper_info request failed")? .into_inner(); + let ok_counter = BROKER_PULLED_UPDATES.with_label_values(&["ok"]); + let not_found = BROKER_PULLED_UPDATES.with_label_values(&["not_found"]); + let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]); + while let Some(msg) = stream.message().await? { let proto_ttid = msg .tenant_timeline_id @@ -91,7 +103,15 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { // connection to the broker. // note: there are blocking operations below, but it's considered fine for now - tli.record_safekeeper_info(&msg).await? + let res = tli.record_safekeeper_info(msg).await; + if res.is_ok() { + ok_counter.inc(); + } else { + err_counter.inc(); + } + res?; + } else { + not_found.inc(); } } bail!("end of stream"); diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 674cf9f6eb..f711c4429d 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -9,9 +9,10 @@ use std::path::PathBuf; use anyhow::Result; use chrono::{DateTime, Utc}; use postgres_ffi::XLogSegNo; +use serde::Deserialize; use serde::Serialize; -use utils::http::json::display_serialize; +use serde_with::{serde_as, DisplayFromStr}; use utils::id::NodeId; use utils::id::TenantTimelineId; use utils::id::{TenantId, TimelineId}; @@ -22,11 +23,11 @@ use crate::safekeeper::SafekeeperMemState; use crate::safekeeper::TermHistory; use crate::SafeKeeperConf; -use crate::timeline::ReplicaState; +use crate::send_wal::WalSenderState; use crate::GlobalTimelines; /// Various filters that influence the resulting JSON output. -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct Args { /// Dump all available safekeeper state. False by default. pub dump_all: bool, @@ -51,7 +52,7 @@ pub struct Args { } /// Response for debug dump request. -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct Response { pub start_time: DateTime, pub finish_time: DateTime, @@ -61,7 +62,7 @@ pub struct Response { } /// Safekeeper configuration. -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct Config { pub id: NodeId, pub workdir: PathBuf, @@ -72,22 +73,23 @@ pub struct Config { pub wal_backup_enabled: bool, } -#[derive(Debug, Serialize)] +#[serde_as] +#[derive(Debug, Serialize, Deserialize)] pub struct Timeline { - #[serde(serialize_with = "display_serialize")] + #[serde_as(as = "DisplayFromStr")] pub tenant_id: TenantId, - #[serde(serialize_with = "display_serialize")] + #[serde_as(as = "DisplayFromStr")] pub timeline_id: TimelineId, pub control_file: Option, pub memory: Option, pub disk_content: Option, } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct Memory { pub is_cancelled: bool, pub peers_info_len: usize, - pub replicas: Vec>, + pub walsenders: Vec, pub wal_backup_active: bool, pub active: bool, pub num_computes: u32, @@ -102,12 +104,12 @@ pub struct Memory { pub file_open: bool, } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct DiskContent { pub files: Vec, } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct FileInfo { pub name: String, pub size: u64, diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 2c3d1cea0e..7d25ced449 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -10,7 +10,7 @@ use tracing::{info, info_span, Instrument}; use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; -use crate::metrics::TrafficMetrics; +use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED}; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; use postgres_backend::QueryError; @@ -72,6 +72,15 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { } } +fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { + match cmd { + SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH", + SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION", + SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", + SafekeeperPostgresCommand::JSONCtrl { .. } => "JSON_CTRL", + } +} + #[async_trait::async_trait] impl postgres_backend::Handler for SafekeeperPostgresHandler @@ -168,6 +177,12 @@ impl postgres_backend::Handler } let cmd = parse_cmd(query_string)?; + let cmd_str = cmd_to_string(&cmd); + + PG_QUERIES_RECEIVED.with_label_values(&[cmd_str]).inc(); + scopeguard::defer! { + PG_QUERIES_FINISHED.with_label_values(&[cmd_str]).inc(); + } info!( "got query {:?} in timeline {:?}", diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index cdec45c148..a498d868af 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -3,19 +3,21 @@ use hyper::{Body, Request, Response, StatusCode, Uri}; use once_cell::sync::Lazy; use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::SkTimelineInfo; -use serde::Serialize; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use std::collections::{HashMap, HashSet}; use std::fmt; use std::str::FromStr; use std::sync::Arc; use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use tokio::fs::File; +use tokio::io::AsyncReadExt; use tokio::task::JoinError; -use utils::http::json::display_serialize; -use crate::debug_dump; use crate::safekeeper::ServerInfo; use crate::safekeeper::Term; +use crate::{debug_dump, pull_timeline}; use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; @@ -57,44 +59,46 @@ fn get_conf(request: &Request) -> &SafeKeeperConf { /// Same as TermSwitchEntry, but serializes LSN using display serializer /// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response. -#[derive(Debug, Serialize)] -struct TermSwitchApiEntry { +#[serde_as] +#[derive(Debug, Serialize, Deserialize)] +pub struct TermSwitchApiEntry { pub term: Term, - #[serde(serialize_with = "display_serialize")] + #[serde_as(as = "DisplayFromStr")] pub lsn: Lsn, } /// Augment AcceptorState with epoch for convenience -#[derive(Debug, Serialize)] -struct AcceptorStateStatus { - term: Term, - epoch: Term, - term_history: Vec, +#[derive(Debug, Serialize, Deserialize)] +pub struct AcceptorStateStatus { + pub term: Term, + pub epoch: Term, + pub term_history: Vec, } /// Info about timeline on safekeeper ready for reporting. -#[derive(Debug, Serialize)] -struct TimelineStatus { - #[serde(serialize_with = "display_serialize")] - tenant_id: TenantId, - #[serde(serialize_with = "display_serialize")] - timeline_id: TimelineId, - acceptor_state: AcceptorStateStatus, - pg_info: ServerInfo, - #[serde(serialize_with = "display_serialize")] - flush_lsn: Lsn, - #[serde(serialize_with = "display_serialize")] - timeline_start_lsn: Lsn, - #[serde(serialize_with = "display_serialize")] - local_start_lsn: Lsn, - #[serde(serialize_with = "display_serialize")] - commit_lsn: Lsn, - #[serde(serialize_with = "display_serialize")] - backup_lsn: Lsn, - #[serde(serialize_with = "display_serialize")] - peer_horizon_lsn: Lsn, - #[serde(serialize_with = "display_serialize")] - remote_consistent_lsn: Lsn, +#[serde_as] +#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineStatus { + #[serde_as(as = "DisplayFromStr")] + pub tenant_id: TenantId, + #[serde_as(as = "DisplayFromStr")] + pub timeline_id: TimelineId, + pub acceptor_state: AcceptorStateStatus, + pub pg_info: ServerInfo, + #[serde_as(as = "DisplayFromStr")] + pub flush_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] + pub timeline_start_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] + pub local_start_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] + pub commit_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] + pub backup_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] + pub peer_horizon_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] + pub remote_consistent_lsn: Lsn, } fn check_permission(request: &Request, tenant_id: Option) -> Result<(), ApiError> { @@ -144,7 +148,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { + check_permission(&request, None)?; + + let data: pull_timeline::Request = json_request(&mut request).await?; + + let resp = pull_timeline::handle_request(data) + .await + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, resp) +} + +/// Download a file from the timeline directory. +// TODO: figure out a better way to copy files between safekeepers +async fn timeline_files_handler(request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let filename: String = parse_request_param(&request, "filename")?; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let filepath = tli.timeline_dir.join(filename); + let mut file = File::open(&filepath) + .await + .map_err(|e| ApiError::InternalServerError(e.into()))?; + + let mut content = Vec::new(); + // TODO: don't store files in memory + file.read_to_end(&mut content) + .await + .map_err(|e| ApiError::InternalServerError(e.into()))?; + + Response::builder() + .status(StatusCode::OK) + .header("Content-Type", "application/octet-stream") + .body(Body::from(content)) + .map_err(|e| ApiError::InternalServerError(e.into())) +} + /// Deactivates the timeline and removes its data directory. async fn timeline_delete_force_handler( mut request: Request, @@ -246,7 +293,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result RouterBuilder timeline_delete_force_handler, ) .delete("/v1/tenant/:tenant_id", tenant_delete_force_handler) + .post("/v1/pull_timeline", timeline_pull_handler) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename", + timeline_files_handler, + ) // for tests .post( "/v1/record_safekeeper_info/:tenant_id/:timeline_id", diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 2841cd195f..dc9188723e 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -50,7 +50,7 @@ pub struct AppendLogicalMessage { pub pg_version: u32, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize)] struct AppendResult { // safekeeper state after append state: SafeKeeperState, @@ -133,7 +133,7 @@ fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::R Ok(()) } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize)] pub struct InsertedWAL { begin_lsn: Lsn, pub end_lsn: Lsn, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 2c28c5218d..ff621fdbc0 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -15,6 +15,7 @@ pub mod handler; pub mod http; pub mod json_ctrl; pub mod metrics; +pub mod pull_timeline; pub mod receive_wal; pub mod remove_wal; pub mod safekeeper; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 2aaa17bfc5..189af2b044 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -10,16 +10,16 @@ use anyhow::Result; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter_vec, Gauge, IntCounterVec, IntGaugeVec, + register_int_counter, register_int_counter_vec, Gauge, IntCounter, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; +use utils::pageserver_feedback::PageserverFeedback; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ safekeeper::{SafeKeeperState, SafekeeperMemState}, - timeline::ReplicaState, GlobalTimelines, }; @@ -73,6 +73,58 @@ pub static PG_IO_BYTES: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_pg_io_bytes gauge") }); +pub static BROKER_PUSHED_UPDATES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_broker_pushed_updates_total", + "Number of timeline updates pushed to the broker" + ) + .expect("Failed to register safekeeper_broker_pushed_updates_total counter") +}); +pub static BROKER_PULLED_UPDATES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_broker_pulled_updates_total", + "Number of timeline updates pulled and processed from the broker", + &["result"] + ) + .expect("Failed to register safekeeper_broker_pulled_updates_total counter") +}); +pub static PG_QUERIES_RECEIVED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_pg_queries_received_total", + "Number of queries received through pg protocol", + &["query"] + ) + .expect("Failed to register safekeeper_pg_queries_received_total counter") +}); +pub static PG_QUERIES_FINISHED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_pg_queries_finished_total", + "Number of queries finished through pg protocol", + &["query"] + ) + .expect("Failed to register safekeeper_pg_queries_finished_total counter") +}); +pub static REMOVED_WAL_SEGMENTS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_removed_wal_segments_total", + "Number of WAL segments removed from the disk" + ) + .expect("Failed to register safekeeper_removed_wal_segments_total counter") +}); +pub static BACKED_UP_SEGMENTS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_backed_up_segments_total", + "Number of WAL segments backed up to the broker" + ) + .expect("Failed to register safekeeper_backed_up_segments_total counter") +}); +pub static BACKUP_ERRORS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_backup_errors_total", + "Number of errors during backup" + ) + .expect("Failed to register safekeeper_backup_errors_total counter") +}); pub const LABEL_UNKNOWN: &str = "unknown"; @@ -231,7 +283,7 @@ pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result { /// Metrics for a single timeline. pub struct FullTimelineInfo { pub ttid: TenantTimelineId, - pub replicas: Vec, + pub ps_feedback: PageserverFeedback, pub wal_backup_active: bool, pub timeline_is_active: bool, pub num_computes: u32, @@ -242,6 +294,7 @@ pub struct FullTimelineInfo { pub persisted_state: SafeKeeperState, pub flush_lsn: Lsn, + pub remote_consistent_lsn: Lsn, pub wal_storage: WalStorageMetrics, } @@ -514,19 +567,6 @@ impl Collector for TimelineCollector { let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; - let mut most_advanced: Option = None; - for replica in tli.replicas.iter() { - if let Some(replica_feedback) = replica.pageserver_feedback { - if let Some(current) = most_advanced { - if current.last_received_lsn < replica_feedback.last_received_lsn { - most_advanced = Some(replica_feedback); - } - } else { - most_advanced = Some(replica_feedback); - } - } - } - self.commit_lsn .with_label_values(labels) .set(tli.mem_state.commit_lsn.into()); @@ -544,7 +584,7 @@ impl Collector for TimelineCollector { .set(tli.mem_state.peer_horizon_lsn.into()); self.remote_consistent_lsn .with_label_values(labels) - .set(tli.mem_state.remote_consistent_lsn.into()); + .set(tli.remote_consistent_lsn.into()); self.timeline_active .with_label_values(labels) .set(tli.timeline_is_active as u64); @@ -567,15 +607,17 @@ impl Collector for TimelineCollector { .with_label_values(labels) .set(tli.wal_storage.flush_wal_seconds); - if let Some(feedback) = most_advanced { - self.ps_last_received_lsn + self.ps_last_received_lsn + .with_label_values(labels) + .set(tli.ps_feedback.last_received_lsn.0); + if let Ok(unix_time) = tli + .ps_feedback + .replytime + .duration_since(SystemTime::UNIX_EPOCH) + { + self.feedback_last_time_seconds .with_label_values(labels) - .set(feedback.last_received_lsn); - if let Ok(unix_time) = feedback.replytime.duration_since(SystemTime::UNIX_EPOCH) { - self.feedback_last_time_seconds - .with_label_values(labels) - .set(unix_time.as_secs()); - } + .set(unix_time.as_secs()); } if tli.last_removed_segno != 0 { diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs new file mode 100644 index 0000000000..344b760fd3 --- /dev/null +++ b/safekeeper/src/pull_timeline.rs @@ -0,0 +1,240 @@ +use serde::{Deserialize, Serialize}; + +use anyhow::{bail, Context, Result}; +use tokio::io::AsyncWriteExt; +use tracing::info; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; + +use serde_with::{serde_as, DisplayFromStr}; + +use crate::{ + control_file, debug_dump, + http::routes::TimelineStatus, + wal_storage::{self, Storage}, + GlobalTimelines, +}; + +/// Info about timeline on safekeeper ready for reporting. +#[serde_as] +#[derive(Debug, Serialize, Deserialize)] +pub struct Request { + #[serde_as(as = "DisplayFromStr")] + pub tenant_id: TenantId, + #[serde_as(as = "DisplayFromStr")] + pub timeline_id: TimelineId, + pub http_hosts: Vec, +} + +#[derive(Debug, Serialize)] +pub struct Response { + // Donor safekeeper host + pub safekeeper_host: String, + // TODO: add more fields? +} + +/// Find the most advanced safekeeper and pull timeline from it. +pub async fn handle_request(request: Request) -> Result { + let existing_tli = GlobalTimelines::get(TenantTimelineId::new( + request.tenant_id, + request.timeline_id, + )); + if existing_tli.is_ok() { + bail!("Timeline {} already exists", request.timeline_id); + } + + let client = reqwest::Client::new(); + let http_hosts = request.http_hosts.clone(); + + // Send request to /v1/tenant/:tenant_id/timeline/:timeline_id + let responses = futures::future::join_all(http_hosts.iter().map(|url| { + let url = format!( + "{}/v1/tenant/{}/timeline/{}", + url, request.tenant_id, request.timeline_id + ); + client.get(url).send() + })) + .await; + + let mut statuses = Vec::new(); + for (i, response) in responses.into_iter().enumerate() { + let response = response.context(format!("Failed to get status from {}", http_hosts[i]))?; + let status: crate::http::routes::TimelineStatus = response.json().await?; + statuses.push((status, i)); + } + + // Find the most advanced safekeeper + // TODO: current logic may be wrong, fix it later + let (status, i) = statuses + .into_iter() + .max_by_key(|(status, _)| { + ( + status.acceptor_state.epoch, + status.flush_lsn, + status.commit_lsn, + ) + }) + .unwrap(); + let safekeeper_host = http_hosts[i].clone(); + + assert!(status.tenant_id == request.tenant_id); + assert!(status.timeline_id == request.timeline_id); + + pull_timeline(status, safekeeper_host).await +} + +async fn pull_timeline(status: TimelineStatus, host: String) -> Result { + let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); + info!( + "Pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", + ttid, + host, + status.commit_lsn, + status.flush_lsn, + status.acceptor_state.term, + status.acceptor_state.epoch + ); + + let conf = &GlobalTimelines::get_global_config(); + + let client = reqwest::Client::new(); + // TODO: don't use debug dump, it should be used only in tests. + // This is a proof of concept, we should figure out a way + // to use scp without implementing it manually. + + // Implementing our own scp over HTTP. + // At first, we need to fetch list of files from safekeeper. + let dump: debug_dump::Response = client + .get(format!( + "{}/v1/debug_dump?dump_all=true&tenant_id={}&timeline_id={}", + host, status.tenant_id, status.timeline_id + )) + .send() + .await? + .json() + .await?; + + if dump.timelines.len() != 1 { + bail!( + "Expected to fetch single timeline, got {} timelines", + dump.timelines.len() + ); + } + + let timeline = dump.timelines.into_iter().next().unwrap(); + let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!( + "Timeline {} doesn't have disk content", + ttid + ))?; + + let mut filenames = disk_content + .files + .iter() + .map(|file| file.name.clone()) + .collect::>(); + + // Sort filenames to make sure we pull files in correct order + // After sorting, we should have: + // - 000000010000000000000001 + // - ... + // - 000000010000000000000002.partial + // - safekeeper.control + filenames.sort(); + + // safekeeper.control should be the first file, so we need to move it to the beginning + let control_file_index = filenames + .iter() + .position(|name| name == "safekeeper.control") + .ok_or(anyhow::anyhow!("safekeeper.control not found"))?; + filenames.remove(control_file_index); + filenames.insert(0, "safekeeper.control".to_string()); + + info!( + "Downloading {} files from safekeeper {}", + filenames.len(), + host + ); + + // Creating temp directory for a new timeline. It needs to be + // located on the same filesystem as the rest of the timelines. + + // conf.workdir is usually /storage/safekeeper/data + // will try to transform it into /storage/safekeeper/tmp + let temp_base = conf + .workdir + .parent() + .ok_or(anyhow::anyhow!("workdir has no parent"))? + .join("tmp"); + + tokio::fs::create_dir_all(&temp_base).await?; + + let tli_dir = tempfile::Builder::new() + .suffix("_temptli") + .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) + .tempdir_in(temp_base)?; + let tli_dir_path = tli_dir.path().to_owned(); + + // Note: some time happens between fetching list of files and fetching files themselves. + // It's possible that some files will be removed from safekeeper and we will fail to fetch them. + // This function will fail in this case, should be retried by the caller. + for filename in filenames { + let file_path = tli_dir_path.join(&filename); + // /v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename + let http_url = format!( + "{}/v1/tenant/{}/timeline/{}/file/{}", + host, status.tenant_id, status.timeline_id, filename + ); + + let mut file = tokio::fs::File::create(&file_path).await?; + let mut response = client.get(&http_url).send().await?; + while let Some(chunk) = response.chunk().await? { + file.write_all(&chunk).await?; + } + } + + // TODO: fsync? + + // Let's create timeline from temp directory and verify that it's correct + + let control_path = tli_dir_path.join("safekeeper.control"); + + let control_store = control_file::FileStorage::load_control_file(control_path)?; + if control_store.server.wal_seg_size == 0 { + bail!("wal_seg_size is not set"); + } + + let wal_store = + wal_storage::PhysicalStorage::new(&ttid, tli_dir_path.clone(), conf, &control_store)?; + + let commit_lsn = status.commit_lsn; + let flush_lsn = wal_store.flush_lsn(); + + info!( + "Finished downloading timeline {}, commit_lsn={}, flush_lsn={}", + ttid, commit_lsn, flush_lsn + ); + assert!(status.commit_lsn <= status.flush_lsn); + + // Move timeline dir to the correct location + let timeline_path = conf.timeline_dir(&ttid); + + info!( + "Moving timeline {} from {} to {}", + ttid, + tli_dir_path.display(), + timeline_path.display() + ); + tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?; + tokio::fs::rename(tli_dir_path, &timeline_path).await?; + + let tli = GlobalTimelines::load_timeline(ttid).context("Failed to load timeline after copy")?; + + info!( + "Loaded timeline {}, flush_lsn={}", + ttid, + tli.get_flush_lsn() + ); + + Ok(Response { + safekeeper_host: host, + }) +} diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 10b4842cbd..33da0c8e5a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -18,7 +18,8 @@ use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; -use pq_proto::{PageserverFeedback, SystemId}; +use pq_proto::SystemId; +use utils::pageserver_feedback::PageserverFeedback; use utils::{ bin_ser::LeSer, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -205,14 +206,13 @@ pub struct SafeKeeperState { pub peers: PersistedPeers, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] // In memory safekeeper state. Fields mirror ones in `SafeKeeperState`; values // are not flushed yet. pub struct SafekeeperMemState { pub commit_lsn: Lsn, pub backup_lsn: Lsn, pub peer_horizon_lsn: Lsn, - pub remote_consistent_lsn: Lsn, #[serde(with = "hex")] pub proposer_uuid: PgUuid, } @@ -347,7 +347,7 @@ pub struct AppendRequestHeader { } /// Report safekeeper state to proposer -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize)] pub struct AppendResponse { // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. @@ -540,7 +540,6 @@ where commit_lsn: state.commit_lsn, backup_lsn: state.backup_lsn, peer_horizon_lsn: state.peer_horizon_lsn, - remote_consistent_lsn: state.remote_consistent_lsn, proposer_uuid: state.proposer_uuid, }, state, @@ -781,10 +780,6 @@ where // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); - // Initializing remote_consistent_lsn sets that we have nothing to - // stream to pageserver(s) immediately after creation. - self.inmem.remote_consistent_lsn = - max(self.inmem.remote_consistent_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); self.persist_control_file(state)?; @@ -837,7 +832,6 @@ where state.commit_lsn = self.inmem.commit_lsn; state.backup_lsn = self.inmem.backup_lsn; state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; - state.remote_consistent_lsn = self.inmem.remote_consistent_lsn; state.proposer_uuid = self.inmem.proposer_uuid; self.state.persist(&state) } @@ -940,14 +934,12 @@ where self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; self.inmem.backup_lsn = new_backup_lsn; - let new_remote_consistent_lsn = max( - Lsn(sk_info.remote_consistent_lsn), - self.inmem.remote_consistent_lsn, - ); + // value in sk_info should be maximized over our local in memory value. + let new_remote_consistent_lsn = Lsn(sk_info.remote_consistent_lsn); + assert!(self.state.remote_consistent_lsn <= new_remote_consistent_lsn); sync_control_file |= self.state.remote_consistent_lsn + (self.state.server.wal_seg_size as u64) < new_remote_consistent_lsn; - self.inmem.remote_consistent_lsn = new_remote_consistent_lsn; let new_peer_horizon_lsn = max(Lsn(sk_info.peer_horizon_lsn), self.inmem.peer_horizon_lsn); sync_control_file |= self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) @@ -955,7 +947,12 @@ where self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; if sync_control_file { - self.persist_control_file(self.state.clone())?; + let mut state = self.state.clone(); + // Note: we do not persist remote_consistent_lsn in other paths of + // persisting cf -- that is not much needed currently. We could do + // that by storing Arc to walsenders in Safekeeper. + state.remote_consistent_lsn = new_remote_consistent_lsn; + self.persist_control_file(state)?; } Ok(()) } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index a6ca89efa4..6b303eb0fe 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -1,21 +1,28 @@ //! This module implements the streaming side of replication protocol, starting -//! with the "START_REPLICATION" message. +//! with the "START_REPLICATION" message, and registry of walsenders. use crate::handler::SafekeeperPostgresHandler; -use crate::timeline::{ReplicaState, Timeline}; +use crate::timeline::Timeline; +use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use crate::GlobalTimelines; use anyhow::Context as AnyhowContext; use bytes::Bytes; +use parking_lot::Mutex; use postgres_backend::PostgresBackend; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; -use pq_proto::{BeMessage, PageserverFeedback, WalSndKeepAlive, XLogDataBody}; +use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use tokio::io::{AsyncRead, AsyncWrite}; +use utils::id::TenantTimelineId; +use utils::lsn::AtomicLsn; +use utils::pageserver_feedback::PageserverFeedback; -use std::cmp::min; +use std::cmp::{max, min}; +use std::net::SocketAddr; use std::str; use std::sync::Arc; use std::time::Duration; @@ -40,6 +47,8 @@ pub struct HotStandbyFeedback { pub catalog_xmin: FullTransactionId, } +const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0; + impl HotStandbyFeedback { pub fn empty() -> HotStandbyFeedback { HotStandbyFeedback { @@ -51,24 +60,294 @@ impl HotStandbyFeedback { } /// Standby status update -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct StandbyReply { - pub write_lsn: Lsn, // last lsn received by pageserver - pub flush_lsn: Lsn, // pageserver's disk consistent lSN - pub apply_lsn: Lsn, // pageserver's remote consistent lSN - pub reply_ts: TimestampTz, + pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby. + pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby. + pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby. + pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01. pub reply_requested: bool, } -/// Scope guard to unregister replication connection from timeline -struct ReplicationConnGuard { - replica: usize, // replica internal ID assigned by timeline - timeline: Arc, +impl StandbyReply { + fn empty() -> Self { + StandbyReply { + write_lsn: Lsn::INVALID, + flush_lsn: Lsn::INVALID, + apply_lsn: Lsn::INVALID, + reply_ts: 0, + reply_requested: false, + } + } } -impl Drop for ReplicationConnGuard { +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct StandbyFeedback { + reply: StandbyReply, + hs_feedback: HotStandbyFeedback, +} + +/// WalSenders registry. Timeline holds it (wrapped in Arc). +pub struct WalSenders { + /// Lsn maximized over all walsenders *and* peer data, so might be higher + /// than what we receive from replicas. + remote_consistent_lsn: AtomicLsn, + mutex: Mutex, +} + +impl WalSenders { + pub fn new(remote_consistent_lsn: Lsn) -> Arc { + Arc::new(WalSenders { + remote_consistent_lsn: AtomicLsn::from(remote_consistent_lsn), + mutex: Mutex::new(WalSendersShared::new()), + }) + } + + /// Register new walsender. Returned guard provides access to the slot and + /// automatically deregisters in Drop. + fn register( + self: &Arc, + ttid: TenantTimelineId, + addr: SocketAddr, + conn_id: ConnectionId, + appname: Option, + ) -> WalSenderGuard { + let slots = &mut self.mutex.lock().slots; + let walsender_state = WalSenderState { + ttid, + addr, + conn_id, + appname, + feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), + }; + // find empty slot or create new one + let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) { + slots[pos] = Some(walsender_state); + pos + } else { + let pos = slots.len(); + slots.push(Some(walsender_state)); + pos + }; + WalSenderGuard { + id: pos, + walsenders: self.clone(), + } + } + + /// Get state of all walsenders. + pub fn get_all(self: &Arc) -> Vec { + self.mutex.lock().slots.iter().flatten().cloned().collect() + } + + /// Get aggregated pageserver feedback. + pub fn get_ps_feedback(self: &Arc) -> PageserverFeedback { + self.mutex.lock().agg_ps_feedback + } + + /// Get aggregated pageserver and hot standby feedback (we send them to compute). + pub fn get_feedbacks(self: &Arc) -> (PageserverFeedback, HotStandbyFeedback) { + let shared = self.mutex.lock(); + (shared.agg_ps_feedback, shared.agg_hs_feedback) + } + + /// Record new pageserver feedback, update aggregated values. + fn record_ps_feedback(self: &Arc, id: WalSenderId, feedback: &PageserverFeedback) { + let mut shared = self.mutex.lock(); + shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback); + shared.update_ps_feedback(); + self.update_remote_consistent_lsn(shared.agg_ps_feedback.remote_consistent_lsn); + } + + /// Record standby reply. + fn record_standby_reply(self: &Arc, id: WalSenderId, reply: &StandbyReply) { + let mut shared = self.mutex.lock(); + let slot = shared.get_slot_mut(id); + match &mut slot.feedback { + ReplicationFeedback::Standby(sf) => sf.reply = *reply, + ReplicationFeedback::Pageserver(_) => { + slot.feedback = ReplicationFeedback::Standby(StandbyFeedback { + reply: *reply, + hs_feedback: HotStandbyFeedback::empty(), + }) + } + } + } + + /// Record hot standby feedback, update aggregated value. + fn record_hs_feedback(self: &Arc, id: WalSenderId, feedback: &HotStandbyFeedback) { + let mut shared = self.mutex.lock(); + let slot = shared.get_slot_mut(id); + match &mut slot.feedback { + ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback, + ReplicationFeedback::Pageserver(_) => { + slot.feedback = ReplicationFeedback::Standby(StandbyFeedback { + reply: StandbyReply::empty(), + hs_feedback: *feedback, + }) + } + } + shared.update_hs_feedback(); + } + + /// Get remote_consistent_lsn reported by the pageserver. Returns None if + /// client is not pageserver. + fn get_ws_remote_consistent_lsn(self: &Arc, id: WalSenderId) -> Option { + let shared = self.mutex.lock(); + let slot = shared.get_slot(id); + match slot.feedback { + ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn), + _ => None, + } + } + + /// Get remote_consistent_lsn maximized across all walsenders and peers. + pub fn get_remote_consistent_lsn(self: &Arc) -> Lsn { + self.remote_consistent_lsn.load() + } + + /// Update maximized remote_consistent_lsn, return new (potentially) value. + pub fn update_remote_consistent_lsn(self: &Arc, candidate: Lsn) -> Lsn { + self.remote_consistent_lsn + .fetch_max(candidate) + .max(candidate) + } + + /// Unregister walsender. + fn unregister(self: &Arc, id: WalSenderId) { + let mut shared = self.mutex.lock(); + shared.slots[id] = None; + shared.update_hs_feedback(); + } +} + +struct WalSendersShared { + // aggregated over all walsenders value + agg_hs_feedback: HotStandbyFeedback, + // aggregated over all walsenders value + agg_ps_feedback: PageserverFeedback, + slots: Vec>, +} + +impl WalSendersShared { + fn new() -> Self { + WalSendersShared { + agg_hs_feedback: HotStandbyFeedback::empty(), + agg_ps_feedback: PageserverFeedback::empty(), + slots: Vec::new(), + } + } + + /// Get content of provided id slot, it must exist. + fn get_slot(&self, id: WalSenderId) -> &WalSenderState { + self.slots[id].as_ref().expect("walsender doesn't exist") + } + + /// Get mut content of provided id slot, it must exist. + fn get_slot_mut(&mut self, id: WalSenderId) -> &mut WalSenderState { + self.slots[id].as_mut().expect("walsender doesn't exist") + } + + /// Update aggregated hot standy feedback. We just take min of valid xmins + /// and ts. + fn update_hs_feedback(&mut self) { + let mut agg = HotStandbyFeedback::empty(); + for ws_state in self.slots.iter().flatten() { + if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback { + let hs_feedback = standby_feedback.hs_feedback; + // doing Option math like op1.iter().chain(op2.iter()).min() + // would be nicer, but we serialize/deserialize this struct + // directly, so leave as is for now + if hs_feedback.xmin != INVALID_FULL_TRANSACTION_ID { + if agg.xmin != INVALID_FULL_TRANSACTION_ID { + agg.xmin = min(agg.xmin, hs_feedback.xmin); + } else { + agg.xmin = hs_feedback.xmin; + } + agg.ts = min(agg.ts, hs_feedback.ts); + } + if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID { + if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID { + agg.catalog_xmin = min(agg.catalog_xmin, hs_feedback.catalog_xmin); + } else { + agg.catalog_xmin = hs_feedback.catalog_xmin; + } + agg.ts = min(agg.ts, hs_feedback.ts); + } + } + } + self.agg_hs_feedback = agg; + } + + /// Update aggregated pageserver feedback. LSNs (last_received, + /// disk_consistent, remote_consistent) and reply timestamp are just + /// maximized; timeline_size if taken from feedback with highest + /// last_received lsn. This is generally reasonable, but we might want to + /// implement other policies once multiple pageservers start to be actively + /// used. + fn update_ps_feedback(&mut self) { + let init = PageserverFeedback::empty(); + let acc = + self.slots + .iter() + .flatten() + .fold(init, |mut acc, ws_state| match ws_state.feedback { + ReplicationFeedback::Pageserver(feedback) => { + if feedback.last_received_lsn > acc.last_received_lsn { + acc.current_timeline_size = feedback.current_timeline_size; + } + acc.last_received_lsn = + max(feedback.last_received_lsn, acc.last_received_lsn); + acc.disk_consistent_lsn = + max(feedback.disk_consistent_lsn, acc.disk_consistent_lsn); + acc.remote_consistent_lsn = + max(feedback.remote_consistent_lsn, acc.remote_consistent_lsn); + acc.replytime = max(feedback.replytime, acc.replytime); + acc + } + ReplicationFeedback::Standby(_) => acc, + }); + self.agg_ps_feedback = acc; + } +} + +// Serialized is used only for pretty printing in json. +#[serde_as] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalSenderState { + #[serde_as(as = "DisplayFromStr")] + ttid: TenantTimelineId, + addr: SocketAddr, + conn_id: ConnectionId, + // postgres application_name + appname: Option, + feedback: ReplicationFeedback, +} + +// Receiver is either pageserver or regular standby, which have different +// feedbacks. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +enum ReplicationFeedback { + Pageserver(PageserverFeedback), + Standby(StandbyFeedback), +} + +// id of the occupied slot in WalSenders to access it (and save in the +// WalSenderGuard). We could give Arc directly to the slot, but there is not +// much sense in that as values aggregation which is performed on each feedback +// receival iterates over all walsenders. +pub type WalSenderId = usize; + +/// Scope guard to access slot in WalSenders registry and unregister from it in +/// Drop. +pub struct WalSenderGuard { + id: WalSenderId, + walsenders: Arc, +} + +impl Drop for WalSenderGuard { fn drop(&mut self) { - self.timeline.remove_replica(self.replica); + self.walsenders.unregister(self.id); } } @@ -97,16 +376,13 @@ impl SafekeeperPostgresHandler { let tli = GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?; - let state = ReplicaState::new(); - // This replica_id is used below to check if it's time to stop replication. - let replica_id = tli.add_replica(state); - - // Use a guard object to remove our entry from the timeline, when the background - // thread and us have both finished using it. - let _guard = Arc::new(ReplicationConnGuard { - replica: replica_id, - timeline: tli.clone(), - }); + // Use a guard object to remove our entry from the timeline when we are done. + let ws_guard = Arc::new(tli.get_walsenders().register( + self.ttid, + *pgb.get_peer_addr(), + self.conn_id, + self.appname.clone(), + )); // Walproposer gets special handling: safekeeper must give proposer all // local WAL till the end, whether committed or not (walproposer will @@ -154,16 +430,11 @@ impl SafekeeperPostgresHandler { end_pos, stop_pos, commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), - replica_id, + ws_guard: ws_guard.clone(), wal_reader, send_buf: [0; MAX_SEND_SIZE], }; - let mut reply_reader = ReplyReader { - reader, - tli, - replica_id, - feedback: ReplicaState::new(), - }; + let mut reply_reader = ReplyReader { reader, ws_guard }; let res = tokio::select! { // todo: add read|write .context to these errors @@ -190,7 +461,7 @@ struct WalSender<'a, IO> { // in recovery. stop_pos: Option, commit_lsn_watch_rx: Receiver, - replica_id: usize, + ws_guard: Arc, wal_reader: WalReader, // buffer for readling WAL into to send it send_buf: [u8; MAX_SEND_SIZE], @@ -264,14 +535,20 @@ impl WalSender<'_, IO> { return Ok(()); } // Timed out waiting for WAL, check for termination and send KA - if self.tli.should_walsender_stop(self.replica_id) { - // Terminate if there is nothing more to send. - // TODO close the stream properly - return Err(CopyStreamHandlerEnd::ServerInitiated(format!( - "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", - self.appname, self.start_pos, - ))); + if let Some(remote_consistent_lsn) = self + .ws_guard + .walsenders + .get_ws_remote_consistent_lsn(self.ws_guard.id) + { + if self.tli.should_walsender_stop(remote_consistent_lsn) { + // Terminate if there is nothing more to send. + return Err(CopyStreamHandlerEnd::ServerInitiated(format!( + "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", + self.appname, self.start_pos, + ))); + } } + self.pgb .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { sent_ptr: self.end_pos.0, @@ -286,9 +563,7 @@ impl WalSender<'_, IO> { /// A half driving receiving replies. struct ReplyReader { reader: PostgresBackendReader, - tli: Arc, - replica_id: usize, - feedback: ReplicaState, + ws_guard: Arc, } impl ReplyReader { @@ -303,29 +578,32 @@ impl ReplyReader { match msg.first().cloned() { Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { // Note: deserializing is on m[1..] because we skip the tag byte. - self.feedback.hs_feedback = HotStandbyFeedback::des(&msg[1..]) + let hs_feedback = HotStandbyFeedback::des(&msg[1..]) .context("failed to deserialize HotStandbyFeedback")?; - self.tli - .update_replica_state(self.replica_id, self.feedback); + self.ws_guard + .walsenders + .record_hs_feedback(self.ws_guard.id, &hs_feedback); } Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => { - let _reply = + let reply = StandbyReply::des(&msg[1..]).context("failed to deserialize StandbyReply")?; - // This must be a regular postgres replica, - // because pageserver doesn't send this type of messages to safekeeper. - // Currently we just ignore this, tracking progress for them is not supported. + self.ws_guard + .walsenders + .record_standby_reply(self.ws_guard.id, &reply); } Some(NEON_STATUS_UPDATE_TAG_BYTE) => { // pageserver sends this. // Note: deserializing is on m[9..] because we skip the tag byte and len bytes. let buf = Bytes::copy_from_slice(&msg[9..]); - let reply = PageserverFeedback::parse(buf); + let ps_feedback = PageserverFeedback::parse(buf); - trace!("PageserverFeedback is {:?}", reply); - self.feedback.pageserver_feedback = Some(reply); - - self.tli - .update_replica_state(self.replica_id, self.feedback); + trace!("PageserverFeedback is {:?}", ps_feedback); + self.ws_guard + .walsenders + .record_ps_feedback(self.ws_guard.id, &ps_feedback); + // in principle new remote_consistent_lsn could allow to + // deactivate the timeline, but we check that regularly through + // broker updated, not need to do it here } _ => warn!("unexpected message {:?}", msg), } @@ -368,3 +646,89 @@ async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> anyhow::Result