diff --git a/.circleci/ansible/neon-stress.hosts b/.circleci/ansible/neon-stress.hosts new file mode 100644 index 0000000000..283ec0e8b3 --- /dev/null +++ b/.circleci/ansible/neon-stress.hosts @@ -0,0 +1,19 @@ +[pageservers] +neon-stress-ps-1 console_region_id=1 +neon-stress-ps-2 console_region_id=1 + +[safekeepers] +neon-stress-sk-1 console_region_id=1 +neon-stress-sk-2 console_region_id=1 +neon-stress-sk-3 console_region_id=1 + +[storage:children] +pageservers +safekeepers + +[storage:vars] +console_mgmt_base_url = http://neon-stress-console.local +bucket_name = neon-storage-ireland +bucket_region = eu-west-1 +etcd_endpoints = etcd-stress.local:2379 +safekeeper_enable_s3_offload = false diff --git a/.circleci/ansible/production.hosts b/.circleci/ansible/production.hosts index 2ed8f517f7..03c6cf57e0 100644 --- a/.circleci/ansible/production.hosts +++ b/.circleci/ansible/production.hosts @@ -1,5 +1,6 @@ [pageservers] -zenith-1-ps-1 console_region_id=1 +#zenith-1-ps-1 console_region_id=1 +zenith-1-ps-2 console_region_id=1 [safekeepers] zenith-1-sk-1 console_region_id=1 @@ -15,4 +16,3 @@ console_mgmt_base_url = http://console-release.local bucket_name = zenith-storage-oregon bucket_region = us-west-2 etcd_endpoints = etcd-release.local:2379 -safekeeper_enable_s3_offload = true diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index 8e89e843d9..cf5b98eaa1 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -6,6 +6,7 @@ zenith-us-stage-ps-2 console_region_id=27 zenith-us-stage-sk-1 console_region_id=27 zenith-us-stage-sk-4 console_region_id=27 zenith-us-stage-sk-5 console_region_id=27 +zenith-us-stage-sk-6 console_region_id=27 [storage:children] pageservers @@ -16,4 +17,3 @@ console_mgmt_base_url = http://console-staging.local bucket_name = zenith-staging-storage-us-east-1 bucket_region = us-east-1 etcd_endpoints = etcd-staging.local:2379 -safekeeper_enable_s3_offload = false diff --git a/.circleci/ansible/systemd/pageserver.service b/.circleci/ansible/systemd/pageserver.service index d346643e58..54a7b1ba0a 100644 --- a/.circleci/ansible/systemd/pageserver.service +++ b/.circleci/ansible/systemd/pageserver.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=pageserver Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib -ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /storage/pageserver/data +ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT diff --git a/.circleci/ansible/systemd/safekeeper.service b/.circleci/ansible/systemd/safekeeper.service index 55088db859..e4a395a60e 100644 --- a/.circleci/ansible/systemd/safekeeper.service +++ b/.circleci/ansible/systemd/safekeeper.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=safekeeper Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib -ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }} +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT diff --git a/.circleci/config.yml b/.circleci/config.yml index 85654b5d45..624d367053 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,15 +11,6 @@ executors: - image: zimg/rust:1.58 jobs: - check-codestyle-rust: - executor: neon-xlarge-executor - steps: - - checkout - - run: - name: rustfmt - when: always - command: cargo fmt --all -- --check - # A job to build postgres build-postgres: executor: neon-xlarge-executor @@ -222,6 +213,12 @@ jobs: key: v2-python-deps-{{ checksum "poetry.lock" }} paths: - /home/circleci/.cache/pypoetry/virtualenvs + - run: + name: Print versions + when: always + command: | + poetry run python --version + poetry show - run: name: Run yapf to ensure code format when: always @@ -355,7 +352,7 @@ jobs: when: always command: | du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" -delete + find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete du -sh /tmp/test_output/* - store_artifacts: path: /tmp/test_output @@ -587,6 +584,56 @@ jobs: helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait + deploy-neon-stress: + docker: + - image: cimg/python:3.10 + steps: + - checkout + - setup_remote_docker + - run: + name: Setup ansible + command: | + pip install --progress-bar off --user ansible boto3 + - run: + name: Redeploy + command: | + cd "$(pwd)/.circleci/ansible" + + ./get_binaries.sh + + echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key + echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub + chmod 0600 ssh-key + ssh-add ssh-key + rm -f ssh-key ssh-key-cert.pub + + ansible-playbook deploy.yaml -i neon-stress.hosts + rm -f neon_install.tar.gz .neon_current_version + + deploy-neon-stress-proxy: + docker: + - image: cimg/base:2021.04 + environment: + KUBECONFIG: .kubeconfig + steps: + - checkout + - run: + name: Store kubeconfig file + command: | + echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + - run: + name: Setup helm v3 + command: | + curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + helm repo add neondatabase https://neondatabase.github.io/helm-charts + - run: + name: Re-deploy proxy + command: | + DOCKER_TAG=$(git log --oneline|wc -l) + helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait + helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait + deploy-release: docker: - image: cimg/python:3.10 @@ -629,12 +676,13 @@ jobs: name: Setup helm v3 command: | curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add zenithdb https://neondatabase.github.io/helm-charts + helm repo add neondatabase https://neondatabase.github.io/helm-charts - run: name: Re-deploy proxy command: | DOCKER_TAG="release-$(git log --oneline|wc -l)" - helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait + helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait + helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait # Trigger a new remote CI job remote-ci-trigger: @@ -683,7 +731,6 @@ jobs: workflows: build_and_test: jobs: - - check-codestyle-rust - check-codestyle-python - build-postgres: name: build-postgres-<< matrix.build_type >> @@ -771,6 +818,25 @@ workflows: requires: - docker-image + - deploy-neon-stress: + # Context gives an ability to login + context: Docker Hub + # deploy only for commits to main + filters: + branches: + only: + - main + requires: + - docker-image + - deploy-neon-stress-proxy: + # deploy only for commits to main + filters: + branches: + only: + - main + requires: + - docker-image + - docker-image-release: # Context gives an ability to login context: Docker Hub diff --git a/.circleci/helm-values/neon-stress.proxy-scram.yaml b/.circleci/helm-values/neon-stress.proxy-scram.yaml new file mode 100644 index 0000000000..8f55d31c87 --- /dev/null +++ b/.circleci/helm-values/neon-stress.proxy-scram.yaml @@ -0,0 +1,26 @@ +fullnameOverride: "neon-stress-proxy-scram" + +settings: + authBackend: "console" + authEndpoint: "http://neon-stress-console.local/management/api/v2" + domain: "*.stress.neon.tech" + +podLabels: + zenith_service: proxy-scram + zenith_env: staging + zenith_region: eu-west-1 + zenith_region_slug: ireland + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech' + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack diff --git a/.circleci/helm-values/neon-stress.proxy.yaml b/.circleci/helm-values/neon-stress.proxy.yaml new file mode 100644 index 0000000000..8236f9873a --- /dev/null +++ b/.circleci/helm-values/neon-stress.proxy.yaml @@ -0,0 +1,34 @@ +fullnameOverride: "neon-stress-proxy" + +settings: + authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/" + uri: "https://console.dev.neon.tech/psql_session/" + +# -- Additional labels for zenith-proxy pods +podLabels: + zenith_service: proxy + zenith_env: staging + zenith_region: eu-west-1 + zenith_region_slug: ireland + +service: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal + external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local + type: LoadBalancer + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack diff --git a/.circleci/helm-values/production.proxy-scram.yaml b/.circleci/helm-values/production.proxy-scram.yaml new file mode 100644 index 0000000000..54b0fbcd98 --- /dev/null +++ b/.circleci/helm-values/production.proxy-scram.yaml @@ -0,0 +1,24 @@ +settings: + authBackend: "console" + authEndpoint: "http://console-release.local/management/api/v2" + domain: "*.cloud.neon.tech" + +podLabels: + zenith_service: proxy-scram + zenith_env: production + zenith_region: us-west-2 + zenith_region_slug: oregon + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech' + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack diff --git a/.circleci/helm-values/production.proxy.yaml b/.circleci/helm-values/production.proxy.yaml index e13968a6a8..87c61c90cf 100644 --- a/.circleci/helm-values/production.proxy.yaml +++ b/.circleci/helm-values/production.proxy.yaml @@ -1,9 +1,3 @@ -# Helm chart values for zenith-proxy. -# This is a YAML-formatted file. - -image: - repository: neondatabase/neon - settings: authEndpoint: "https://console.neon.tech/authenticate_proxy_request/" uri: "https://console.neon.tech/psql_session/" @@ -28,7 +22,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: start.zenith.tech,connect.neon.tech,pg.neon.tech + external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech metrics: enabled: true diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 6d109b9bb5..1ce1b64a49 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -1,6 +1,10 @@ name: Build and Test -on: push +on: + push: + branches: + - main + pull_request: jobs: regression-check: @@ -21,13 +25,17 @@ jobs: submodules: true fetch-depth: 2 - - name: install rust toolchain ${{ matrix.rust_toolchain }} + - name: Install rust toolchain ${{ matrix.rust_toolchain }} uses: actions-rs/toolchain@v1 with: profile: minimal toolchain: ${{ matrix.rust_toolchain }} + components: rustfmt, clippy override: true + - name: Check formatting + run: cargo fmt --all -- --check + - name: Install Ubuntu postgres dependencies if: matrix.os == 'ubuntu-latest' run: | diff --git a/COPYRIGHT b/COPYRIGHT deleted file mode 100644 index 448363b12f..0000000000 --- a/COPYRIGHT +++ /dev/null @@ -1,20 +0,0 @@ -This software is licensed under the Apache 2.0 License: - ----------------------------------------------------------------------------- -Copyright 2021 Zenith Labs, Inc - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. ----------------------------------------------------------------------------- - -The PostgreSQL submodule in vendor/postgres is licensed under the -PostgreSQL license. See vendor/postgres/COPYRIGHT. diff --git a/Cargo.lock b/Cargo.lock index c56b9519fe..a67e8586d8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -166,7 +166,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.4.4", "object", "rustc-demangle", ] @@ -871,6 +871,18 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e" +[[package]] +name = "flate2" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39522e96686d38f4bc984b9198e3a0613264abaebaff2c5c918bfa6b6da09af" +dependencies = [ + "cfg-if", + "crc32fast", + "libc", + "miniz_oxide 0.5.1", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1539,6 +1551,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "miniz_oxide" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2b29bd4bc3f33391105ebee3589c19197c4271e3e5a9ec9bfe8127eeff8f082" +dependencies = [ + "adler", +] + [[package]] name = "mio" version = "0.8.2" @@ -1713,9 +1734,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" +checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" [[package]] name = "oorandom" @@ -1784,6 +1805,7 @@ dependencies = [ "crc32c", "crossbeam-utils", "daemonize", + "etcd_broker", "fail", "futures", "git-version", @@ -2038,15 +2060,18 @@ dependencies = [ "bytes", "chrono", "crc32c", + "env_logger", "hex", "lazy_static", "log", "memoffset", + "postgres", "rand", "regex", "serde", "thiserror", "utils", + "wal_generate", "workspace_hack", ] @@ -2100,6 +2125,20 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "procfs" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95e344cafeaeefe487300c361654bcfc85db3ac53619eeccced29f5ea18c4c70" +dependencies = [ + "bitflags", + "byteorder", + "flate2", + "hex", + "lazy_static", + "libc", +] + [[package]] name = "prometheus" version = "0.13.0" @@ -2109,8 +2148,10 @@ dependencies = [ "cfg-if", "fnv", "lazy_static", + "libc", "memchr", "parking_lot 0.11.2", + "procfs", "thiserror", ] @@ -2366,6 +2407,8 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "metrics", + "once_cell", "rusoto_core", "rusoto_s3", "serde", @@ -2373,6 +2416,7 @@ dependencies = [ "tempfile", "tokio", "tokio-util 0.7.0", + "toml_edit", "tracing", "workspace_hack", ] @@ -2624,6 +2668,7 @@ name = "safekeeper" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "byteorder", "bytes", "clap 3.0.14", @@ -2632,12 +2677,14 @@ dependencies = [ "daemonize", "etcd_broker", "fs2", + "futures", "git-version", "hex", "humantime", "hyper", "lazy_static", "metrics", + "once_cell", "postgres", "postgres-protocol", "postgres_ffi", @@ -2651,6 +2698,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-util 0.7.0", + "toml_edit", "tracing", "url", "utils", @@ -3602,6 +3650,18 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "wal_generate" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap 3.0.14", + "env_logger", + "log", + "postgres", + "tempfile", +] + [[package]] name = "walkdir" version = "2.3.2" diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index bbe0f517ce..f0c9b9d56a 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -15,4 +15,4 @@ RUN set -e \ # Final image that only has one binary FROM debian:buster-slim -COPY --from=rust-build /home/circleci/project/target/release/zenith_ctl /usr/local/bin/zenith_ctl +COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl diff --git a/Makefile b/Makefile index d2a79661f2..fdfc64f6fa 100644 --- a/Makefile +++ b/Makefile @@ -12,15 +12,21 @@ endif # BUILD_TYPE ?= debug ifeq ($(BUILD_TYPE),release) - PG_CONFIGURE_OPTS = --enable-debug + PG_CONFIGURE_OPTS = --enable-debug --with-openssl PG_CFLAGS = -O2 -g3 $(CFLAGS) # Unfortunately, `--profile=...` is a nightly feature CARGO_BUILD_FLAGS += --release else ifeq ($(BUILD_TYPE),debug) - PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend + PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend PG_CFLAGS = -O0 -g3 $(CFLAGS) else -$(error Bad build type `$(BUILD_TYPE)', see Makefile for options) + $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) +endif + +# macOS with brew-installed openssl requires explicit paths +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) + PG_CONFIGURE_OPTS += --with-includes=/usr/local/opt/openssl/include --with-libraries=/usr/local/opt/openssl/lib endif # Choose whether we should be silent or verbose diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000000..47cc4e798f --- /dev/null +++ b/NOTICE @@ -0,0 +1,5 @@ +Neon +Copyright 2022 Neon Inc. + +The PostgreSQL submodule in vendor/postgres is licensed under the +PostgreSQL license. See vendor/postgres/COPYRIGHT. diff --git a/README.md b/README.md index af384d2672..8e8bf1a9b2 100644 --- a/README.md +++ b/README.md @@ -23,29 +23,70 @@ Pageserver consists of: ## Running local installation + +#### building on Ubuntu/ Debian (Linux) 1. Install build dependencies and other useful packages On Ubuntu or Debian this set of packages should be sufficient to build the code: ```text apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ -libssl-dev clang pkg-config libpq-dev +libssl-dev clang pkg-config libpq-dev libprotobuf-dev etcd ``` -[Rust] 1.58 or later is also required. +2. [Install Rust](https://www.rust-lang.org/tools/install) +``` +# recommended approach from https://www.rust-lang.org/tools/install +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +``` -To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively. +3. Install PostgreSQL Client +``` +apt install postgresql-client +``` -To run the integration tests or Python scripts (not required to use the code), install -Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory. - -2. Build neon and patched postgres +4. Build neon and patched postgres ```sh git clone --recursive https://github.com/neondatabase/neon.git cd neon make -j5 ``` -3. Start pageserver and postgres on top of it (should be called from repo root): +#### building on OSX (12.3.1) +1. Install XCode and dependencies +``` +xcode-select --install +brew install protobuf etcd +``` + +2. [Install Rust](https://www.rust-lang.org/tools/install) +``` +# recommended approach from https://www.rust-lang.org/tools/install +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +``` + +3. Install PostgreSQL Client +``` +# from https://stackoverflow.com/questions/44654216/correct-way-to-install-psql-without-full-postgres-on-macos +brew install libpq +brew link --force libpq +``` + +4. Build neon and patched postgres +```sh +git clone --recursive https://github.com/neondatabase/neon.git +cd neon +make -j5 +``` + +#### dependency installation notes +To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively. + +To run the integration tests or Python scripts (not required to use the code), install +Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory. + + +#### running neon database +1. Start pageserver and postgres on top of it (should be called from repo root): ```sh # Create repository in .zenith with proper paths to binaries and data # Later that would be responsibility of a package install script @@ -75,7 +116,7 @@ Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=po main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running ``` -4. Now it is possible to connect to postgres and run some queries: +2. Now it is possible to connect to postgres and run some queries: ```text > psql -p55432 -h 127.0.0.1 -U zenith_admin postgres postgres=# CREATE TABLE t(key int primary key, value text); @@ -89,7 +130,7 @@ postgres=# select * from t; (1 row) ``` -5. And create branches and run postgres on them: +3. And create branches and run postgres on them: ```sh # create branch named migration_check > ./target/debug/neon_local timeline branch --branch-name migration_check @@ -133,7 +174,7 @@ postgres=# select * from t; (1 row) ``` -6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances +4. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances you have just started. You can stop them all with one command: ```sh > ./target/debug/neon_local stop diff --git a/compute_tools/README.md b/compute_tools/README.md index ccae3d2842..15876ed246 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -1,9 +1,9 @@ # Compute node tools -Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` -`ExecStart` option. It will handle all the `zenith` specifics during compute node +Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` +`ExecStart` option. It will handle all the `Neon` specifics during compute node initialization: -- `zenith_ctl` accepts cluster (compute node) specification as a JSON file. +- `compute_ctl` accepts cluster (compute node) specification as a JSON file. - Every start is a fresh start, so the data directory is removed and initialized again on each run. - Next it will put configuration files into the `PGDATA` directory. @@ -13,18 +13,18 @@ initialization: - Check and alter/drop/create roles and databases. - Hang waiting on the `postmaster` process to exit. -Also `zenith_ctl` spawns two separate service threads: +Also `compute_ctl` spawns two separate service threads: - `compute-monitor` checks the last Postgres activity timestamp and saves it - into the shared `ComputeState`; + into the shared `ComputeNode`; - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the last activity requests. Usage example: ```sh -zenith_ctl -D /var/db/postgres/compute \ - -C 'postgresql://zenith_admin@localhost/postgres' \ - -S /var/db/postgres/specs/current.json \ - -b /usr/local/bin/postgres +compute_ctl -D /var/db/postgres/compute \ + -C 'postgresql://zenith_admin@localhost/postgres' \ + -S /var/db/postgres/specs/current.json \ + -b /usr/local/bin/postgres ``` ## Tests diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs new file mode 100644 index 0000000000..5c951b7779 --- /dev/null +++ b/compute_tools/src/bin/compute_ctl.rs @@ -0,0 +1,174 @@ +//! +//! Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` +//! `ExecStart` option. It will handle all the `Neon` specifics during compute node +//! initialization: +//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file. +//! - Every start is a fresh start, so the data directory is removed and +//! initialized again on each run. +//! - Next it will put configuration files into the `PGDATA` directory. +//! - Sync safekeepers and get commit LSN. +//! - Get `basebackup` from pageserver using the returned on the previous step LSN. +//! - Try to start `postgres` and wait until it is ready to accept connections. +//! - Check and alter/drop/create roles and databases. +//! - Hang waiting on the `postmaster` process to exit. +//! +//! Also `compute_ctl` spawns two separate service threads: +//! - `compute-monitor` checks the last Postgres activity timestamp and saves it +//! into the shared `ComputeNode`; +//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the +//! last activity requests. +//! +//! Usage example: +//! ```sh +//! compute_ctl -D /var/db/postgres/compute \ +//! -C 'postgresql://zenith_admin@localhost/postgres' \ +//! -S /var/db/postgres/specs/current.json \ +//! -b /usr/local/bin/postgres +//! ``` +//! +use std::fs::File; +use std::panic; +use std::path::Path; +use std::process::exit; +use std::sync::{Arc, RwLock}; +use std::{thread, time::Duration}; + +use anyhow::Result; +use chrono::Utc; +use clap::Arg; +use log::{error, info}; + +use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; +use compute_tools::http::api::launch_http_server; +use compute_tools::logger::*; +use compute_tools::monitor::launch_monitor; +use compute_tools::params::*; +use compute_tools::pg_helpers::*; +use compute_tools::spec::*; + +fn main() -> Result<()> { + // TODO: re-use `utils::logging` later + init_logger(DEFAULT_LOG_LEVEL)?; + + // Env variable is set by `cargo` + let version: Option<&str> = option_env!("CARGO_PKG_VERSION"); + let matches = clap::App::new("compute_ctl") + .version(version.unwrap_or("unknown")) + .arg( + Arg::new("connstr") + .short('C') + .long("connstr") + .value_name("DATABASE_URL") + .required(true), + ) + .arg( + Arg::new("pgdata") + .short('D') + .long("pgdata") + .value_name("DATADIR") + .required(true), + ) + .arg( + Arg::new("pgbin") + .short('b') + .long("pgbin") + .value_name("POSTGRES_PATH"), + ) + .arg( + Arg::new("spec") + .short('s') + .long("spec") + .value_name("SPEC_JSON"), + ) + .arg( + Arg::new("spec-path") + .short('S') + .long("spec-path") + .value_name("SPEC_PATH"), + ) + .get_matches(); + + let pgdata = matches.value_of("pgdata").expect("PGDATA path is required"); + let connstr = matches + .value_of("connstr") + .expect("Postgres connection string is required"); + let spec = matches.value_of("spec"); + let spec_path = matches.value_of("spec-path"); + + // Try to use just 'postgres' if no path is provided + let pgbin = matches.value_of("pgbin").unwrap_or("postgres"); + + let spec: ComputeSpec = match spec { + // First, try to get cluster spec from the cli argument + Some(json) => serde_json::from_str(json)?, + None => { + // Second, try to read it from the file if path is provided + if let Some(sp) = spec_path { + let path = Path::new(sp); + let file = File::open(path)?; + serde_json::from_reader(file)? + } else { + panic!("cluster spec should be provided via --spec or --spec-path argument"); + } + } + }; + + let pageserver_connstr = spec + .cluster + .settings + .find("zenith.page_server_connstring") + .expect("pageserver connstr should be provided"); + let tenant = spec + .cluster + .settings + .find("zenith.zenith_tenant") + .expect("tenant id should be provided"); + let timeline = spec + .cluster + .settings + .find("zenith.zenith_timeline") + .expect("tenant id should be provided"); + + let compute_state = ComputeNode { + start_time: Utc::now(), + connstr: connstr.to_string(), + pgdata: pgdata.to_string(), + pgbin: pgbin.to_string(), + spec, + tenant, + timeline, + pageserver_connstr, + metrics: ComputeMetrics::new(), + state: RwLock::new(ComputeState::new()), + }; + let compute = Arc::new(compute_state); + + // Launch service threads first, so we were able to serve availability + // requests, while configuration is still in progress. + let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); + let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread"); + + // Run compute (Postgres) and hang waiting on it. + match compute.prepare_and_run() { + Ok(ec) => { + let code = ec.code().unwrap_or(1); + info!("Postgres exited with code {}, shutting down", code); + exit(code) + } + Err(error) => { + error!("could not start the compute node: {}", error); + + let mut state = compute.state.write().unwrap(); + state.error = Some(format!("{:?}", error)); + state.status = ComputeStatus::Failed; + drop(state); + + // Keep serving HTTP requests, so the cloud control plane was able to + // get the actual error. + info!("giving control plane 30s to collect the error before shutdown"); + thread::sleep(Duration::from_secs(30)); + info!("shutting down"); + Err(error) + } + } +} diff --git a/compute_tools/src/bin/zenith_ctl.rs b/compute_tools/src/bin/zenith_ctl.rs deleted file mode 100644 index 3685f8e8b4..0000000000 --- a/compute_tools/src/bin/zenith_ctl.rs +++ /dev/null @@ -1,252 +0,0 @@ -//! -//! Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` -//! `ExecStart` option. It will handle all the `zenith` specifics during compute node -//! initialization: -//! - `zenith_ctl` accepts cluster (compute node) specification as a JSON file. -//! - Every start is a fresh start, so the data directory is removed and -//! initialized again on each run. -//! - Next it will put configuration files into the `PGDATA` directory. -//! - Sync safekeepers and get commit LSN. -//! - Get `basebackup` from pageserver using the returned on the previous step LSN. -//! - Try to start `postgres` and wait until it is ready to accept connections. -//! - Check and alter/drop/create roles and databases. -//! - Hang waiting on the `postmaster` process to exit. -//! -//! Also `zenith_ctl` spawns two separate service threads: -//! - `compute-monitor` checks the last Postgres activity timestamp and saves it -//! into the shared `ComputeState`; -//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the -//! last activity requests. -//! -//! Usage example: -//! ```sh -//! zenith_ctl -D /var/db/postgres/compute \ -//! -C 'postgresql://zenith_admin@localhost/postgres' \ -//! -S /var/db/postgres/specs/current.json \ -//! -b /usr/local/bin/postgres -//! ``` -//! -use std::fs::File; -use std::panic; -use std::path::Path; -use std::process::{exit, Command, ExitStatus}; -use std::sync::{Arc, RwLock}; - -use anyhow::{Context, Result}; -use chrono::Utc; -use clap::Arg; -use log::info; -use postgres::{Client, NoTls}; - -use compute_tools::checker::create_writablity_check_data; -use compute_tools::config; -use compute_tools::http_api::launch_http_server; -use compute_tools::logger::*; -use compute_tools::monitor::launch_monitor; -use compute_tools::params::*; -use compute_tools::pg_helpers::*; -use compute_tools::spec::*; -use compute_tools::zenith::*; - -/// Do all the preparations like PGDATA directory creation, configuration, -/// safekeepers sync, basebackup, etc. -fn prepare_pgdata(state: &Arc>) -> Result<()> { - let state = state.read().unwrap(); - let spec = &state.spec; - let pgdata_path = Path::new(&state.pgdata); - let pageserver_connstr = spec - .cluster - .settings - .find("zenith.page_server_connstring") - .expect("pageserver connstr should be provided"); - let tenant = spec - .cluster - .settings - .find("zenith.zenith_tenant") - .expect("tenant id should be provided"); - let timeline = spec - .cluster - .settings - .find("zenith.zenith_timeline") - .expect("tenant id should be provided"); - - info!( - "starting cluster #{}, operation #{}", - spec.cluster.cluster_id, - spec.operation_uuid.as_ref().unwrap() - ); - - // Remove/create an empty pgdata directory and put configuration there. - create_pgdata(&state.pgdata)?; - config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?; - - info!("starting safekeepers syncing"); - let lsn = sync_safekeepers(&state.pgdata, &state.pgbin) - .with_context(|| "failed to sync safekeepers")?; - info!("safekeepers synced at LSN {}", lsn); - - info!( - "getting basebackup@{} from pageserver {}", - lsn, pageserver_connstr - ); - get_basebackup(&state.pgdata, &pageserver_connstr, &tenant, &timeline, &lsn).with_context( - || { - format!( - "failed to get basebackup@{} from pageserver {}", - lsn, pageserver_connstr - ) - }, - )?; - - // Update pg_hba.conf received with basebackup. - update_pg_hba(pgdata_path)?; - - Ok(()) -} - -/// Start Postgres as a child process and manage DBs/roles. -/// After that this will hang waiting on the postmaster process to exit. -fn run_compute(state: &Arc>) -> Result { - let read_state = state.read().unwrap(); - let pgdata_path = Path::new(&read_state.pgdata); - - // Run postgres as a child process. - let mut pg = Command::new(&read_state.pgbin) - .args(&["-D", &read_state.pgdata]) - .spawn() - .expect("cannot start postgres process"); - - // Try default Postgres port if it is not provided - let port = read_state - .spec - .cluster - .settings - .find("port") - .unwrap_or_else(|| "5432".to_string()); - wait_for_postgres(&port, pgdata_path)?; - - let mut client = Client::connect(&read_state.connstr, NoTls)?; - - handle_roles(&read_state.spec, &mut client)?; - handle_databases(&read_state.spec, &mut client)?; - handle_grants(&read_state.spec, &mut client)?; - create_writablity_check_data(&mut client)?; - - // 'Close' connection - drop(client); - - info!( - "finished configuration of cluster #{}", - read_state.spec.cluster.cluster_id - ); - - // Release the read lock. - drop(read_state); - - // Get the write lock, update state and release the lock, so HTTP API - // was able to serve requests, while we are blocked waiting on - // Postgres. - let mut state = state.write().unwrap(); - state.ready = true; - drop(state); - - // Wait for child postgres process basically forever. In this state Ctrl+C - // will be propagated to postgres and it will be shut down as well. - let ecode = pg.wait().expect("failed to wait on postgres"); - - Ok(ecode) -} - -fn main() -> Result<()> { - // TODO: re-use `utils::logging` later - init_logger(DEFAULT_LOG_LEVEL)?; - - // Env variable is set by `cargo` - let version: Option<&str> = option_env!("CARGO_PKG_VERSION"); - let matches = clap::App::new("zenith_ctl") - .version(version.unwrap_or("unknown")) - .arg( - Arg::new("connstr") - .short('C') - .long("connstr") - .value_name("DATABASE_URL") - .required(true), - ) - .arg( - Arg::new("pgdata") - .short('D') - .long("pgdata") - .value_name("DATADIR") - .required(true), - ) - .arg( - Arg::new("pgbin") - .short('b') - .long("pgbin") - .value_name("POSTGRES_PATH"), - ) - .arg( - Arg::new("spec") - .short('s') - .long("spec") - .value_name("SPEC_JSON"), - ) - .arg( - Arg::new("spec-path") - .short('S') - .long("spec-path") - .value_name("SPEC_PATH"), - ) - .get_matches(); - - let pgdata = matches.value_of("pgdata").expect("PGDATA path is required"); - let connstr = matches - .value_of("connstr") - .expect("Postgres connection string is required"); - let spec = matches.value_of("spec"); - let spec_path = matches.value_of("spec-path"); - - // Try to use just 'postgres' if no path is provided - let pgbin = matches.value_of("pgbin").unwrap_or("postgres"); - - let spec: ClusterSpec = match spec { - // First, try to get cluster spec from the cli argument - Some(json) => serde_json::from_str(json)?, - None => { - // Second, try to read it from the file if path is provided - if let Some(sp) = spec_path { - let path = Path::new(sp); - let file = File::open(path)?; - serde_json::from_reader(file)? - } else { - panic!("cluster spec should be provided via --spec or --spec-path argument"); - } - } - }; - - let compute_state = ComputeState { - connstr: connstr.to_string(), - pgdata: pgdata.to_string(), - pgbin: pgbin.to_string(), - spec, - ready: false, - last_active: Utc::now(), - }; - let compute_state = Arc::new(RwLock::new(compute_state)); - - // Launch service threads first, so we were able to serve availability - // requests, while configuration is still in progress. - let mut _threads = vec![ - launch_http_server(&compute_state).expect("cannot launch compute monitor thread"), - launch_monitor(&compute_state).expect("cannot launch http endpoint thread"), - ]; - - prepare_pgdata(&compute_state)?; - - // Run compute (Postgres) and hang waiting on it. Panic if any error happens, - // it will help us to trigger unwind and kill postmaster as well. - match run_compute(&compute_state) { - Ok(ec) => exit(ec.success() as i32), - Err(error) => panic!("cannot start compute node, error: {}", error), - } -} diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index 63da6ea23e..dbb70a74cf 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,11 +1,11 @@ -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use anyhow::{anyhow, Result}; use log::error; use postgres::Client; use tokio_postgres::NoTls; -use crate::zenith::ComputeState; +use crate::compute::ComputeNode; pub fn create_writablity_check_data(client: &mut Client) -> Result<()> { let query = " @@ -23,9 +23,9 @@ pub fn create_writablity_check_data(client: &mut Client) -> Result<()> { Ok(()) } -pub async fn check_writability(state: &Arc>) -> Result<()> { - let connstr = state.read().unwrap().connstr.clone(); - let (client, connection) = tokio_postgres::connect(&connstr, NoTls).await?; +pub async fn check_writability(compute: &Arc) -> Result<()> { + let connstr = &compute.connstr; + let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?; if client.is_closed() { return Err(anyhow!("connection to postgres closed")); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs new file mode 100644 index 0000000000..fd60b80305 --- /dev/null +++ b/compute_tools/src/compute.rs @@ -0,0 +1,321 @@ +// +// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`, +// but there are several things that makes `PostgresNode` usage inconvenient in the +// cloud: +// - it inherits from `LocalEnv`, which contains **all-all** the information about +// a complete service running +// - it uses `PageServerNode` with information about http endpoint, which we do not +// need in the cloud again +// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud +// +// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required +// attributes (not required for the cloud). Yet, it is still tempting to unify these +// `PostgresNode` and `ComputeNode` and use one in both places. +// +// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`. +// +use std::fs; +use std::os::unix::fs::PermissionsExt; +use std::path::Path; +use std::process::{Command, ExitStatus, Stdio}; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::RwLock; + +use anyhow::{Context, Result}; +use chrono::{DateTime, Utc}; +use log::info; +use postgres::{Client, NoTls}; +use serde::{Serialize, Serializer}; + +use crate::checker::create_writablity_check_data; +use crate::config; +use crate::pg_helpers::*; +use crate::spec::*; + +/// Compute node info shared across several `compute_ctl` threads. +pub struct ComputeNode { + pub start_time: DateTime, + pub connstr: String, + pub pgdata: String, + pub pgbin: String, + pub spec: ComputeSpec, + pub tenant: String, + pub timeline: String, + pub pageserver_connstr: String, + pub metrics: ComputeMetrics, + /// Volatile part of the `ComputeNode` so should be used under `RwLock` + /// to allow HTTP API server to serve status requests, while configuration + /// is in progress. + pub state: RwLock, +} + +fn rfc3339_serialize(x: &DateTime, s: S) -> Result +where + S: Serializer, +{ + x.to_rfc3339().serialize(s) +} + +#[derive(Serialize)] +#[serde(rename_all = "snake_case")] +pub struct ComputeState { + pub status: ComputeStatus, + /// Timestamp of the last Postgres activity + #[serde(serialize_with = "rfc3339_serialize")] + pub last_active: DateTime, + pub error: Option, +} + +impl ComputeState { + pub fn new() -> Self { + Self { + status: ComputeStatus::Init, + last_active: Utc::now(), + error: None, + } + } +} + +impl Default for ComputeState { + fn default() -> Self { + Self::new() + } +} + +#[derive(Serialize, Clone, Copy, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputeStatus { + Init, + Running, + Failed, +} + +#[derive(Serialize)] +pub struct ComputeMetrics { + pub sync_safekeepers_ms: AtomicU64, + pub basebackup_ms: AtomicU64, + pub config_ms: AtomicU64, + pub total_startup_ms: AtomicU64, +} + +impl ComputeMetrics { + pub fn new() -> Self { + Self { + sync_safekeepers_ms: AtomicU64::new(0), + basebackup_ms: AtomicU64::new(0), + config_ms: AtomicU64::new(0), + total_startup_ms: AtomicU64::new(0), + } + } +} + +impl Default for ComputeMetrics { + fn default() -> Self { + Self::new() + } +} + +impl ComputeNode { + pub fn set_status(&self, status: ComputeStatus) { + self.state.write().unwrap().status = status; + } + + pub fn get_status(&self) -> ComputeStatus { + self.state.read().unwrap().status + } + + // Remove `pgdata` directory and create it again with right permissions. + fn create_pgdata(&self) -> Result<()> { + // Ignore removal error, likely it is a 'No such file or directory (os error 2)'. + // If it is something different then create_dir() will error out anyway. + let _ok = fs::remove_dir_all(&self.pgdata); + fs::create_dir(&self.pgdata)?; + fs::set_permissions(&self.pgdata, fs::Permissions::from_mode(0o700))?; + + Ok(()) + } + + // Get basebackup from the libpq connection to pageserver using `connstr` and + // unarchive it to `pgdata` directory overriding all its previous content. + fn get_basebackup(&self, lsn: &str) -> Result<()> { + let start_time = Utc::now(); + + let mut client = Client::connect(&self.pageserver_connstr, NoTls)?; + let basebackup_cmd = match lsn { + "0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute + _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn), + }; + let copyreader = client.copy_out(basebackup_cmd.as_str())?; + + // Read the archive directly from the `CopyOutReader` + // + // Set `ignore_zeros` so that unpack() reads all the Copy data and + // doesn't stop at the end-of-archive marker. Otherwise, if the server + // sends an Error after finishing the tarball, we will not notice it. + let mut ar = tar::Archive::new(copyreader); + ar.set_ignore_zeros(true); + ar.unpack(&self.pgdata)?; + + self.metrics.basebackup_ms.store( + Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64, + Ordering::Relaxed, + ); + + Ok(()) + } + + // Run `postgres` in a special mode with `--sync-safekeepers` argument + // and return the reported LSN back to the caller. + fn sync_safekeepers(&self) -> Result { + let start_time = Utc::now(); + + let sync_handle = Command::new(&self.pgbin) + .args(&["--sync-safekeepers"]) + .env("PGDATA", &self.pgdata) // we cannot use -D in this mode + .stdout(Stdio::piped()) + .spawn() + .expect("postgres --sync-safekeepers failed to start"); + + // `postgres --sync-safekeepers` will print all log output to stderr and + // final LSN to stdout. So we pipe only stdout, while stderr will be automatically + // redirected to the caller output. + let sync_output = sync_handle + .wait_with_output() + .expect("postgres --sync-safekeepers failed"); + if !sync_output.status.success() { + anyhow::bail!( + "postgres --sync-safekeepers exited with non-zero status: {}", + sync_output.status, + ); + } + + self.metrics.sync_safekeepers_ms.store( + Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64, + Ordering::Relaxed, + ); + + let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim()); + + Ok(lsn) + } + + /// Do all the preparations like PGDATA directory creation, configuration, + /// safekeepers sync, basebackup, etc. + pub fn prepare_pgdata(&self) -> Result<()> { + let spec = &self.spec; + let pgdata_path = Path::new(&self.pgdata); + + // Remove/create an empty pgdata directory and put configuration there. + self.create_pgdata()?; + config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?; + + info!("starting safekeepers syncing"); + let lsn = self + .sync_safekeepers() + .with_context(|| "failed to sync safekeepers")?; + info!("safekeepers synced at LSN {}", lsn); + + info!( + "getting basebackup@{} from pageserver {}", + lsn, &self.pageserver_connstr + ); + self.get_basebackup(&lsn).with_context(|| { + format!( + "failed to get basebackup@{} from pageserver {}", + lsn, &self.pageserver_connstr + ) + })?; + + // Update pg_hba.conf received with basebackup. + update_pg_hba(pgdata_path)?; + + Ok(()) + } + + /// Start Postgres as a child process and manage DBs/roles. + /// After that this will hang waiting on the postmaster process to exit. + pub fn run(&self) -> Result { + let start_time = Utc::now(); + + let pgdata_path = Path::new(&self.pgdata); + + // Run postgres as a child process. + let mut pg = Command::new(&self.pgbin) + .args(&["-D", &self.pgdata]) + .spawn() + .expect("cannot start postgres process"); + + // Try default Postgres port if it is not provided + let port = self + .spec + .cluster + .settings + .find("port") + .unwrap_or_else(|| "5432".to_string()); + wait_for_postgres(&mut pg, &port, pgdata_path)?; + + let mut client = Client::connect(&self.connstr, NoTls)?; + + handle_roles(&self.spec, &mut client)?; + handle_databases(&self.spec, &mut client)?; + handle_grants(&self.spec, &mut client)?; + create_writablity_check_data(&mut client)?; + + // 'Close' connection + drop(client); + let startup_end_time = Utc::now(); + + self.metrics.config_ms.store( + startup_end_time + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64, + Ordering::Relaxed, + ); + self.metrics.total_startup_ms.store( + startup_end_time + .signed_duration_since(self.start_time) + .to_std() + .unwrap() + .as_millis() as u64, + Ordering::Relaxed, + ); + + self.set_status(ComputeStatus::Running); + + info!( + "finished configuration of compute for project {}", + self.spec.cluster.cluster_id + ); + + // Wait for child Postgres process basically forever. In this state Ctrl+C + // will propagate to Postgres and it will be shut down as well. + let ecode = pg + .wait() + .expect("failed to start waiting on Postgres process"); + + Ok(ecode) + } + + pub fn prepare_and_run(&self) -> Result { + info!( + "starting compute for project {}, operation {}, tenant {}, timeline {}", + self.spec.cluster.cluster_id, + self.spec.operation_uuid.as_ref().unwrap(), + self.tenant, + self.timeline, + ); + + self.prepare_pgdata()?; + self.run() + } +} diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 22134db0f8..6cbd0e3d4c 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,7 +6,7 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::PgOptionsSerialize; -use crate::zenith::ClusterSpec; +use crate::spec::ComputeSpec; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -32,20 +32,20 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { } /// Create or completely rewrite configuration file specified by `path` -pub fn write_postgres_conf(path: &Path, spec: &ClusterSpec) -> Result<()> { +pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { // File::create() destroys the file content if it exists. let mut postgres_conf = File::create(path)?; - write_zenith_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?; + write_auto_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?; Ok(()) } // Write Postgres config block wrapped with generated comment section -fn write_zenith_managed_block(file: &mut File, buf: &str) -> Result<()> { - writeln!(file, "# Managed by Zenith: begin")?; +fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> { + writeln!(file, "# Managed by compute_ctl: begin")?; writeln!(file, "{}", buf)?; - writeln!(file, "# Managed by Zenith: end")?; + writeln!(file, "# Managed by compute_ctl: end")?; Ok(()) } diff --git a/compute_tools/src/http_api.rs b/compute_tools/src/http/api.rs similarity index 56% rename from compute_tools/src/http_api.rs rename to compute_tools/src/http/api.rs index 7e1a876044..4c8bbc608b 100644 --- a/compute_tools/src/http_api.rs +++ b/compute_tools/src/http/api.rs @@ -1,37 +1,64 @@ use std::convert::Infallible; use std::net::SocketAddr; -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use std::thread; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; use log::{error, info}; +use serde_json; -use crate::zenith::*; +use crate::compute::{ComputeNode, ComputeStatus}; // Service function to handle all available routes. -async fn routes(req: Request, state: Arc>) -> Response { +async fn routes(req: Request, compute: Arc) -> Response { match (req.method(), req.uri().path()) { // Timestamp of the last Postgres activity in the plain text. + // DEPRECATED in favour of /status (&Method::GET, "/last_activity") => { info!("serving /last_active GET request"); - let state = state.read().unwrap(); + let state = compute.state.read().unwrap(); // Use RFC3339 format for consistency. Response::new(Body::from(state.last_active.to_rfc3339())) } - // Has compute setup process finished? -> true/false + // Has compute setup process finished? -> true/false. + // DEPRECATED in favour of /status (&Method::GET, "/ready") => { info!("serving /ready GET request"); - let state = state.read().unwrap(); - Response::new(Body::from(format!("{}", state.ready))) + let status = compute.get_status(); + Response::new(Body::from(format!("{}", status == ComputeStatus::Running))) } + // Serialized compute state. + (&Method::GET, "/status") => { + info!("serving /status GET request"); + let state = compute.state.read().unwrap(); + Response::new(Body::from(serde_json::to_string(&*state).unwrap())) + } + + // Startup metrics in JSON format. Keep /metrics reserved for a possible + // future use for Prometheus metrics format. + (&Method::GET, "/metrics.json") => { + info!("serving /metrics.json GET request"); + Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap())) + } + + // DEPRECATED, use POST instead (&Method::GET, "/check_writability") => { info!("serving /check_writability GET request"); - let res = crate::checker::check_writability(&state).await; + let res = crate::checker::check_writability(&compute).await; + match res { + Ok(_) => Response::new(Body::from("true")), + Err(e) => Response::new(Body::from(e.to_string())), + } + } + + (&Method::POST, "/check_writability") => { + info!("serving /check_writability POST request"); + let res = crate::checker::check_writability(&compute).await; match res { Ok(_) => Response::new(Body::from("true")), Err(e) => Response::new(Body::from(e.to_string())), @@ -49,7 +76,7 @@ async fn routes(req: Request, state: Arc>) -> Respons // Main Hyper HTTP server function that runs it and blocks waiting on it forever. #[tokio::main] -async fn serve(state: Arc>) { +async fn serve(state: Arc) { let addr = SocketAddr::from(([0, 0, 0, 0], 3080)); let make_service = make_service_fn(move |_conn| { @@ -73,7 +100,7 @@ async fn serve(state: Arc>) { } /// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`. -pub fn launch_http_server(state: &Arc>) -> Result> { +pub fn launch_http_server(state: &Arc) -> Result> { let state = Arc::clone(state); Ok(thread::Builder::new() diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs new file mode 100644 index 0000000000..e5fdf85eed --- /dev/null +++ b/compute_tools/src/http/mod.rs @@ -0,0 +1 @@ +pub mod api; diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml new file mode 100644 index 0000000000..9c0f8e3ccd --- /dev/null +++ b/compute_tools/src/http/openapi_spec.yaml @@ -0,0 +1,158 @@ +openapi: "3.0.2" +info: + title: Compute node control API + version: "1.0" + +servers: + - url: "http://localhost:3080" + +paths: + /status: + get: + tags: + - "info" + summary: Get compute node internal status + description: "" + operationId: getComputeStatus + responses: + "200": + description: ComputeState + content: + application/json: + schema: + $ref: "#/components/schemas/ComputeState" + + /metrics.json: + get: + tags: + - "info" + summary: Get compute node startup metrics in JSON format + description: "" + operationId: getComputeMetricsJSON + responses: + "200": + description: ComputeMetrics + content: + application/json: + schema: + $ref: "#/components/schemas/ComputeMetrics" + + /ready: + get: + deprecated: true + tags: + - "info" + summary: Check whether compute startup process finished successfully + description: "" + operationId: computeIsReady + responses: + "200": + description: Compute is ready ('true') or not ('false') + content: + text/plain: + schema: + type: string + example: "true" + + /last_activity: + get: + deprecated: true + tags: + - "info" + summary: Get timestamp of the last compute activity + description: "" + operationId: getLastComputeActivityTS + responses: + "200": + description: Timestamp of the last compute activity + content: + text/plain: + schema: + type: string + example: "2022-10-12T07:20:50.52Z" + + /check_writability: + get: + deprecated: true + tags: + - "check" + summary: Check that we can write new data on this compute + description: "" + operationId: checkComputeWritabilityDeprecated + responses: + "200": + description: Check result + content: + text/plain: + schema: + type: string + description: Error text or 'true' if check passed + example: "true" + + post: + tags: + - "check" + summary: Check that we can write new data on this compute + description: "" + operationId: checkComputeWritability + responses: + "200": + description: Check result + content: + text/plain: + schema: + type: string + description: Error text or 'true' if check passed + example: "true" + +components: + securitySchemes: + JWT: + type: http + scheme: bearer + bearerFormat: JWT + + schemas: + ComputeMetrics: + type: object + description: Compute startup metrics + required: + - sync_safekeepers_ms + - basebackup_ms + - config_ms + - total_startup_ms + properties: + sync_safekeepers_ms: + type: integer + basebackup_ms: + type: integer + config_ms: + type: integer + total_startup_ms: + type: integer + + ComputeState: + type: object + required: + - status + - last_active + properties: + status: + $ref: '#/components/schemas/ComputeStatus' + last_active: + type: string + description: The last detected compute activity timestamp in UTC and RFC3339 format + example: "2022-10-12T07:20:50.52Z" + error: + type: string + description: Text of the error during compute startup, if any + + ComputeStatus: + type: string + enum: + - init + - failed + - running + +security: + - JWT: [] diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index ffb9700a49..aee6b53e6a 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -4,11 +4,11 @@ //! pub mod checker; pub mod config; -pub mod http_api; +pub mod http; #[macro_use] pub mod logger; +pub mod compute; pub mod monitor; pub mod params; pub mod pg_helpers; pub mod spec; -pub mod zenith; diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 596981b2d2..496a5aae3b 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -1,4 +1,4 @@ -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use std::{thread, time}; use anyhow::Result; @@ -6,16 +6,16 @@ use chrono::{DateTime, Utc}; use log::{debug, info}; use postgres::{Client, NoTls}; -use crate::zenith::ComputeState; +use crate::compute::ComputeNode; const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds // Spin in a loop and figure out the last activity time in the Postgres. // Then update it in the shared state. This function never errors out. // XXX: the only expected panic is at `RwLock` unwrap(). -fn watch_compute_activity(state: &Arc>) { +fn watch_compute_activity(compute: &Arc) { // Suppose that `connstr` doesn't change - let connstr = state.read().unwrap().connstr.clone(); + let connstr = compute.connstr.clone(); // Define `client` outside of the loop to reuse existing connection if it's active. let mut client = Client::connect(&connstr, NoTls); let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL); @@ -46,7 +46,7 @@ fn watch_compute_activity(state: &Arc>) { AND usename != 'zenith_admin';", // XXX: find a better way to filter other monitors? &[], ); - let mut last_active = state.read().unwrap().last_active; + let mut last_active = compute.state.read().unwrap().last_active; if let Ok(backs) = backends { let mut idle_backs: Vec> = vec![]; @@ -83,14 +83,14 @@ fn watch_compute_activity(state: &Arc>) { } // Update the last activity in the shared state if we got a more recent one. - let mut state = state.write().unwrap(); + let mut state = compute.state.write().unwrap(); if last_active > state.last_active { state.last_active = last_active; debug!("set the last compute activity time to: {}", last_active); } } Err(e) => { - info!("cannot connect to postgres: {}, retrying", e); + debug!("cannot connect to postgres: {}, retrying", e); // Establish a new connection and try again. client = Client::connect(&connstr, NoTls); @@ -100,7 +100,7 @@ fn watch_compute_activity(state: &Arc>) { } /// Launch a separate compute monitor thread and return its `JoinHandle`. -pub fn launch_monitor(state: &Arc>) -> Result> { +pub fn launch_monitor(state: &Arc) -> Result> { let state = Arc::clone(state); Ok(thread::Builder::new() diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 1409a81b6b..74856eac63 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -1,7 +1,9 @@ +use std::fs::File; +use std::io::{BufRead, BufReader}; use std::net::{SocketAddr, TcpStream}; use std::os::unix::fs::PermissionsExt; use std::path::Path; -use std::process::Command; +use std::process::Child; use std::str::FromStr; use std::{fs, thread, time}; @@ -220,12 +222,12 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { /// Wait for Postgres to become ready to accept connections: /// - state should be `ready` in the `pgdata/postmaster.pid` /// - and we should be able to connect to 127.0.0.1:5432 -pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> { +pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); let mut slept: u64 = 0; // ms let pause = time::Duration::from_millis(100); - let timeout = time::Duration::from_millis(200); + let timeout = time::Duration::from_millis(10); let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap(); loop { @@ -236,14 +238,19 @@ pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> { bail!("timed out while waiting for Postgres to start"); } + if let Ok(Some(status)) = pg.try_wait() { + // Postgres exited, that is not what we expected, bail out earlier. + let code = status.code().unwrap_or(-1); + bail!("Postgres exited unexpectedly with code {}", code); + } + if pid_path.exists() { - // XXX: dumb and the simplest way to get the last line in a text file - // TODO: better use `.lines().last()` later - let stdout = Command::new("tail") - .args(&["-n1", pid_path.to_str().unwrap()]) - .output()? - .stdout; - let status = String::from_utf8(stdout)?; + let file = BufReader::new(File::open(&pid_path)?); + let status = file + .lines() + .last() + .unwrap() + .unwrap_or_else(|_| "unknown".to_string()); let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok(); // Now Postgres is ready to accept connections diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 334e0a9e05..e88df56a65 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -3,16 +3,53 @@ use std::path::Path; use anyhow::Result; use log::{info, log_enabled, warn, Level}; use postgres::Client; +use serde::Deserialize; use crate::config; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; -use crate::zenith::ClusterSpec; + +/// Cluster spec or configuration represented as an optional number of +/// delta operations + final cluster state description. +#[derive(Clone, Deserialize)] +pub struct ComputeSpec { + pub format_version: f32, + pub timestamp: String, + pub operation_uuid: Option, + /// Expected cluster state at the end of transition process. + pub cluster: Cluster, + pub delta_operations: Option>, +} + +/// Cluster state seen from the perspective of the external tools +/// like Rails web console. +#[derive(Clone, Deserialize)] +pub struct Cluster { + pub cluster_id: String, + pub name: String, + pub state: Option, + pub roles: Vec, + pub databases: Vec, + pub settings: GenericOptions, +} + +/// Single cluster state changing operation that could not be represented as +/// a static `Cluster` structure. For example: +/// - DROP DATABASE +/// - DROP ROLE +/// - ALTER ROLE name RENAME TO new_name +/// - ALTER DATABASE name RENAME TO new_name +#[derive(Clone, Deserialize)] +pub struct DeltaOp { + pub action: String, + pub name: PgIdent, + pub new_name: Option, +} /// It takes cluster specification and does the following: /// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file. /// - Update `pg_hba.conf` to allow external connections. -pub fn handle_configuration(spec: &ClusterSpec, pgdata_path: &Path) -> Result<()> { +pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> { // File `postgresql.conf` is no longer included into `basebackup`, so just // always write all config into it creating new file. config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?; @@ -39,7 +76,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { /// Given a cluster spec json and open transaction it handles roles creation, /// deletion and update. -pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> { +pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let mut xact = client.transaction()?; let existing_roles: Vec = get_existing_roles(&mut xact)?; @@ -165,7 +202,7 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> { /// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level /// atomicity should be enough here due to the order of operations and various checks, /// which together provide us idempotency. -pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { +pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let existing_dbs: Vec = get_existing_dbs(client)?; // Print a list of existing Postgres databases (only in debug mode) @@ -254,7 +291,7 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { // Grant CREATE ON DATABASE to the database owner // to allow clients create trusted extensions. -pub fn handle_grants(spec: &ClusterSpec, client: &mut Client) -> Result<()> { +pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> { info!("cluster spec grants:"); for db in &spec.cluster.databases { diff --git a/compute_tools/src/zenith.rs b/compute_tools/src/zenith.rs deleted file mode 100644 index ba7dc20787..0000000000 --- a/compute_tools/src/zenith.rs +++ /dev/null @@ -1,109 +0,0 @@ -use std::process::{Command, Stdio}; - -use anyhow::Result; -use chrono::{DateTime, Utc}; -use postgres::{Client, NoTls}; -use serde::Deserialize; - -use crate::pg_helpers::*; - -/// Compute node state shared across several `zenith_ctl` threads. -/// Should be used under `RwLock` to allow HTTP API server to serve -/// status requests, while configuration is in progress. -pub struct ComputeState { - pub connstr: String, - pub pgdata: String, - pub pgbin: String, - pub spec: ClusterSpec, - /// Compute setup process has finished - pub ready: bool, - /// Timestamp of the last Postgres activity - pub last_active: DateTime, -} - -/// Cluster spec or configuration represented as an optional number of -/// delta operations + final cluster state description. -#[derive(Clone, Deserialize)] -pub struct ClusterSpec { - pub format_version: f32, - pub timestamp: String, - pub operation_uuid: Option, - /// Expected cluster state at the end of transition process. - pub cluster: Cluster, - pub delta_operations: Option>, -} - -/// Cluster state seen from the perspective of the external tools -/// like Rails web console. -#[derive(Clone, Deserialize)] -pub struct Cluster { - pub cluster_id: String, - pub name: String, - pub state: Option, - pub roles: Vec, - pub databases: Vec, - pub settings: GenericOptions, -} - -/// Single cluster state changing operation that could not be represented as -/// a static `Cluster` structure. For example: -/// - DROP DATABASE -/// - DROP ROLE -/// - ALTER ROLE name RENAME TO new_name -/// - ALTER DATABASE name RENAME TO new_name -#[derive(Clone, Deserialize)] -pub struct DeltaOp { - pub action: String, - pub name: PgIdent, - pub new_name: Option, -} - -/// Get basebackup from the libpq connection to pageserver using `connstr` and -/// unarchive it to `pgdata` directory overriding all its previous content. -pub fn get_basebackup( - pgdata: &str, - connstr: &str, - tenant: &str, - timeline: &str, - lsn: &str, -) -> Result<()> { - let mut client = Client::connect(connstr, NoTls)?; - let basebackup_cmd = match lsn { - "0/0" => format!("basebackup {} {}", tenant, timeline), // First start of the compute - _ => format!("basebackup {} {} {}", tenant, timeline, lsn), - }; - let copyreader = client.copy_out(basebackup_cmd.as_str())?; - let mut ar = tar::Archive::new(copyreader); - - ar.unpack(&pgdata)?; - - Ok(()) -} - -/// Run `postgres` in a special mode with `--sync-safekeepers` argument -/// and return the reported LSN back to the caller. -pub fn sync_safekeepers(pgdata: &str, pgbin: &str) -> Result { - let sync_handle = Command::new(&pgbin) - .args(&["--sync-safekeepers"]) - .env("PGDATA", &pgdata) // we cannot use -D in this mode - .stdout(Stdio::piped()) - .spawn() - .expect("postgres --sync-safekeepers failed to start"); - - // `postgres --sync-safekeepers` will print all log output to stderr and - // final LSN to stdout. So we pipe only stdout, while stderr will be automatically - // redirected to the caller output. - let sync_output = sync_handle - .wait_with_output() - .expect("postgres --sync-safekeepers failed"); - if !sync_output.status.success() { - anyhow::bail!( - "postgres --sync-safekeepers exited with non-zero status: {}", - sync_output.status, - ); - } - - let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim()); - - Ok(lsn) -} diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 472a49af4b..33f903f0e1 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -4,12 +4,12 @@ mod pg_helpers_tests { use std::fs::File; use compute_tools::pg_helpers::*; - use compute_tools::zenith::ClusterSpec; + use compute_tools::spec::ComputeSpec; #[test] fn params_serialize() { let file = File::open("tests/cluster_spec.json").unwrap(); - let spec: ClusterSpec = serde_json::from_reader(file).unwrap(); + let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( spec.cluster.databases.first().unwrap().to_pg_options(), @@ -24,7 +24,7 @@ mod pg_helpers_tests { #[test] fn settings_serialize() { let file = File::open("tests/cluster_spec.json").unwrap(); - let spec: ClusterSpec = serde_json::from_reader(file).unwrap(); + let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( spec.cluster.settings.as_pg_settings(), diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 41417aab9a..21311eea9a 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -tar = "0.4.33" +tar = "0.4.38" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } serde = { version = "1.0", features = ["derive"] } serde_with = "1.12.0" diff --git a/control_plane/simple.conf b/control_plane/simple.conf index 2243a0a5f8..925e2f14ee 100644 --- a/control_plane/simple.conf +++ b/control_plane/simple.conf @@ -9,3 +9,6 @@ auth_type = 'Trust' id = 1 pg_port = 5454 http_port = 7676 + +[etcd_broker] +broker_endpoints = ['http://127.0.0.1:2379'] diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 92d0e080d8..045acd7519 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -231,8 +231,13 @@ impl PostgresNode { .context("page server 'basebackup' command failed")?; // Read the archive directly from the `CopyOutReader` - tar::Archive::new(copyreader) - .unpack(&self.pgdata()) + // + // Set `ignore_zeros` so that unpack() reads all the Copy data and + // doesn't stop at the end-of-archive marker. Otherwise, if the server + // sends an Error after finishing the tarball, we will not notice it. + let mut ar = tar::Archive::new(copyreader); + ar.set_ignore_zeros(true); + ar.unpack(&self.pgdata()) .context("extracting base backup failed")?; Ok(()) @@ -274,6 +279,8 @@ impl PostgresNode { conf.append("listen_addresses", &self.address.ip().to_string()); conf.append("port", &self.address.port().to_string()); conf.append("wal_keep_size", "0"); + // walproposer panics when basebackup is invalid, it is pointless to restart in this case. + conf.append("restart_after_crash", "off"); // Configure the node to fetch pages from pageserver let pageserver_connstr = { diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs new file mode 100644 index 0000000000..bc39b7dea3 --- /dev/null +++ b/control_plane/src/etcd.rs @@ -0,0 +1,97 @@ +use std::{ + fs, + path::PathBuf, + process::{Command, Stdio}, +}; + +use anyhow::Context; +use nix::{ + sys::signal::{kill, Signal}, + unistd::Pid, +}; + +use crate::{local_env, read_pidfile}; + +pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { + let etcd_broker = &env.etcd_broker; + println!( + "Starting etcd broker using {}", + etcd_broker.etcd_binary_path.display() + ); + + let etcd_data_dir = env.base_data_dir.join("etcd"); + fs::create_dir_all(&etcd_data_dir).with_context(|| { + format!( + "Failed to create etcd data dir: {}", + etcd_data_dir.display() + ) + })?; + + let etcd_stdout_file = + fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| { + format!( + "Failed to create ectd stout file in directory {}", + etcd_data_dir.display() + ) + })?; + let etcd_stderr_file = + fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| { + format!( + "Failed to create ectd stderr file in directory {}", + etcd_data_dir.display() + ) + })?; + let client_urls = etcd_broker.comma_separated_endpoints(); + + let etcd_process = Command::new(&etcd_broker.etcd_binary_path) + .args(&[ + format!("--data-dir={}", etcd_data_dir.display()), + format!("--listen-client-urls={client_urls}"), + format!("--advertise-client-urls={client_urls}"), + // Set --quota-backend-bytes to keep the etcd virtual memory + // size smaller. Our test etcd clusters are very small. + // See https://github.com/etcd-io/etcd/issues/7910 + "--quota-backend-bytes=100000000".to_string(), + ]) + .stdout(Stdio::from(etcd_stdout_file)) + .stderr(Stdio::from(etcd_stderr_file)) + .spawn() + .context("Failed to spawn etcd subprocess")?; + let pid = etcd_process.id(); + + let etcd_pid_file_path = etcd_pid_file_path(env); + fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| { + format!( + "Failed to create etcd pid file at {}", + etcd_pid_file_path.display() + ) + })?; + + Ok(()) +} + +pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { + let etcd_path = &env.etcd_broker.etcd_binary_path; + println!("Stopping etcd broker at {}", etcd_path.display()); + + let etcd_pid_file_path = etcd_pid_file_path(env); + let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| { + format!( + "Failed to read etcd pid filea at {}", + etcd_pid_file_path.display() + ) + })?); + + kill(pid, Signal::SIGTERM).with_context(|| { + format!( + "Failed to stop etcd with pid {pid} at {}", + etcd_pid_file_path.display() + ) + })?; + + Ok(()) +} + +fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf { + env.base_data_dir.join("etcd.pid") +} diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index a2ecdd3d64..4dfca588ad 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -12,6 +12,7 @@ use std::path::Path; use std::process::Command; pub mod compute; +pub mod etcd; pub mod local_env; pub mod postgresql_conf; pub mod safekeeper; @@ -48,3 +49,12 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { cmd } } + +fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { + for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] { + if let Ok(value) = std::env::var(env_key) { + cmd = cmd.env(env_key, value); + } + } + cmd +} diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 5aeff505b6..2623f65242 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -4,6 +4,7 @@ //! script which will use local paths. use anyhow::{bail, ensure, Context}; +use reqwest::Url; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use std::collections::HashMap; @@ -14,7 +15,7 @@ use std::process::{Command, Stdio}; use utils::{ auth::{encode_from_key_file, Claims, Scope}, postgres_backend::AuthType, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use crate::safekeeper::SafekeeperNode; @@ -59,13 +60,7 @@ pub struct LocalEnv { #[serde(default)] pub private_key_path: PathBuf, - // A comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'. - #[serde(default)] - pub broker_endpoints: Option, - - /// A prefix to all to any key when pushing/polling etcd from a node. - #[serde(default)] - pub broker_etcd_prefix: Option, + pub etcd_broker: EtcdBroker, pub pageserver: PageServerConf, @@ -81,11 +76,67 @@ pub struct LocalEnv { branch_name_mappings: HashMap>, } +/// Etcd broker config for cluster internal communication. +#[serde_as] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +pub struct EtcdBroker { + /// A prefix to all to any key when pushing/polling etcd from a node. + #[serde(default)] + pub broker_etcd_prefix: Option, + + /// Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'. + #[serde(default)] + #[serde_as(as = "Vec")] + pub broker_endpoints: Vec, + + /// Etcd binary path to use. + #[serde(default)] + pub etcd_binary_path: PathBuf, +} + +impl EtcdBroker { + pub fn locate_etcd() -> anyhow::Result { + let which_output = Command::new("which") + .arg("etcd") + .output() + .context("Failed to run 'which etcd' command")?; + let stdout = String::from_utf8_lossy(&which_output.stdout); + ensure!( + which_output.status.success(), + "'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}", + which_output.status, + String::from_utf8_lossy(&which_output.stderr) + ); + + let etcd_path = PathBuf::from(stdout.trim()); + ensure!( + etcd_path.is_file(), + "'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}", + etcd_path.display() + ); + + Ok(etcd_path) + } + + pub fn comma_separated_endpoints(&self) -> String { + self.broker_endpoints.iter().map(Url::as_str).fold( + String::new(), + |mut comma_separated_urls, url| { + if !comma_separated_urls.is_empty() { + comma_separated_urls.push(','); + } + comma_separated_urls.push_str(url); + comma_separated_urls + }, + ) + } +} + #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct PageServerConf { // node id - pub id: ZNodeId, + pub id: NodeId, // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, @@ -100,7 +151,7 @@ pub struct PageServerConf { impl Default for PageServerConf { fn default() -> Self { Self { - id: ZNodeId(0), + id: NodeId(0), listen_pg_addr: String::new(), listen_http_addr: String::new(), auth_type: AuthType::Trust, @@ -112,19 +163,23 @@ impl Default for PageServerConf { #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct SafekeeperConf { - pub id: ZNodeId, + pub id: NodeId, pub pg_port: u16, pub http_port: u16, pub sync: bool, + pub remote_storage: Option, + pub backup_threads: Option, } impl Default for SafekeeperConf { fn default() -> Self { Self { - id: ZNodeId(0), + id: NodeId(0), pg_port: 0, http_port: 0, sync: true, + remote_storage: None, + backup_threads: None, } } } @@ -184,12 +239,7 @@ impl LocalEnv { if old_timeline_id == &timeline_id { Ok(()) } else { - bail!( - "branch '{}' is already mapped to timeline {}, cannot map to another timeline {}", - branch_name, - old_timeline_id, - timeline_id - ); + bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}"); } } else { existing_values.push((tenant_id, timeline_id)); @@ -225,7 +275,7 @@ impl LocalEnv { /// /// Unlike 'load_config', this function fills in any defaults that are missing /// from the config file. - pub fn create_config(toml: &str) -> anyhow::Result { + pub fn parse_config(toml: &str) -> anyhow::Result { let mut env: LocalEnv = toml::from_str(toml)?; // Find postgres binaries. @@ -238,26 +288,11 @@ impl LocalEnv { env.pg_distrib_dir = cwd.join("tmp_install") } } - if !env.pg_distrib_dir.join("bin/postgres").exists() { - bail!( - "Can't find postgres binary at {}", - env.pg_distrib_dir.display() - ); - } // Find zenith binaries. if env.zenith_distrib_dir == Path::new("") { env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); } - for binary in ["pageserver", "safekeeper"] { - if !env.zenith_distrib_dir.join(binary).exists() { - bail!( - "Can't find binary '{}' in zenith distrib dir '{}'", - binary, - env.zenith_distrib_dir.display() - ); - } - } // If no initial tenant ID was given, generate it. if env.default_tenant_id.is_none() { @@ -346,11 +381,42 @@ impl LocalEnv { base_path != Path::new(""), "repository base path is missing" ); + ensure!( !base_path.exists(), "directory '{}' already exists. Perhaps already initialized?", base_path.display() ); + if !self.pg_distrib_dir.join("bin/postgres").exists() { + bail!( + "Can't find postgres binary at {}", + self.pg_distrib_dir.display() + ); + } + for binary in ["pageserver", "safekeeper"] { + if !self.zenith_distrib_dir.join(binary).exists() { + bail!( + "Can't find binary '{}' in zenith distrib dir '{}'", + binary, + self.zenith_distrib_dir.display() + ); + } + } + + for binary in ["pageserver", "safekeeper"] { + if !self.zenith_distrib_dir.join(binary).exists() { + bail!( + "Can't find binary '{binary}' in zenith distrib dir '{}'", + self.zenith_distrib_dir.display() + ); + } + } + if !self.pg_distrib_dir.join("bin/postgres").exists() { + bail!( + "Can't find postgres binary at {}", + self.pg_distrib_dir.display() + ); + } fs::create_dir(&base_path)?; @@ -408,7 +474,35 @@ impl LocalEnv { fn base_path() -> PathBuf { match std::env::var_os("ZENITH_REPO_DIR") { - Some(val) => PathBuf::from(val.to_str().unwrap()), - None => ".zenith".into(), + Some(val) => PathBuf::from(val), + None => PathBuf::from(".zenith"), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn simple_conf_parsing() { + let simple_conf_toml = include_str!("../simple.conf"); + let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml); + assert!( + simple_conf_parse_result.is_ok(), + "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}" + ); + + let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']"; + let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']"; + let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str); + assert!( + spoiled_url_toml.contains(spoiled_url_str), + "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}" + ); + let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml); + assert!( + spoiled_url_parse_result.is_err(), + "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}" + ); } } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 074ee72f69..972b6d48ae 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -18,12 +18,12 @@ use thiserror::Error; use utils::{ connstring::connection_address, http::error::HttpErrorBody, - zid::{ZNodeId, ZTenantId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::local_env::{LocalEnv, SafekeeperConf}; use crate::storage::PageServerNode; -use crate::{fill_rust_env_vars, read_pidfile}; +use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile}; #[derive(Error, Debug)] pub enum SafekeeperHttpError { @@ -52,7 +52,7 @@ impl ResponseErrorMessageExt for Response { Err(SafekeeperHttpError::Response( match self.json::() { Ok(err_body) => format!("Error: {}", err_body.msg), - Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), + Err(_) => format!("Http error ({}) at {url}.", status.as_u16()), }, )) } @@ -65,7 +65,7 @@ impl ResponseErrorMessageExt for Response { // #[derive(Debug)] pub struct SafekeeperNode { - pub id: ZNodeId, + pub id: NodeId, pub conf: SafekeeperConf, @@ -75,17 +75,12 @@ pub struct SafekeeperNode { pub http_base_url: String, pub pageserver: Arc, - - broker_endpoints: Option, - broker_etcd_prefix: Option, } impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { let pageserver = Arc::new(PageServerNode::from_env(env)); - println!("initializing for sk {} for {}", conf.id, conf.http_port); - SafekeeperNode { id: conf.id, conf: conf.clone(), @@ -94,8 +89,6 @@ impl SafekeeperNode { http_client: Client::new(), http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), pageserver, - broker_endpoints: env.broker_endpoints.clone(), - broker_etcd_prefix: env.broker_etcd_prefix.clone(), } } @@ -107,7 +100,7 @@ impl SafekeeperNode { .unwrap() } - pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf { + pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref()) } @@ -142,12 +135,22 @@ impl SafekeeperNode { if !self.conf.sync { cmd.arg("--no-sync"); } - if let Some(ref ep) = self.broker_endpoints { - cmd.args(&["--broker-endpoints", ep]); + + let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints(); + if !comma_separated_endpoints.is_empty() { + cmd.args(&["--broker-endpoints", &comma_separated_endpoints]); } - if let Some(prefix) = self.broker_etcd_prefix.as_deref() { + if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() { cmd.args(&["--broker-etcd-prefix", prefix]); } + if let Some(threads) = self.conf.backup_threads { + cmd.args(&["--backup-threads", threads.to_string().as_ref()]); + } + if let Some(ref remote_storage) = self.conf.remote_storage { + cmd.args(&["--remote-storage", remote_storage]); + } + + fill_aws_secrets_vars(&mut cmd); if !cmd.status()?.success() { bail!( @@ -210,12 +213,13 @@ impl SafekeeperNode { let pid = Pid::from_raw(pid); let sig = if immediate { - println!("Stop safekeeper immediately"); + print!("Stopping safekeeper {} immediately..", self.id); Signal::SIGQUIT } else { - println!("Stop safekeeper gracefully"); + print!("Stopping safekeeper {} gracefully..", self.id); Signal::SIGTERM }; + io::stdout().flush().unwrap(); match kill(pid, sig) { Ok(_) => (), Err(Errno::ESRCH) => { @@ -237,25 +241,35 @@ impl SafekeeperNode { // TODO Remove this "timeout" and handle it on caller side instead. // Shutting down may take a long time, // if safekeeper flushes a lot of data + let mut tcp_stopped = false; for _ in 0..100 { - if let Err(_e) = TcpStream::connect(&address) { - println!("Safekeeper stopped receiving connections"); - - //Now check status - match self.check_status() { - Ok(_) => { - println!("Safekeeper status is OK. Wait a bit."); - thread::sleep(Duration::from_secs(1)); - } - Err(err) => { - println!("Safekeeper status is: {}", err); - return Ok(()); + if !tcp_stopped { + if let Err(err) = TcpStream::connect(&address) { + tcp_stopped = true; + if err.kind() != io::ErrorKind::ConnectionRefused { + eprintln!("\nSafekeeper connection failed with error: {err}"); } } - } else { - println!("Safekeeper still receives connections"); - thread::sleep(Duration::from_secs(1)); } + if tcp_stopped { + // Also check status on the HTTP port + match self.check_status() { + Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => { + println!("done!"); + return Ok(()); + } + Err(err) => { + eprintln!("\nSafekeeper status check failed with error: {err}"); + return Ok(()); + } + Ok(()) => { + // keep waiting + } + } + } + print!("."); + io::stdout().flush().unwrap(); + thread::sleep(Duration::from_secs(1)); } bail!("Failed to stop safekeeper with pid {}", pid); @@ -280,7 +294,7 @@ impl SafekeeperNode { &self, tenant_id: ZTenantId, timeline_id: ZTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result<()> { Ok(self .http_request( diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index d2e63a22de..24cdbce8f3 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -25,7 +25,7 @@ use utils::{ }; use crate::local_env::LocalEnv; -use crate::{fill_rust_env_vars, read_pidfile}; +use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile}; use pageserver::tenant_mgr::TenantInfo; #[derive(Error, Debug)] @@ -121,6 +121,16 @@ impl PageServerNode { ); let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr); + let broker_endpoints_param = format!( + "broker_endpoints=[{}]", + self.env + .etcd_broker + .broker_endpoints + .iter() + .map(|url| format!("'{url}'")) + .collect::>() + .join(",") + ); let mut args = Vec::with_capacity(20); args.push("--init"); @@ -129,8 +139,19 @@ impl PageServerNode { args.extend(["-c", &authg_type_param]); args.extend(["-c", &listen_http_addr_param]); args.extend(["-c", &listen_pg_addr_param]); + args.extend(["-c", &broker_endpoints_param]); args.extend(["-c", &id]); + let broker_etcd_prefix_param = self + .env + .etcd_broker + .broker_etcd_prefix + .as_ref() + .map(|prefix| format!("broker_etcd_prefix='{prefix}'")); + if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() { + args.extend(["-c", broker_etcd_prefix_param]); + } + for config_override in config_overrides { args.extend(["-c", config_override]); } @@ -260,12 +281,13 @@ impl PageServerNode { let pid = Pid::from_raw(read_pidfile(&pid_file)?); let sig = if immediate { - println!("Stop pageserver immediately"); + print!("Stopping pageserver immediately.."); Signal::SIGQUIT } else { - println!("Stop pageserver gracefully"); + print!("Stopping pageserver gracefully.."); Signal::SIGTERM }; + io::stdout().flush().unwrap(); match kill(pid, sig) { Ok(_) => (), Err(Errno::ESRCH) => { @@ -287,25 +309,36 @@ impl PageServerNode { // TODO Remove this "timeout" and handle it on caller side instead. // Shutting down may take a long time, // if pageserver checkpoints a lot of data + let mut tcp_stopped = false; for _ in 0..100 { - if let Err(_e) = TcpStream::connect(&address) { - println!("Pageserver stopped receiving connections"); - - //Now check status - match self.check_status() { - Ok(_) => { - println!("Pageserver status is OK. Wait a bit."); - thread::sleep(Duration::from_secs(1)); - } - Err(err) => { - println!("Pageserver status is: {}", err); - return Ok(()); + if !tcp_stopped { + if let Err(err) = TcpStream::connect(&address) { + tcp_stopped = true; + if err.kind() != io::ErrorKind::ConnectionRefused { + eprintln!("\nPageserver connection failed with error: {err}"); } } - } else { - println!("Pageserver still receives connections"); - thread::sleep(Duration::from_secs(1)); } + if tcp_stopped { + // Also check status on the HTTP port + + match self.check_status() { + Err(PageserverHttpError::Transport(err)) if err.is_connect() => { + println!("done!"); + return Ok(()); + } + Err(err) => { + eprintln!("\nPageserver status check failed with error: {err}"); + return Ok(()); + } + Ok(()) => { + // keep waiting + } + } + } + print!("."); + io::stdout().flush().unwrap(); + thread::sleep(Duration::from_secs(1)); } bail!("Failed to stop pageserver with pid {}", pid); @@ -460,12 +493,3 @@ impl PageServerNode { Ok(timeline_info_response) } } - -fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { - for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] { - if let Ok(value) = std::env::var(env_key) { - cmd = cmd.env(env_key, value); - } - } - cmd -} diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 93bb5f9cd7..6bcbc76551 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -1,13 +1,20 @@ #!/bin/sh set -eux +broker_endpoints_param="${BROKER_ENDPOINT:-absent}" +if [ "$broker_endpoints_param" != "absent" ]; then + broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']" +else + broker_endpoints_param='' +fi + if [ "$1" = 'pageserver' ]; then if [ ! -d "/data/tenants" ]; then echo "Initializing pageserver data directory" - pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" + pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" $broker_endpoints_param fi echo "Staring pageserver at 0.0.0.0:6400" - pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data + pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data else "$@" fi diff --git a/docs/docker.md b/docs/docker.md index cc54d012dd..100cdd248b 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -1,20 +1,20 @@ -# Docker images of Zenith +# Docker images of Neon ## Images Currently we build two main images: -- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). -- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres). +- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). +- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). -And additional intermediate images: +And additional intermediate image: -- [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools. +- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools. ## Building pipeline -1. Image `zenithdb/compute-tools` is re-built automatically. +We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs -2. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo. +1. `neondatabase/compute-tools` and `neondatabase/compute-node` -3. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically. +2. `neondatabase/neon` diff --git a/docs/glossary.md b/docs/glossary.md index ecc57b9ed1..a014446010 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -21,7 +21,7 @@ NOTE:It has nothing to do with PostgreSQL pg_basebackup. ### Branch -We can create branch at certain LSN using `zenith timeline branch` command. +We can create branch at certain LSN using `neon_local timeline branch` command. Each Branch lives in a corresponding timeline[] and has an ancestor[]. @@ -91,7 +91,7 @@ The layer map tracks what layers exist in a timeline. ### Layered repository -Zenith repository implementation that keeps data in layers. +Neon repository implementation that keeps data in layers. ### LSN The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log. @@ -101,7 +101,7 @@ It is printed as two hexadecimal numbers of up to 8 digits each, separated by a Check also [PostgreSQL doc about pg_lsn type](https://www.postgresql.org/docs/devel/datatype-pg-lsn.html) Values can be compared to calculate the volume of WAL data that separates them, so they are used to measure the progress of replication and recovery. -In postgres and Zenith lsns are used to describe certain points in WAL handling. +In Postgres and Neon LSNs are used to describe certain points in WAL handling. PostgreSQL LSNs and functions to monitor them: * `pg_current_wal_insert_lsn()` - Returns the current write-ahead log insert location. @@ -111,13 +111,13 @@ PostgreSQL LSNs and functions to monitor them: * `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically. [source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html): -Zenith safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md) +Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md) * `CommitLSN`: position in WAL confirmed by quorum safekeepers. * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. * `VCL`: the largerst LSN for which we can guarantee availablity of all prior records. -Zenith pageserver LSNs: +Neon pageserver LSNs: * `last_record_lsn` - the end of last processed WAL record. * `disk_consistent_lsn` - data is known to be fully flushed and fsync'd to local disk on pageserver up to this LSN. * `remote_consistent_lsn` - The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash. @@ -132,7 +132,7 @@ This is the unit of data exchange between compute node and pageserver. ### Pageserver -Zenith storage engine: repositories + wal receiver + page service + wal redo. +Neon storage engine: repositories + wal receiver + page service + wal redo. ### Page service @@ -184,10 +184,10 @@ relation exceeds that size, it is split into multiple segments. SLRUs include pg_clog, pg_multixact/members, and pg_multixact/offsets. There are other SLRUs in PostgreSQL, but they don't need to be stored permanently (e.g. pg_subtrans), -or we do not support them in zenith yet (pg_commit_ts). +or we do not support them in neon yet (pg_commit_ts). ### Tenant (Multitenancy) -Tenant represents a single customer, interacting with Zenith. +Tenant represents a single customer, interacting with Neon. Wal redo[] activity, timelines[], layers[] are managed for each tenant independently. One pageserver[] can serve multiple tenants at once. One safekeeper diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md index 4543be3dae..d4716156d1 100644 --- a/docs/rfcs/004-durability.md +++ b/docs/rfcs/004-durability.md @@ -22,7 +22,7 @@ In addition to the WAL safekeeper nodes, the WAL is archived in S3. WAL that has been archived to S3 can be removed from the safekeepers, so the safekeepers don't need a lot of disk space. - +``` +----------------+ +-----> | WAL safekeeper | | +----------------+ @@ -42,23 +42,23 @@ safekeepers, so the safekeepers don't need a lot of disk space. \ \ \ - \ +--------+ - \ | | - +--> | S3 | - | | - +--------+ - + \ +--------+ + \ | | + +------> | S3 | + | | + +--------+ +``` Every WAL safekeeper holds a section of WAL, and a VCL value. The WAL can be divided into three portions: - +``` VCL LSN | | V V .................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX Archived WAL Completed WAL In-flight WAL - +``` Note that all this WAL kept in a safekeeper is a contiguous section. This is different from Aurora: In Aurora, there can be holes in the diff --git a/docs/settings.md b/docs/settings.md index 017d349bb6..9564ef626f 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -25,10 +25,14 @@ max_file_descriptors = '100' # initial superuser role name to use when creating a new tenant initial_superuser_name = 'zenith_admin' +broker_etcd_prefix = 'neon' +broker_endpoints = ['some://etcd'] + # [remote_storage] ``` -The config above shows default values for all basic pageserver settings. +The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user, +see the corresponding section below. Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank. Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start. @@ -46,6 +50,17 @@ Example: `${PAGESERVER_BIN} -c "checkpoint_period = '100 s'" -c "remote_storage= Note that TOML distinguishes between strings and integers, the former require single or double quotes around them. +#### broker_endpoints + +A list of endpoints (etcd currently) to connect and pull the information from. +Mandatory, does not have a default, since requires etcd to be started as a separate process, +and its connection url should be specified separately. + +#### broker_etcd_prefix + +A prefix to add for every etcd key used, to separate one group of related instances from another, in the same cluster. +Default is `neon`. + #### checkpoint_distance `checkpoint_distance` is the amount of incoming WAL that is held in diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 5ddc6208d2..c8d4baff62 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -91,18 +91,22 @@ so manual installation of dependencies is not recommended. A single virtual environment with all dependencies is described in the single `Pipfile`. ### Prerequisites -- Install Python 3.7 (the minimal supported version) or greater. +- Install Python 3.9 (the minimal supported version) or greater. - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesnt work as expected. - - If you have some trouble with other version you can resolve it by installing Python 3.7 separately, via pyenv or via system package manager e.g.: + - If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.: ```bash # In Ubuntu sudo add-apt-repository ppa:deadsnakes/ppa sudo apt update - sudo apt install python3.7 + sudo apt install python3.9 ``` - Install `poetry` - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`. -- Install dependencies via `./scripts/pysync`. Note that CI uses Python 3.7 so if you have different version some linting tools can yield different result locally vs in the CI. +- Install dependencies via `./scripts/pysync`. + - Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile)) + so if you have different version some linting tools can yield different result locally vs in the CI. + - You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.9`. + This may also disable the `The currently activated Python version X.Y.Z is not supported by the project` warning. Run `poetry shell` to activate the virtual environment. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`. diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 1b27f99ccf..7fe142502b 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -16,12 +16,16 @@ use tokio::{sync::mpsc, task::JoinHandle}; use tracing::*; use utils::{ lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId}, }; +/// Default value to use for prefixing to all etcd keys with. +/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster. +pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon"; + #[derive(Debug, Deserialize, Serialize)] struct SafekeeperTimeline { - safekeeper_id: ZNodeId, + safekeeper_id: NodeId, info: SkTimelineInfo, } @@ -39,10 +43,10 @@ pub struct SkTimelineInfo { #[serde_as(as = "Option")] #[serde(default)] pub commit_lsn: Option, - /// LSN up to which safekeeper offloaded WAL to s3. + /// LSN up to which safekeeper has backed WAL. #[serde_as(as = "Option")] #[serde(default)] - pub s3_wal_lsn: Option, + pub backup_lsn: Option, /// LSN of last checkpoint uploaded by pageserver. #[serde_as(as = "Option")] #[serde(default)] @@ -67,7 +71,7 @@ pub enum BrokerError { /// A way to control the data retrieval from a certain subscription. pub struct SkTimelineSubscription { safekeeper_timeline_updates: - mpsc::UnboundedReceiver>>, + mpsc::UnboundedReceiver>>, kind: SkTimelineSubscriptionKind, watcher_handle: JoinHandle>, watcher: Watcher, @@ -77,7 +81,7 @@ impl SkTimelineSubscription { /// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet. pub async fn fetch_data( &mut self, - ) -> Option>> { + ) -> Option>> { self.safekeeper_timeline_updates.recv().await } @@ -104,28 +108,28 @@ impl SkTimelineSubscription { /// The subscription kind to the timeline updates from safekeeper. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct SkTimelineSubscriptionKind { - broker_prefix: String, + broker_etcd_prefix: String, kind: SubscriptionKind, } impl SkTimelineSubscriptionKind { - pub fn all(broker_prefix: String) -> Self { + pub fn all(broker_etcd_prefix: String) -> Self { Self { - broker_prefix, + broker_etcd_prefix, kind: SubscriptionKind::All, } } - pub fn tenant(broker_prefix: String, tenant: ZTenantId) -> Self { + pub fn tenant(broker_etcd_prefix: String, tenant: ZTenantId) -> Self { Self { - broker_prefix, + broker_etcd_prefix, kind: SubscriptionKind::Tenant(tenant), } } - pub fn timeline(broker_prefix: String, timeline: ZTenantTimelineId) -> Self { + pub fn timeline(broker_etcd_prefix: String, timeline: ZTenantTimelineId) -> Self { Self { - broker_prefix, + broker_etcd_prefix, kind: SubscriptionKind::Timeline(timeline), } } @@ -134,12 +138,12 @@ impl SkTimelineSubscriptionKind { match self.kind { SubscriptionKind::All => Regex::new(&format!( r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$", - self.broker_prefix + self.broker_etcd_prefix )) .expect("wrong regex for 'everything' subscription"), SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!( r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]])$", - self.broker_prefix + self.broker_etcd_prefix )) .expect("wrong regex for 'tenant' subscription"), SubscriptionKind::Timeline(ZTenantTimelineId { @@ -147,7 +151,7 @@ impl SkTimelineSubscriptionKind { timeline_id, }) => Regex::new(&format!( r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]])$", - self.broker_prefix + self.broker_etcd_prefix )) .expect("wrong regex for 'timeline' subscription"), } @@ -156,16 +160,16 @@ impl SkTimelineSubscriptionKind { /// Etcd key to use for watching a certain timeline updates from safekeepers. pub fn watch_key(&self) -> String { match self.kind { - SubscriptionKind::All => self.broker_prefix.to_string(), + SubscriptionKind::All => self.broker_etcd_prefix.to_string(), SubscriptionKind::Tenant(tenant_id) => { - format!("{}/{tenant_id}/safekeeper", self.broker_prefix) + format!("{}/{tenant_id}/safekeeper", self.broker_etcd_prefix) } SubscriptionKind::Timeline(ZTenantTimelineId { tenant_id, timeline_id, }) => format!( "{}/{tenant_id}/{timeline_id}/safekeeper", - self.broker_prefix + self.broker_etcd_prefix ), } } @@ -217,7 +221,7 @@ pub async fn subscribe_to_safekeeper_timeline_updates( break; } - let mut timeline_updates: HashMap> = HashMap::new(); + let mut timeline_updates: HashMap> = HashMap::new(); // Keep track that the timeline data updates from etcd arrive in the right order. // https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas // > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering. @@ -295,18 +299,18 @@ fn parse_etcd_key_value( parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?, ), - ZNodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?), + NodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?), ), SubscriptionKind::Tenant(tenant_id) => ( ZTenantTimelineId::new( tenant_id, parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, ), - ZNodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?), + NodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?), ), SubscriptionKind::Timeline(zttid) => ( zttid, - ZNodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?), + NodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?), ), }; diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index 3b6ff4691d..8ff5d1d421 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -prometheus = {version = "0.13", default_features=false} # removes protobuf dependency +prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency libc = "0.2" lazy_static = "1.4" once_cell = "1.8.0" diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 8756a078c3..9929fc6d45 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -3,7 +3,6 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. use lazy_static::lazy_static; -use once_cell::race::OnceBox; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_gauge, Gauge}; pub use prometheus::{register_gauge_vec, GaugeVec}; @@ -27,48 +26,15 @@ pub fn gather() -> Vec { prometheus::gather() } -static COMMON_METRICS_PREFIX: OnceBox<&str> = OnceBox::new(); - -/// Sets a prefix which will be used for all common metrics, typically a service -/// name like 'pageserver'. Should be executed exactly once in the beginning of -/// any executable which uses common metrics. -pub fn set_common_metrics_prefix(prefix: &'static str) { - // Not unwrap() because metrics may be initialized after multiple threads have been started. - COMMON_METRICS_PREFIX - .set(prefix.into()) - .unwrap_or_else(|_| { - eprintln!( - "set_common_metrics_prefix() was called second time with '{}', exiting", - prefix - ); - std::process::exit(1); - }); -} - -/// Prepends a prefix to a common metric name so they are distinguished between -/// different services, see -/// A call to set_common_metrics_prefix() is necessary prior to calling this. -pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String { - // Not unwrap() because metrics may be initialized after multiple threads have been started. - format!( - "{}_{}", - COMMON_METRICS_PREFIX.get().unwrap_or_else(|| { - eprintln!("set_common_metrics_prefix() was not called, but metrics are used, exiting"); - std::process::exit(1); - }), - unprefixed_metric_name - ) -} - lazy_static! { static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!( - new_common_metric_name("disk_io_bytes"), + "libmetrics_disk_io_bytes_total", "Bytes written and read from disk, grouped by the operation (read|write)", &["io_operation"] ) .expect("Failed to register disk i/o bytes int gauge vec"); static ref MAXRSS_KB: IntGauge = register_int_gauge!( - new_common_metric_name("maxrss_kb"), + "libmetrics_maxrss_kb", "Memory usage (Maximum Resident Set Size)" ) .expect("Failed to register maxrss_kb int gauge"); diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 7be5ca1b93..129c93cf6d 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -20,5 +20,10 @@ serde = { version = "1.0", features = ["derive"] } utils = { path = "../utils" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } +[dev-dependencies] +env_logger = "0.9" +postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +wal_generate = { path = "wal_generate" } + [build-dependencies] bindgen = "0.59.1" diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 7882058868..32a3022c5a 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -15,7 +15,7 @@ use crate::XLogPageHeaderData; use crate::XLogRecord; use crate::XLOG_PAGE_MAGIC; -use anyhow::bail; +use anyhow::{bail, ensure}; use byteorder::{ByteOrder, LittleEndian}; use bytes::BytesMut; use bytes::{Buf, Bytes}; @@ -30,6 +30,7 @@ use std::path::{Path, PathBuf}; use std::time::SystemTime; use utils::bin_ser::DeserializeError; use utils::bin_ser::SerializeError; +use utils::const_assert; use utils::lsn::Lsn; pub const XLOG_FNAME_LEN: usize = 24; @@ -149,8 +150,9 @@ fn find_end_of_wal_segment( ) -> anyhow::Result { // step back to the beginning of the page to read it in... let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ; + let mut skipping_first_contrecord: bool = false; let mut contlen: usize = 0; - let mut wal_crc: u32 = 0; + let mut xl_crc: u32 = 0; let mut crc: u32 = 0; let mut rec_offs: usize = 0; let mut buf = [0u8; XLOG_BLCKSZ]; @@ -158,11 +160,15 @@ fn find_end_of_wal_segment( let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap(); file.seek(SeekFrom::Start(offs as u64))?; + // xl_crc is the last field in XLogRecord, will not be read into rec_hdr + const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD); let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS]; + trace!("find_end_of_wal_segment(data_dir={}, segno={}, tli={}, wal_seg_size={}, start_offset=0x{:x})", data_dir.display(), segno, tli, wal_seg_size, start_offset); while offs < wal_seg_size { // we are at the beginning of the page; read it in if offs % XLOG_BLCKSZ == 0 { + trace!("offs=0x{:x}: new page", offs); let bytes_read = file.read(&mut buf)?; if bytes_read != buf.len() { bail!( @@ -176,30 +182,49 @@ fn find_end_of_wal_segment( let xlp_magic = LittleEndian::read_u16(&buf[0..2]); let xlp_info = LittleEndian::read_u16(&buf[2..4]); let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]); + trace!( + " xlp_magic=0x{:x}, xlp_info=0x{:x}, xlp_rem_len={}", + xlp_magic, + xlp_info, + xlp_rem_len + ); // this is expected in current usage when valid WAL starts after page header if xlp_magic != XLOG_PAGE_MAGIC as u16 { trace!( - "invalid WAL file {}.partial magic {} at {:?}", + " invalid WAL file {}.partial magic {} at {:?}", file_name, xlp_magic, Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)), ); } if offs == 0 { - offs = XLOG_SIZE_OF_XLOG_LONG_PHD; + offs += XLOG_SIZE_OF_XLOG_LONG_PHD; if (xlp_info & XLP_FIRST_IS_CONTRECORD) != 0 { - offs += ((xlp_rem_len + 7) & !7) as usize; + trace!(" first record is contrecord"); + skipping_first_contrecord = true; + contlen = xlp_rem_len as usize; + if offs < start_offset { + // Pre-condition failed: the beginning of the segment is unexpectedly corrupted. + ensure!(start_offset - offs >= contlen, + "start_offset is in the middle of the first record (which happens to be a contrecord), \ + expected to be on a record boundary. Is beginning of the segment corrupted?"); + contlen = 0; + // keep skipping_first_contrecord to avoid counting the contrecord as valid, we did not check it. + } + } else { + trace!(" first record is not contrecord"); } } else { offs += XLOG_SIZE_OF_XLOG_SHORT_PHD; } // ... and step forward again if asked + trace!(" skipped header to 0x{:x}", offs); offs = max(offs, start_offset); - // beginning of the next record } else if contlen == 0 { let page_offs = offs % XLOG_BLCKSZ; let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize; + trace!("offs=0x{:x}: new record, xl_tot_len={}", offs, xl_tot_len); if xl_tot_len == 0 { info!( "find_end_of_wal_segment reached zeros at {:?}, last records ends at {:?}", @@ -212,10 +237,25 @@ fn find_end_of_wal_segment( ); break; // zeros, reached the end } - last_valid_rec_pos = offs; + if skipping_first_contrecord { + skipping_first_contrecord = false; + trace!(" first contrecord has been just completed"); + } else { + trace!( + " updating last_valid_rec_pos: 0x{:x} --> 0x{:x}", + last_valid_rec_pos, + offs + ); + last_valid_rec_pos = offs; + } offs += 4; rec_offs = 4; contlen = xl_tot_len - 4; + trace!( + " reading rec_hdr[0..4] <-- [0x{:x}; 0x{:x})", + page_offs, + page_offs + 4 + ); rec_hdr[0..4].copy_from_slice(&buf[page_offs..page_offs + 4]); } else { // we're continuing a record, possibly from previous page. @@ -224,42 +264,118 @@ fn find_end_of_wal_segment( // read the rest of the record, or as much as fits on this page. let n = min(contlen, pageleft); - // fill rec_hdr (header up to (but not including) xl_crc field) + trace!( + "offs=0x{:x}, record continuation, pageleft={}, contlen={}", + offs, + pageleft, + contlen + ); + // fill rec_hdr header up to (but not including) xl_crc field + trace!( + " rec_offs={}, XLOG_RECORD_CRC_OFFS={}, XLOG_SIZE_OF_XLOG_RECORD={}", + rec_offs, + XLOG_RECORD_CRC_OFFS, + XLOG_SIZE_OF_XLOG_RECORD + ); if rec_offs < XLOG_RECORD_CRC_OFFS { let len = min(XLOG_RECORD_CRC_OFFS - rec_offs, n); + trace!( + " reading rec_hdr[{}..{}] <-- [0x{:x}; 0x{:x})", + rec_offs, + rec_offs + len, + page_offs, + page_offs + len + ); rec_hdr[rec_offs..rec_offs + len].copy_from_slice(&buf[page_offs..page_offs + len]); } if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD { let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS; - wal_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]); + // All records are aligned on 8-byte boundary, so their 8-byte frames + // cannot be split between pages. As xl_crc is the last field, + // its content is always on the same page. + const_assert!(XLOG_RECORD_CRC_OFFS % 8 == 4); + // We should always start reading aligned records even in incorrect WALs so if + // the condition is false it is likely a bug. However, it is localized somewhere + // in this function, hence we do not crash and just report failure instead. + ensure!(crc_offs % 8 == 4, "Record is not aligned properly (bug?)"); + xl_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]); + trace!( + " reading xl_crc: [0x{:x}; 0x{:x}) = 0x{:x}", + crc_offs, + crc_offs + 4, + xl_crc + ); crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]); - } else { - crc ^= 0xFFFFFFFFu32; + trace!( + " initializing crc: [0x{:x}; 0x{:x}); crc = 0x{:x}", + crc_offs + 4, + page_offs + n, + crc + ); + } else if rec_offs > XLOG_RECORD_CRC_OFFS { + // As all records are 8-byte aligned, the header is already fully read and `crc` is initialized in the branch above. + ensure!(rec_offs >= XLOG_SIZE_OF_XLOG_RECORD); + let old_crc = crc; crc = crc32c_append(crc, &buf[page_offs..page_offs + n]); + trace!( + " appending to crc: [0x{:x}; 0x{:x}); 0x{:x} --> 0x{:x}", + page_offs, + page_offs + n, + old_crc, + crc + ); + } else { + // Correct because of the way conditions are written above. + assert!(rec_offs + n < XLOG_SIZE_OF_XLOG_RECORD); + // If `skipping_first_contrecord == true`, we may be reading from a middle of a record + // which started in the previous segment. Hence there is no point in validating the header. + if !skipping_first_contrecord && rec_offs + n > XLOG_RECORD_CRC_OFFS { + info!( + "Curiously corrupted WAL: a record stops inside the header; \ + offs=0x{:x}, record continuation, pageleft={}, contlen={}", + offs, pageleft, contlen + ); + break; + } + // Do nothing: we are still reading the header. It's accounted in CRC in the end of the record. } - crc = !crc; rec_offs += n; offs += n; contlen -= n; if contlen == 0 { - crc = !crc; + trace!(" record completed at 0x{:x}", offs); crc = crc32c_append(crc, &rec_hdr); offs = (offs + 7) & !7; // pad on 8 bytes boundary */ - if crc == wal_crc { + trace!( + " padded offs to 0x{:x}, crc is {:x}, expected crc is {:x}", + offs, + crc, + xl_crc + ); + if skipping_first_contrecord { + // do nothing, the flag will go down on next iteration when we're reading new record + trace!(" first conrecord has been just completed"); + } else if crc == xl_crc { // record is valid, advance the result to its end (with // alignment to the next record taken into account) + trace!( + " updating last_valid_rec_pos: 0x{:x} --> 0x{:x}", + last_valid_rec_pos, + offs + ); last_valid_rec_pos = offs; } else { info!( "CRC mismatch {} vs {} at {}", - crc, wal_crc, last_valid_rec_pos + crc, xl_crc, last_valid_rec_pos ); break; } } } } + trace!("last_valid_rec_pos=0x{:x}", last_valid_rec_pos); Ok(last_valid_rec_pos as u32) } @@ -476,78 +592,126 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result anyhow::Result, + expected_end_of_wal_non_partial: Lsn, + last_segment: &str, + ) { + use wal_generate::*; + // 1. Generate some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("..") .join(".."); - let data_dir = top_path.join("test_output/test_find_end_of_wal"); - let initdb_path = top_path.join("tmp_install/bin/initdb"); - let lib_path = top_path.join("tmp_install/lib"); - if data_dir.exists() { - fs::remove_dir_all(&data_dir).unwrap(); + let cfg = Conf { + pg_distrib_dir: top_path.join("tmp_install"), + datadir: top_path.join(format!("test_output/{}", test_name)), + }; + if cfg.datadir.exists() { + fs::remove_dir_all(&cfg.datadir).unwrap(); } - println!("Using initdb from '{}'", initdb_path.display()); - println!("Data directory '{}'", data_dir.display()); - let initdb_output = Command::new(initdb_path) - .args(&["-D", data_dir.to_str().unwrap()]) - .arg("--no-instructions") - .arg("--no-sync") - .env_clear() - .env("LD_LIBRARY_PATH", &lib_path) - .env("DYLD_LIBRARY_PATH", &lib_path) - .output() - .unwrap(); - assert!( - initdb_output.status.success(), - "initdb failed. Status: '{}', stdout: '{}', stderr: '{}'", - initdb_output.status, - String::from_utf8_lossy(&initdb_output.stdout), - String::from_utf8_lossy(&initdb_output.stderr), - ); + cfg.initdb().unwrap(); + let mut srv = cfg.start_server().unwrap(); + let expected_wal_end: Lsn = + u64::from(generate_wal(&mut srv.connect_with_timeout().unwrap()).unwrap()).into(); + srv.kill(); // 2. Pick WAL generated by initdb - let wal_dir = data_dir.join("pg_wal"); + let wal_dir = cfg.datadir.join("pg_wal"); let wal_seg_size = 16 * 1024 * 1024; // 3. Check end_of_wal on non-partial WAL segment (we treat it as fully populated) let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap(); let wal_end = Lsn(wal_end); - println!("wal_end={}, tli={}", wal_end, tli); - assert_eq!(wal_end, "0/2000000".parse::().unwrap()); + info!( + "find_end_of_wal returned (wal_end={}, tli={})", + wal_end, tli + ); + assert_eq!(wal_end, expected_end_of_wal_non_partial); // 4. Get the actual end of WAL by pg_waldump - let waldump_path = top_path.join("tmp_install/bin/pg_waldump"); - let waldump_output = Command::new(waldump_path) - .arg(wal_dir.join("000000010000000000000001")) - .env_clear() - .env("LD_LIBRARY_PATH", &lib_path) - .env("DYLD_LIBRARY_PATH", &lib_path) - .output() - .unwrap(); - let waldump_output = std::str::from_utf8(&waldump_output.stderr).unwrap(); - println!("waldump_output = '{}'", &waldump_output); - let re = Regex::new(r"invalid record length at (.+):").unwrap(); - let caps = re.captures(waldump_output).unwrap(); + let waldump_output = cfg + .pg_waldump("000000010000000000000001", last_segment) + .unwrap() + .stderr; + let waldump_output = std::str::from_utf8(&waldump_output).unwrap(); + let caps = match Regex::new(r"invalid record length at (.+):") + .unwrap() + .captures(waldump_output) + { + Some(caps) => caps, + None => { + error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output); + panic!(); + } + }; let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); + info!( + "waldump erred on {}, expected wal end at {}", + waldump_wal_end, expected_wal_end + ); + assert_eq!(waldump_wal_end, expected_wal_end); // 5. Rename file to partial to actually find last valid lsn fs::rename( - wal_dir.join("000000010000000000000001"), - wal_dir.join("000000010000000000000001.partial"), + wal_dir.join(last_segment), + wal_dir.join(format!("{}.partial", last_segment)), ) .unwrap(); let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap(); let wal_end = Lsn(wal_end); - println!("wal_end={}, tli={}", wal_end, tli); + info!( + "find_end_of_wal returned (wal_end={}, tli={})", + wal_end, tli + ); assert_eq!(wal_end, waldump_wal_end); } + #[test] + pub fn test_find_end_of_wal_simple() { + init_logging(); + test_end_of_wal( + "test_find_end_of_wal_simple", + wal_generate::generate_simple, + "0/2000000".parse::().unwrap(), + "000000010000000000000001", + ); + } + + #[test] + pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() { + init_logging(); + test_end_of_wal( + "test_find_end_of_wal_crossing_segment_followed_by_small_one", + wal_generate::generate_wal_record_crossing_segment_followed_by_small_one, + "0/3000000".parse::().unwrap(), + "000000010000000000000002", + ); + } + + #[test] + #[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO + pub fn test_find_end_of_wal_last_crossing_segment() { + init_logging(); + test_end_of_wal( + "test_find_end_of_wal_last_crossing_segment", + wal_generate::generate_last_wal_record_crossing_segment, + "0/3000000".parse::().unwrap(), + "000000010000000000000002", + ); + } + /// Check the math in update_next_xid /// /// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL, diff --git a/libs/postgres_ffi/wal_generate/Cargo.toml b/libs/postgres_ffi/wal_generate/Cargo.toml new file mode 100644 index 0000000000..a10671dddd --- /dev/null +++ b/libs/postgres_ffi/wal_generate/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "wal_generate" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "1.0" +clap = "3.0" +env_logger = "0.9" +log = "0.4" +postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tempfile = "3.2" diff --git a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs b/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs new file mode 100644 index 0000000000..07ceb31c7f --- /dev/null +++ b/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs @@ -0,0 +1,58 @@ +use anyhow::*; +use clap::{App, Arg}; +use wal_generate::*; + +fn main() -> Result<()> { + env_logger::Builder::from_env( + env_logger::Env::default().default_filter_or("wal_generate=info"), + ) + .init(); + let arg_matches = App::new("Postgres WAL generator") + .about("Generates Postgres databases with specific WAL properties") + .arg( + Arg::new("datadir") + .short('D') + .long("datadir") + .takes_value(true) + .help("Data directory for the Postgres server") + .required(true) + ) + .arg( + Arg::new("pg-distrib-dir") + .long("pg-distrib-dir") + .takes_value(true) + .help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)") + .default_value("/usr/local") + ) + .arg( + Arg::new("type") + .long("type") + .takes_value(true) + .help("Type of WAL to generate") + .possible_values(["simple", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one"]) + .required(true) + ) + .get_matches(); + + let cfg = Conf { + pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(), + datadir: arg_matches.value_of("datadir").unwrap().into(), + }; + cfg.initdb()?; + let mut srv = cfg.start_server()?; + let lsn = match arg_matches.value_of("type").unwrap() { + "simple" => generate_simple(&mut srv.connect_with_timeout()?)?, + "last_wal_record_crossing_segment" => { + generate_last_wal_record_crossing_segment(&mut srv.connect_with_timeout()?)? + } + "wal_record_crossing_segment_followed_by_small_one" => { + generate_wal_record_crossing_segment_followed_by_small_one( + &mut srv.connect_with_timeout()?, + )? + } + a => panic!("Unknown --type argument: {}", a), + }; + println!("end_of_wal = {}", lsn); + srv.kill(); + Ok(()) +} diff --git a/libs/postgres_ffi/wal_generate/src/lib.rs b/libs/postgres_ffi/wal_generate/src/lib.rs new file mode 100644 index 0000000000..a5cd81d68a --- /dev/null +++ b/libs/postgres_ffi/wal_generate/src/lib.rs @@ -0,0 +1,278 @@ +use anyhow::*; +use core::time::Duration; +use log::*; +use postgres::types::PgLsn; +use postgres::Client; +use std::cmp::Ordering; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; +use std::time::Instant; +use tempfile::{tempdir, TempDir}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Conf { + pub pg_distrib_dir: PathBuf, + pub datadir: PathBuf, +} + +pub struct PostgresServer { + process: std::process::Child, + _unix_socket_dir: TempDir, + client_config: postgres::Config, +} + +impl Conf { + fn pg_bin_dir(&self) -> PathBuf { + self.pg_distrib_dir.join("bin") + } + + fn pg_lib_dir(&self) -> PathBuf { + self.pg_distrib_dir.join("lib") + } + + fn new_pg_command(&self, command: impl AsRef) -> Result { + let path = self.pg_bin_dir().join(command); + ensure!(path.exists(), "Command {:?} does not exist", path); + let mut cmd = Command::new(path); + cmd.env_clear() + .env("LD_LIBRARY_PATH", self.pg_lib_dir()) + .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()); + Ok(cmd) + } + + pub fn initdb(&self) -> Result<()> { + if let Some(parent) = self.datadir.parent() { + info!("Pre-creating parent directory {:?}", parent); + // Tests may be run concurrently and there may be a race to create `test_output/`. + // std::fs::create_dir_all is guaranteed to have no races with another thread creating directories. + std::fs::create_dir_all(parent)?; + } + info!( + "Running initdb in {:?} with user \"postgres\"", + self.datadir + ); + let output = self + .new_pg_command("initdb")? + .arg("-D") + .arg(self.datadir.as_os_str()) + .args(&["-U", "postgres", "--no-instructions", "--no-sync"]) + .output()?; + debug!("initdb output: {:?}", output); + ensure!( + output.status.success(), + "initdb failed, stdout and stderr follow:\n{}{}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr), + ); + Ok(()) + } + + pub fn start_server(&self) -> Result { + info!("Starting Postgres server in {:?}", self.datadir); + let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols) + let unix_socket_dir_path = unix_socket_dir.path().to_owned(); + let server_process = self + .new_pg_command("postgres")? + .args(&["-c", "listen_addresses="]) + .arg("-k") + .arg(unix_socket_dir_path.as_os_str()) + .arg("-D") + .arg(self.datadir.as_os_str()) + .args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed + .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output + .args(&["-c", "shared_preload_libraries=zenith"]) // can only be loaded at startup + // Disable background processes as much as possible + .args(&["-c", "wal_writer_delay=10s"]) + .args(&["-c", "autovacuum=off"]) + .stderr(Stdio::null()) + .spawn()?; + let server = PostgresServer { + process: server_process, + _unix_socket_dir: unix_socket_dir, + client_config: { + let mut c = postgres::Config::new(); + c.host_path(&unix_socket_dir_path); + c.user("postgres"); + c.connect_timeout(Duration::from_millis(1000)); + c + }, + }; + Ok(server) + } + + pub fn pg_waldump( + &self, + first_segment_name: &str, + last_segment_name: &str, + ) -> Result { + let first_segment_file = self.datadir.join(first_segment_name); + let last_segment_file = self.datadir.join(last_segment_name); + info!( + "Running pg_waldump for {} .. {}", + first_segment_file.display(), + last_segment_file.display() + ); + let output = self + .new_pg_command("pg_waldump")? + .args(&[ + &first_segment_file.as_os_str(), + &last_segment_file.as_os_str(), + ]) + .output()?; + debug!("waldump output: {:?}", output); + Ok(output) + } +} + +impl PostgresServer { + pub fn connect_with_timeout(&self) -> Result { + let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap(); + while Instant::now() < retry_until { + use std::result::Result::Ok; + if let Ok(client) = self.client_config.connect(postgres::NoTls) { + return Ok(client); + } + std::thread::sleep(Duration::from_millis(100)); + } + bail!("Connection timed out"); + } + + pub fn kill(&mut self) { + self.process.kill().unwrap(); + self.process.wait().unwrap(); + } +} + +impl Drop for PostgresServer { + fn drop(&mut self) { + use std::result::Result::Ok; + match self.process.try_wait() { + Ok(Some(_)) => return, + Ok(None) => { + warn!("Server was not terminated, will be killed"); + } + Err(e) => { + error!("Unable to get status of the server: {}, will be killed", e); + } + } + let _ = self.process.kill(); + } +} + +pub trait PostgresClientExt: postgres::GenericClient { + fn pg_current_wal_insert_lsn(&mut self) -> Result { + Ok(self + .query_one("SELECT pg_current_wal_insert_lsn()", &[])? + .get(0)) + } + fn pg_current_wal_flush_lsn(&mut self) -> Result { + Ok(self + .query_one("SELECT pg_current_wal_flush_lsn()", &[])? + .get(0)) + } +} + +impl PostgresClientExt for C {} + +fn generate_internal( + client: &mut C, + f: impl Fn(&mut C, PgLsn) -> Result>, +) -> Result { + client.execute("create extension if not exists zenith_test_utils", &[])?; + + let wal_segment_size = client.query_one( + "select cast(setting as bigint) as setting, unit \ + from pg_settings where name = 'wal_segment_size'", + &[], + )?; + ensure!( + wal_segment_size.get::<_, String>("unit") == "B", + "Unexpected wal_segment_size unit" + ); + ensure!( + wal_segment_size.get::<_, i64>("setting") == 16 * 1024 * 1024, + "Unexpected wal_segment_size in bytes" + ); + + let initial_lsn = client.pg_current_wal_insert_lsn()?; + info!("LSN initial = {}", initial_lsn); + + let last_lsn = match f(client, initial_lsn)? { + None => client.pg_current_wal_insert_lsn()?, + Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) { + Ordering::Less => bail!("Some records were inserted after the generated WAL"), + Ordering::Equal => last_lsn, + Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), + }, + }; + + // Some records may be not flushed, e.g. non-transactional logical messages. + client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; + match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) { + Ordering::Less => bail!("Some records were flushed after the generated WAL"), + Ordering::Equal => {} + Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"), + } + Ok(last_lsn) +} + +pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result { + generate_internal(client, |client, _| { + client.execute("CREATE table t(x int)", &[])?; + Ok(None) + }) +} + +fn generate_single_logical_message( + client: &mut impl postgres::GenericClient, + transactional: bool, +) -> Result { + generate_internal(client, |client, initial_lsn| { + ensure!( + initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), + "Initial LSN is too far in the future" + ); + + let message_lsn: PgLsn = client + .query_one( + "select pg_logical_emit_message($1, 'big-16mb-msg', \ + concat(repeat('abcd', 16 * 256 * 1024), 'end')) as message_lsn", + &[&transactional], + )? + .get("message_lsn"); + ensure!( + message_lsn > PgLsn::from(0x0200_0000 + 4 * 8192), + "Logical message did not cross the segment boundary" + ); + ensure!( + message_lsn < PgLsn::from(0x0400_0000), + "Logical message crossed two segments" + ); + + if transactional { + // Transactional logical messages are part of a transaction, so the one above is + // followed by a small COMMIT record. + + let after_message_lsn = client.pg_current_wal_insert_lsn()?; + ensure!( + message_lsn < after_message_lsn, + "No record found after the emitted message" + ); + Ok(Some(after_message_lsn)) + } else { + Ok(Some(message_lsn)) + } + }) +} + +pub fn generate_wal_record_crossing_segment_followed_by_small_one( + client: &mut impl postgres::GenericClient, +) -> Result { + generate_single_logical_message(client, true) +} + +pub fn generate_last_wal_record_crossing_segment( + client: &mut C, +) -> Result { + generate_single_logical_message(client, false) +} diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 291f6e50ac..b11b3cf371 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -5,14 +5,17 @@ edition = "2021" [dependencies] anyhow = { version = "1.0", features = ["backtrace"] } -tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } -tokio-util = { version = "0.7", features = ["io"] } -tracing = "0.1.27" +async-trait = "0.1" +metrics = { version = "0.1", path = "../metrics" } +once_cell = "1.8.0" rusoto_core = "0.48" rusoto_s3 = "0.48" serde = { version = "1.0", features = ["derive"] } serde_json = "1" -async-trait = "0.1" +tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } +tokio-util = { version = "0.7", features = ["io"] } +toml_edit = { version = "0.13", features = ["easy"] } +tracing = "0.1.27" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 9bbb855dd5..0889cb720c 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -16,8 +16,10 @@ use std::{ path::{Path, PathBuf}, }; -use anyhow::Context; +use anyhow::{bail, Context}; + use tokio::io; +use toml_edit::Item; use tracing::info; pub use self::{ @@ -87,7 +89,8 @@ pub trait RemoteStorage: Send + Sync { async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>; } -/// TODO kb +/// Every storage, currently supported. +/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. pub enum GenericRemoteStorage { Local(LocalFs), S3(S3Bucket), @@ -202,6 +205,90 @@ pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) .with_extension(new_extension.as_ref()) } +impl RemoteStorageConfig { + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + let local_path = toml.get("local_path"); + let bucket_name = toml.get("bucket_name"); + let bucket_region = toml.get("bucket_region"); + + let max_concurrent_syncs = NonZeroUsize::new( + parse_optional_integer("max_concurrent_syncs", toml)? + .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS), + ) + .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?; + + let max_sync_errors = NonZeroU32::new( + parse_optional_integer("max_sync_errors", toml)? + .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), + ) + .context("Failed to parse 'max_sync_errors' as a positive integer")?; + + let concurrency_limit = NonZeroUsize::new( + parse_optional_integer("concurrency_limit", toml)? + .unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), + ) + .context("Failed to parse 'concurrency_limit' as a positive integer")?; + + let storage = match (local_path, bucket_name, bucket_region) { + (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"), + (_, Some(_), None) => { + bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") + } + (_, None, Some(_)) => { + bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") + } + (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { + bucket_name: parse_toml_string("bucket_name", bucket_name)?, + bucket_region: parse_toml_string("bucket_region", bucket_region)?, + prefix_in_bucket: toml + .get("prefix_in_bucket") + .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket)) + .transpose()?, + endpoint: toml + .get("endpoint") + .map(|endpoint| parse_toml_string("endpoint", endpoint)) + .transpose()?, + concurrency_limit, + }), + (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( + parse_toml_string("local_path", local_path)?, + )), + (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"), + }; + + Ok(RemoteStorageConfig { + max_concurrent_syncs, + max_sync_errors, + storage, + }) + } +} + +// Helper functions to parse a toml Item +fn parse_optional_integer(name: &str, item: &toml_edit::Item) -> anyhow::Result> +where + I: TryFrom, + E: std::error::Error + Send + Sync + 'static, +{ + let toml_integer = match item.get(name) { + Some(item) => item + .as_integer() + .with_context(|| format!("configure option {name} is not an integer"))?, + None => return Ok(None), + }; + + I::try_from(toml_integer) + .map(Some) + .with_context(|| format!("configure option {name} is too large")) +} + +fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result { + let s = item + .as_str() + .with_context(|| format!("configure option {name} is not a string"))?; + Ok(s.to_string()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 01aaf7ca7e..80d6966494 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -23,6 +23,71 @@ use crate::{strip_path_prefix, RemoteStorage, S3Config}; use super::StorageMetadata; +pub(super) mod metrics { + use metrics::{register_int_counter_vec, IntCounterVec}; + use once_cell::sync::Lazy; + + static S3_REQUESTS_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "remote_storage_s3_requests_count", + "Number of s3 requests of particular type", + &["request_type"], + ) + .expect("failed to define a metric") + }); + + static S3_REQUESTS_FAIL_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "remote_storage_s3_failures_count", + "Number of failed s3 requests of particular type", + &["request_type"], + ) + .expect("failed to define a metric") + }); + + pub fn inc_get_object() { + S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc(); + } + + pub fn inc_get_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["get_object"]) + .inc(); + } + + pub fn inc_put_object() { + S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc(); + } + + pub fn inc_put_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["put_object"]) + .inc(); + } + + pub fn inc_delete_object() { + S3_REQUESTS_COUNT + .with_label_values(&["delete_object"]) + .inc(); + } + + pub fn inc_delete_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["delete_object"]) + .inc(); + } + + pub fn inc_list_objects() { + S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc(); + } + + pub fn inc_list_objects_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["list_objects"]) + .inc(); + } +} + const S3_PREFIX_SEPARATOR: char = '/'; #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)] @@ -152,6 +217,9 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 list")?; + + metrics::inc_list_objects(); + let fetch_response = self .client .list_objects_v2(ListObjectsV2Request { @@ -160,7 +228,11 @@ impl RemoteStorage for S3Bucket { continuation_token, ..ListObjectsV2Request::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_list_objects_fail(); + e + })?; document_keys.extend( fetch_response .contents @@ -190,6 +262,8 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 upload")?; + + metrics::inc_put_object(); self.client .put_object(PutObjectRequest { body: Some(StreamingBody::new_with_size( @@ -201,7 +275,11 @@ impl RemoteStorage for S3Bucket { metadata: metadata.map(|m| m.0), ..PutObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_put_object_fail(); + e + })?; Ok(()) } @@ -215,6 +293,9 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 download")?; + + metrics::inc_get_object(); + let object_output = self .client .get_object(GetObjectRequest { @@ -222,7 +303,11 @@ impl RemoteStorage for S3Bucket { key: from.key().to_owned(), ..GetObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_get_object_fail(); + e + })?; if let Some(body) = object_output.body { let mut from = io::BufReader::new(body.into_async_read()); @@ -251,6 +336,9 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 range download")?; + + metrics::inc_get_object(); + let object_output = self .client .get_object(GetObjectRequest { @@ -259,7 +347,11 @@ impl RemoteStorage for S3Bucket { range, ..GetObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_get_object_fail(); + e + })?; if let Some(body) = object_output.body { let mut from = io::BufReader::new(body.into_async_read()); @@ -275,13 +367,20 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 delete")?; + + metrics::inc_delete_object(); + self.client .delete_object(DeleteObjectRequest { bucket: self.bucket_name.clone(), key: path.key().to_owned(), ..DeleteObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_delete_object_fail(); + e + })?; Ok(()) } } diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 77acab496f..51bff5f6eb 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -5,7 +5,7 @@ use anyhow::anyhow; use hyper::header::AUTHORIZATION; use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server}; use lazy_static::lazy_static; -use metrics::{new_common_metric_name, register_int_counter, Encoder, IntCounter, TextEncoder}; +use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use routerify::ext::RequestExt; use routerify::RequestInfo; use routerify::{Middleware, Router, RouterBuilder, RouterService}; @@ -18,7 +18,7 @@ use super::error::ApiError; lazy_static! { static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!( - new_common_metric_name("serve_metrics_count"), + "libmetrics_metric_handler_requests_total", "Number of metric requests made" ) .expect("failed to define a metric"); diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 4810909712..15d4c7a81e 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -95,3 +95,11 @@ macro_rules! project_git_version { ); }; } + +/// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime. +#[macro_export] +macro_rules! const_assert { + ($($args:tt)*) => { + const _: () = assert!($($args)*); + }; +} diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index c09d8c67ce..3dab2a625c 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -26,6 +26,9 @@ impl Lsn { /// Maximum possible value for an LSN pub const MAX: Lsn = Lsn(u64::MAX); + /// Invalid value for InvalidXLogRecPtr, as defined in xlogdefs.h + pub const INVALID: Lsn = Lsn(0); + /// Subtract a number, returning None on overflow. pub fn checked_sub>(self, other: T) -> Option { let other: u64 = other.into(); @@ -103,6 +106,12 @@ impl Lsn { pub fn is_aligned(&self) -> bool { *self == self.align() } + + /// Return if the LSN is valid + /// mimics postgres XLogRecPtrIsInvalid macro + pub fn is_valid(self) -> bool { + self != Lsn::INVALID + } } impl From for Lsn { diff --git a/libs/utils/src/zid.rs b/libs/utils/src/zid.rs index 44d81cda50..0ef174da4d 100644 --- a/libs/utils/src/zid.rs +++ b/libs/utils/src/zid.rs @@ -218,7 +218,7 @@ impl ZTenantTimelineId { impl fmt::Display for ZTenantTimelineId { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}-{}", self.tenant_id, self.timeline_id) + write!(f, "{}/{}", self.tenant_id, self.timeline_id) } } @@ -226,9 +226,9 @@ impl fmt::Display for ZTenantTimelineId { // by the console. #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)] #[serde(transparent)] -pub struct ZNodeId(pub u64); +pub struct NodeId(pub u64); -impl fmt::Display for ZNodeId { +impl fmt::Display for NodeId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.0) } diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml deleted file mode 100644 index a3fda0b246..0000000000 --- a/monitoring/docker-compose.yml +++ /dev/null @@ -1,25 +0,0 @@ -version: "3" -services: - - prometheus: - container_name: prometheus - image: prom/prometheus:latest - volumes: - - ./prometheus.yaml:/etc/prometheus/prometheus.yml - # ports: - # - "9090:9090" - # TODO: find a proper portable solution - network_mode: "host" - - grafana: - image: grafana/grafana:latest - volumes: - - ./grafana.yaml:/etc/grafana/provisioning/datasources/datasources.yaml - environment: - - GF_AUTH_ANONYMOUS_ENABLED=true - - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - - GF_AUTH_DISABLE_LOGIN_FORM=true - # ports: - # - "3000:3000" - # TODO: find a proper portable solution - network_mode: "host" diff --git a/monitoring/grafana.yaml b/monitoring/grafana.yaml deleted file mode 100644 index eac8879e6c..0000000000 --- a/monitoring/grafana.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: 1 - -datasources: -- name: Prometheus - type: prometheus - access: proxy - orgId: 1 - url: http://localhost:9090 - basicAuth: false - isDefault: false - version: 1 - editable: false diff --git a/monitoring/prometheus.yaml b/monitoring/prometheus.yaml deleted file mode 100644 index ba55d53737..0000000000 --- a/monitoring/prometheus.yaml +++ /dev/null @@ -1,5 +0,0 @@ -scrape_configs: - - job_name: 'default' - scrape_interval: 10s - static_configs: - - targets: ['localhost:9898'] diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index 6538cdefc4..8d39fe5d0d 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -1,10 +1,10 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{App, AppSettings, Arg, ArgMatches}; use control_plane::compute::ComputeControlPlane; -use control_plane::local_env; -use control_plane::local_env::LocalEnv; +use control_plane::local_env::{EtcdBroker, LocalEnv}; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage::PageServerNode; +use control_plane::{etcd, local_env}; use pageserver::config::defaults::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, @@ -14,6 +14,7 @@ use safekeeper::defaults::{ DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; use std::collections::{BTreeSet, HashMap}; +use std::path::Path; use std::process::exit; use std::str::FromStr; use utils::{ @@ -21,39 +22,38 @@ use utils::{ lsn::Lsn, postgres_backend::AuthType, project_git_version, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use pageserver::timelines::TimelineInfo; // Default id of a safekeeper node, if not specified on the command line. -const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1); -const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1); +const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1); +const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); -fn default_conf() -> String { +fn default_conf(etcd_binary_path: &Path) -> String { format!( r#" # Default built-in configuration, defined in main.rs +[etcd_broker] +broker_endpoints = ['http://localhost:2379'] +etcd_binary_path = '{etcd_binary_path}' + [pageserver] -id = {pageserver_id} -listen_pg_addr = '{pageserver_pg_addr}' -listen_http_addr = '{pageserver_http_addr}' +id = {DEFAULT_PAGESERVER_ID} +listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}' +listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}' auth_type = '{pageserver_auth_type}' [[safekeepers]] -id = {safekeeper_id} -pg_port = {safekeeper_pg_port} -http_port = {safekeeper_http_port} +id = {DEFAULT_SAFEKEEPER_ID} +pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} +http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} "#, - pageserver_id = DEFAULT_PAGESERVER_ID, - pageserver_pg_addr = DEFAULT_PAGESERVER_PG_ADDR, - pageserver_http_addr = DEFAULT_PAGESERVER_HTTP_ADDR, + etcd_binary_path = etcd_binary_path.display(), pageserver_auth_type = AuthType::Trust, - safekeeper_id = DEFAULT_SAFEKEEPER_ID, - safekeeper_pg_port = DEFAULT_SAFEKEEPER_PG_PORT, - safekeeper_http_port = DEFAULT_SAFEKEEPER_HTTP_PORT, ) } @@ -167,12 +167,12 @@ fn main() -> Result<()> { .subcommand(App::new("create") .arg(tenant_id_arg.clone()) .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) - .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) - ) + .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) + ) .subcommand(App::new("config") .arg(tenant_id_arg.clone()) - .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) - ) + .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) + ) ) .subcommand( App::new("pageserver") @@ -275,7 +275,7 @@ fn main() -> Result<()> { "pageserver" => handle_pageserver(sub_args, &env), "pg" => handle_pg(sub_args, &env), "safekeeper" => handle_safekeeper(sub_args, &env), - _ => bail!("unexpected subcommand {}", sub_name), + _ => bail!("unexpected subcommand {sub_name}"), }; if original_env != env { @@ -289,7 +289,7 @@ fn main() -> Result<()> { Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?, Ok(None) => (), Err(e) => { - eprintln!("command failed: {:?}", e); + eprintln!("command failed: {e:?}"); exit(1); } } @@ -468,21 +468,21 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result Result { +fn handle_init(init_match: &ArgMatches) -> anyhow::Result { let initial_timeline_id_arg = parse_timeline_id(init_match)?; // Create config file let toml_file: String = if let Some(config_path) = init_match.value_of("config") { // load and parse the file std::fs::read_to_string(std::path::Path::new(config_path)) - .with_context(|| format!("Could not read configuration file \"{}\"", config_path))? + .with_context(|| format!("Could not read configuration file '{config_path}'"))? } else { // Built-in default config - default_conf() + default_conf(&EtcdBroker::locate_etcd()?) }; let mut env = - LocalEnv::create_config(&toml_file).context("Failed to create neon configuration")?; + LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; env.init().context("Failed to initialize neon repository")?; // default_tenantid was generated by the `env.init()` call above @@ -497,7 +497,7 @@ fn handle_init(init_match: &ArgMatches) -> Result { &pageserver_config_overrides(init_match), ) .unwrap_or_else(|e| { - eprintln!("pageserver init failed: {}", e); + eprintln!("pageserver init failed: {e}"); exit(1); }); @@ -860,7 +860,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn get_safekeeper(env: &local_env::LocalEnv, id: ZNodeId) -> Result { +fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result { if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) { Ok(SafekeeperNode::from_env(env, node)) } else { @@ -876,7 +876,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul // All the commands take an optional safekeeper name argument let sk_id = if let Some(id_str) = sub_args.value_of("id") { - ZNodeId(id_str.parse().context("while parsing safekeeper id")?) + NodeId(id_str.parse().context("while parsing safekeeper id")?) } else { DEFAULT_SAFEKEEPER_ID }; @@ -920,20 +920,23 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { + etcd::start_etcd_process(env)?; let pageserver = PageServerNode::from_env(env); // Postgres nodes are not started automatically if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { - eprintln!("pageserver start failed: {}", e); + eprintln!("pageserver start failed: {e}"); + try_stop_etcd_process(env); exit(1); } for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.start() { - eprintln!("safekeeper '{}' start failed: {}", safekeeper.id, e); + eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id); + try_stop_etcd_process(env); exit(1); } } @@ -963,5 +966,14 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result< eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e); } } + + try_stop_etcd_process(env); + Ok(()) } + +fn try_stop_etcd_process(env: &local_env::LocalEnv) { + if let Err(e) = etcd::stop_etcd_process(env) { + eprintln!("etcd stop failed: {e}"); + } +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index f4f8ca1fe6..c0efe59669 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -58,6 +58,7 @@ git-version = "0.3.5" zstd = { version = "0.11.1", features = ["experimental"] } postgres_ffi = { path = "../libs/postgres_ffi" } +etcd_broker = { path = "../libs/etcd_broker" } metrics = { path = "../libs/metrics" } utils = { path = "../libs/utils" } remote_storage = { path = "../libs/remote_storage" } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 92d35130d8..46d824b2e2 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,8 +10,9 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{anyhow, ensure, Context, Result}; +use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{BufMut, BytesMut}; +use fail::fail_point; use std::fmt::Write as FmtWrite; use std::io; use std::io::Write; @@ -30,11 +31,16 @@ use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. -pub struct Basebackup<'a> { - ar: Builder<&'a mut dyn Write>, +pub struct Basebackup<'a, W> +where + W: Write, +{ + ar: Builder>, timeline: &'a Arc, pub lsn: Lsn, prev_record_lsn: Lsn, + + finished: bool, } // Create basebackup with non-rel data in it. Omit relational data. @@ -44,12 +50,15 @@ pub struct Basebackup<'a> { // * When working without safekeepers. In this situation it is important to match the lsn // we are taking basebackup on with the lsn that is used in pageserver's walreceiver // to start the replication. -impl<'a> Basebackup<'a> { +impl<'a, W> Basebackup<'a, W> +where + W: Write, +{ pub fn new( - write: &'a mut dyn Write, + write: W, timeline: &'a Arc, req_lsn: Option, - ) -> Result> { + ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). We include prev_record_lsn in the @@ -90,14 +99,15 @@ impl<'a> Basebackup<'a> { ); Ok(Basebackup { - ar: Builder::new(write), + ar: Builder::new(AbortableWrite::new(write)), timeline, lsn: backup_lsn, prev_record_lsn: backup_prev, + finished: false, }) } - pub fn send_tarball(&mut self) -> anyhow::Result<()> { + pub fn send_tarball(mut self) -> anyhow::Result<()> { // Create pgdata subdirs structure for dir in pg_constants::PGDATA_SUBDIRS.iter() { let header = new_tar_header_dir(*dir)?; @@ -135,9 +145,14 @@ impl<'a> Basebackup<'a> { self.add_twophase_file(xid)?; } + fail_point!("basebackup-before-control-file", |_| { + bail!("failpoint basebackup-before-control-file") + }); + // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file()?; self.ar.finish()?; + self.finished = true; debug!("all tarred up!"); Ok(()) } @@ -331,6 +346,19 @@ impl<'a> Basebackup<'a> { } } +impl<'a, W> Drop for Basebackup<'a, W> +where + W: Write, +{ + /// If the basebackup was not finished, prevent the Archive::drop() from + /// writing the end-of-archive marker. + fn drop(&mut self) { + if !self.finished { + self.ar.get_mut().abort(); + } + } +} + // // Create new tarball entry header // @@ -366,3 +394,49 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result
{ header.set_cksum(); Ok(header) } + +/// A wrapper that passes through all data to the underlying Write, +/// until abort() is called. +/// +/// tar::Builder has an annoying habit of finishing the archive with +/// a valid tar end-of-archive marker (two 512-byte sectors of zeros), +/// even if an error occurs and we don't finish building the archive. +/// We'd rather abort writing the tarball immediately than construct +/// a seemingly valid but incomplete archive. This wrapper allows us +/// to swallow the end-of-archive marker that Builder::drop() emits, +/// without writing it to the underlying sink. +/// +struct AbortableWrite { + w: W, + aborted: bool, +} + +impl AbortableWrite { + pub fn new(w: W) -> Self { + AbortableWrite { w, aborted: false } + } + + pub fn abort(&mut self) { + self.aborted = true; + } +} + +impl Write for AbortableWrite +where + W: Write, +{ + fn write(&mut self, data: &[u8]) -> io::Result { + if self.aborted { + Ok(data.len()) + } else { + self.w.write(data) + } + } + fn flush(&mut self) -> io::Result<()> { + if self.aborted { + Ok(()) + } else { + self.w.flush() + } + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 190e38e341..ac90500b97 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -38,7 +38,6 @@ fn version() -> String { } fn main() -> anyhow::Result<()> { - metrics::set_common_metrics_prefix("pageserver"); let arg_matches = App::new("Zenith page server") .about("Materializes WAL stream to pages and serves them to the postgres") .version(&*version()) @@ -98,6 +97,8 @@ fn main() -> anyhow::Result<()> { let features: &[&str] = &[ #[cfg(feature = "failpoints")] "failpoints", + #[cfg(feature = "profiling")] + "profiling", ]; println!("{{\"features\": {features:?} }}"); return Ok(()); @@ -183,13 +184,8 @@ fn main() -> anyhow::Result<()> { // as a ref. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - // If failpoints are used, terminate the whole pageserver process if they are hit. + // Initialize up failpoints support let scenario = FailScenario::setup(); - if fail::has_failpoints() { - std::panic::set_hook(Box::new(|_| { - std::process::exit(1); - })); - } // Basic initialization of things that don't change after startup virtual_file::init(conf.max_file_descriptors); @@ -258,7 +254,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // Otherwise, the coverage data will be damaged. match daemonize.exit_action(|| exit_now(0)).start() { Ok(_) => info!("Success, daemonized"), - Err(err) => error!(%err, "could not daemonize"), + Err(err) => bail!("{err}. could not daemonize. bailing."), } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ab5a2ec2be..a9516a6237 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,17 +5,18 @@ //! See also `settings.md` for better description on every parameter. use anyhow::{anyhow, bail, ensure, Context, Result}; -use remote_storage::{RemoteStorageConfig, RemoteStorageKind, S3Config}; +use remote_storage::RemoteStorageConfig; use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; + use std::path::{Path, PathBuf}; use std::str::FromStr; use std::time::Duration; use toml_edit; use toml_edit::{Document, Item}; +use url::Url; use utils::{ postgres_backend::AuthType, - zid::{ZNodeId, ZTenantId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::layered_repository::TIMELINES_SEGMENT_NAME; @@ -83,7 +84,7 @@ pub mod defaults { pub struct PageServerConf { // Identifier of that particular pageserver so e g safekeepers // can safely distinguish different pageservers - pub id: ZNodeId, + pub id: NodeId, /// Example (default): 127.0.0.1:64000 pub listen_pg_addr: String, @@ -117,6 +118,13 @@ pub struct PageServerConf { pub profiling: ProfilingConfig, pub default_tenant_conf: TenantConf, + + /// A prefix to add in etcd brokers before every key. + /// Can be used for isolating different pageserver groups withing the same etcd cluster. + pub broker_etcd_prefix: String, + + /// Etcd broker endpoints to connect to. + pub broker_endpoints: Vec, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -178,9 +186,11 @@ struct PageServerConfigBuilder { auth_validation_public_key_path: BuilderValue>, remote_storage_config: BuilderValue>, - id: BuilderValue, + id: BuilderValue, profiling: BuilderValue, + broker_etcd_prefix: BuilderValue, + broker_endpoints: BuilderValue>, } impl Default for PageServerConfigBuilder { @@ -206,6 +216,8 @@ impl Default for PageServerConfigBuilder { remote_storage_config: Set(None), id: NotSet, profiling: Set(ProfilingConfig::Disabled), + broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()), + broker_endpoints: Set(Vec::new()), } } } @@ -262,7 +274,15 @@ impl PageServerConfigBuilder { self.remote_storage_config = BuilderValue::Set(remote_storage_config) } - pub fn id(&mut self, node_id: ZNodeId) { + pub fn broker_endpoints(&mut self, broker_endpoints: Vec) { + self.broker_endpoints = BuilderValue::Set(broker_endpoints) + } + + pub fn broker_etcd_prefix(&mut self, broker_etcd_prefix: String) { + self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix) + } + + pub fn id(&mut self, node_id: NodeId) { self.id = BuilderValue::Set(node_id) } @@ -270,7 +290,11 @@ impl PageServerConfigBuilder { self.profiling = BuilderValue::Set(profiling) } - pub fn build(self) -> Result { + pub fn build(self) -> anyhow::Result { + let broker_endpoints = self + .broker_endpoints + .ok_or(anyhow!("No broker endpoints provided"))?; + Ok(PageServerConf { listen_pg_addr: self .listen_pg_addr @@ -306,6 +330,10 @@ impl PageServerConfigBuilder { profiling: self.profiling.ok_or(anyhow!("missing profiling"))?, // TenantConf is handled separately default_tenant_conf: TenantConf::default(), + broker_endpoints, + broker_etcd_prefix: self + .broker_etcd_prefix + .ok_or(anyhow!("missing broker_etcd_prefix"))?, }) } } @@ -347,7 +375,7 @@ impl PageServerConf { /// validating the input and failing on errors. /// /// This leaves any options not present in the file in the built-in defaults. - pub fn parse_and_validate(toml: &Document, workdir: &Path) -> Result { + pub fn parse_and_validate(toml: &Document, workdir: &Path) -> anyhow::Result { let mut builder = PageServerConfigBuilder::default(); builder.workdir(workdir.to_owned()); @@ -372,13 +400,24 @@ impl PageServerConf { )), "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { - builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?)) + builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item)?)) } "tenant_config" => { t_conf = Self::parse_toml_tenant_conf(item)?; } - "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)), + "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), "profiling" => builder.profiling(parse_toml_from_str(key, item)?), + "broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?), + "broker_endpoints" => builder.broker_endpoints( + parse_toml_array(key, item)? + .into_iter() + .map(|endpoint_str| { + endpoint_str.parse::().with_context(|| { + format!("Array item {endpoint_str} for key {key} is not a valid url endpoint") + }) + }) + .collect::>()?, + ), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -451,64 +490,6 @@ impl PageServerConf { Ok(t_conf) } - /// subroutine of parse_config(), to parse the `[remote_storage]` table. - fn parse_remote_storage_config(toml: &toml_edit::Item) -> anyhow::Result { - let local_path = toml.get("local_path"); - let bucket_name = toml.get("bucket_name"); - let bucket_region = toml.get("bucket_region"); - - let max_concurrent_syncs = NonZeroUsize::new( - parse_optional_integer("max_concurrent_syncs", toml)? - .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS), - ) - .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?; - - let max_sync_errors = NonZeroU32::new( - parse_optional_integer("max_sync_errors", toml)? - .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), - ) - .context("Failed to parse 'max_sync_errors' as a positive integer")?; - - let concurrency_limit = NonZeroUsize::new( - parse_optional_integer("concurrency_limit", toml)? - .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), - ) - .context("Failed to parse 'concurrency_limit' as a positive integer")?; - - let storage = match (local_path, bucket_name, bucket_region) { - (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"), - (_, Some(_), None) => { - bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") - } - (_, None, Some(_)) => { - bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") - } - (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { - bucket_name: parse_toml_string("bucket_name", bucket_name)?, - bucket_region: parse_toml_string("bucket_region", bucket_region)?, - prefix_in_bucket: toml - .get("prefix_in_bucket") - .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket)) - .transpose()?, - endpoint: toml - .get("endpoint") - .map(|endpoint| parse_toml_string("endpoint", endpoint)) - .transpose()?, - concurrency_limit, - }), - (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( - parse_toml_string("local_path", local_path)?, - )), - (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"), - }; - - Ok(RemoteStorageConfig { - max_concurrent_syncs, - max_sync_errors, - storage, - }) - } - #[cfg(test)] pub fn test_repo_dir(test_name: &str) -> PathBuf { PathBuf::from(format!("../tmp_check/test_{test_name}")) @@ -517,7 +498,7 @@ impl PageServerConf { #[cfg(test)] pub fn dummy_conf(repo_dir: PathBuf) -> Self { PageServerConf { - id: ZNodeId(0), + id: NodeId(0), wait_lsn_timeout: Duration::from_secs(60), wal_redo_timeout: Duration::from_secs(60), page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, @@ -532,6 +513,8 @@ impl PageServerConf { remote_storage_config: None, profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::dummy_conf(), + broker_endpoints: Vec::new(), + broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), } } } @@ -557,23 +540,6 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result { Ok(i as u64) } -fn parse_optional_integer(name: &str, item: &toml_edit::Item) -> anyhow::Result> -where - I: TryFrom, - E: std::error::Error + Send + Sync + 'static, -{ - let toml_integer = match item.get(name) { - Some(item) => item - .as_integer() - .with_context(|| format!("configure option {name} is not an integer"))?, - None => return Ok(None), - }; - - I::try_from(toml_integer) - .map(Some) - .with_context(|| format!("configure option {name} is too large")) -} - fn parse_toml_duration(name: &str, item: &Item) -> Result { let s = item .as_str() @@ -582,20 +548,46 @@ fn parse_toml_duration(name: &str, item: &Item) -> Result { Ok(humantime::parse_duration(s)?) } -fn parse_toml_from_str(name: &str, item: &Item) -> Result +fn parse_toml_from_str(name: &str, item: &Item) -> anyhow::Result where - T: FromStr, + T: FromStr, + ::Err: std::fmt::Display, { let v = item .as_str() .with_context(|| format!("configure option {name} is not a string"))?; - T::from_str(v) + T::from_str(v).map_err(|e| { + anyhow!( + "Failed to parse string as {parse_type} for configure option {name}: {e}", + parse_type = stringify!(T) + ) + }) +} + +fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result> { + let array = item + .as_array() + .with_context(|| format!("configure option {name} is not an array"))?; + + array + .iter() + .map(|value| { + value + .as_str() + .map(str::to_string) + .with_context(|| format!("Array item {value:?} for key {name} is not a string")) + }) + .collect() } #[cfg(test)] mod tests { - use std::fs; + use std::{ + fs, + num::{NonZeroU32, NonZeroUsize}, + }; + use remote_storage::{RemoteStorageKind, S3Config}; use tempfile::{tempdir, TempDir}; use super::*; @@ -622,17 +614,21 @@ id = 10 fn parse_defaults() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - // we have to create dummy pathes to overcome the validation errors - let config_string = format!("pg_distrib_dir='{}'\nid=10", pg_distrib_dir.display()); + let broker_endpoint = "http://127.0.0.1:7777"; + // we have to create dummy values to overcome the validation errors + let config_string = format!( + "pg_distrib_dir='{}'\nid=10\nbroker_endpoints = ['{broker_endpoint}']", + pg_distrib_dir.display() + ); let toml = config_string.parse()?; let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}")); + .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( parsed_config, PageServerConf { - id: ZNodeId(10), + id: NodeId(10), listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, @@ -647,6 +643,10 @@ id = 10 remote_storage_config: None, profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), + broker_endpoints: vec![broker_endpoint + .parse() + .expect("Failed to parse a valid broker endpoint URL")], + broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), }, "Correct defaults should be used when no config values are provided" ); @@ -658,20 +658,21 @@ id = 10 fn parse_basic_config() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; + let broker_endpoint = "http://127.0.0.1:7777"; let config_string = format!( - "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'", + "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoints = ['{broker_endpoint}']", pg_distrib_dir.display() ); let toml = config_string.parse()?; let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}")); + .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( parsed_config, PageServerConf { - id: ZNodeId(10), + id: NodeId(10), listen_pg_addr: "127.0.0.1:64000".to_string(), listen_http_addr: "127.0.0.1:9898".to_string(), wait_lsn_timeout: Duration::from_secs(111), @@ -686,6 +687,10 @@ id = 10 remote_storage_config: None, profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), + broker_endpoints: vec![broker_endpoint + .parse() + .expect("Failed to parse a valid broker endpoint URL")], + broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), }, "Should be able to parse all basic config values correctly" ); @@ -697,6 +702,7 @@ id = 10 fn parse_remote_fs_storage_config() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; + let broker_endpoint = "http://127.0.0.1:7777"; let local_storage_path = tempdir.path().join("local_remote_storage"); @@ -716,6 +722,7 @@ local_path = '{}'"#, let config_string = format!( r#"{ALL_BASE_VALUES_TOML} pg_distrib_dir='{}' +broker_endpoints = ['{broker_endpoint}'] {remote_storage_config_str}"#, pg_distrib_dir.display(), @@ -724,7 +731,9 @@ pg_distrib_dir='{}' let toml = config_string.parse()?; let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}")) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) .remote_storage_config .expect("Should have remote storage config for the local FS"); @@ -734,7 +743,7 @@ pg_distrib_dir='{}' max_concurrent_syncs: NonZeroUsize::new( remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS ) - .unwrap(), + .unwrap(), max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS) .unwrap(), storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), @@ -757,6 +766,7 @@ pg_distrib_dir='{}' let max_concurrent_syncs = NonZeroUsize::new(111).unwrap(); let max_sync_errors = NonZeroU32::new(222).unwrap(); let s3_concurrency_limit = NonZeroUsize::new(333).unwrap(); + let broker_endpoint = "http://127.0.0.1:7777"; let identical_toml_declarations = &[ format!( @@ -779,6 +789,7 @@ concurrency_limit = {s3_concurrency_limit}"# let config_string = format!( r#"{ALL_BASE_VALUES_TOML} pg_distrib_dir='{}' +broker_endpoints = ['{broker_endpoint}'] {remote_storage_config_str}"#, pg_distrib_dir.display(), @@ -787,7 +798,9 @@ pg_distrib_dir='{}' let toml = config_string.parse()?; let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}")) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) .remote_storage_config .expect("Should have remote storage config for S3"); diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index e9aaa72416..e00ccda2a1 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use utils::{ lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTimelineId}, }; #[serde_as] @@ -42,7 +42,7 @@ pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId #[derive(Serialize)] pub struct StatusResponse { - pub id: ZNodeId, + pub id: NodeId, } impl TenantCreateRequest { diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 9932a2d08d..55f7b3c5a7 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -123,6 +123,53 @@ paths: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + get: + description: Get wal receiver's data attached to the timeline + responses: + "200": + description: WalReceiverEntry + content: + application/json: + schema: + $ref: "#/components/schemas/WalReceiverEntry" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "404": + description: Error when no wal receiver is running or found + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline/{timeline_id}/attach: parameters: @@ -520,6 +567,21 @@ components: type: integer current_logical_size_non_incremental: type: integer + WalReceiverEntry: + type: object + required: + - thread_id + - wal_producer_connstr + properties: + thread_id: + type: integer + wal_producer_connstr: + type: string + last_received_msg_lsn: + type: string + format: hex + last_received_msg_ts: + type: integer Error: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0104df826e..bb650a34ed 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -224,6 +224,30 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result, ApiError> { + let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + + let wal_receiver = tokio::task::spawn_blocking(move || { + let _enter = + info_span!("wal_receiver_get", tenant = %tenant_id, timeline = %timeline_id).entered(); + + crate::walreceiver::get_wal_receiver_entry(tenant_id, timeline_id) + }) + .await + .map_err(ApiError::from_err)? + .ok_or_else(|| { + ApiError::NotFound(format!( + "WAL receiver not found for tenant {} and timeline {}", + tenant_id, timeline_id + )) + })?; + + json_response(StatusCode::OK, wal_receiver) +} + async fn timeline_attach_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -485,6 +509,10 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler, ) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver", + wal_receiver_get_handler, + ) .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/attach", timeline_attach_handler, diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index c7536cc959..0d7c6f54c8 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1230,7 +1230,7 @@ impl LayeredTimeline { }), disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), - last_freeze_at: AtomicLsn::new(0), + last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0), ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), @@ -1357,7 +1357,9 @@ impl LayeredTimeline { let mut timeline_owned; let mut timeline = self; - let mut path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); + // For debugging purposes, collect the path of layers that we traversed + // through. It's included in the error message if we fail to find the key. + let mut traversal_path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { *cached_lsn @@ -1387,32 +1389,24 @@ impl LayeredTimeline { if prev_lsn <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. - - // For debugging purposes, print the path of layers that we traversed - // through. - for (r, c, l) in path { - error!( - "PATH: result {:?}, cont_lsn {}, layer: {}", - r, - c, - l.filename().display() - ); - } - bail!("could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", - key, - Lsn(cont_lsn.0 - 1), - request_lsn, - timeline.ancestor_lsn) + return layer_traversal_error(format!( + "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", + key, + Lsn(cont_lsn.0 - 1), + request_lsn, + timeline.ancestor_lsn + ), traversal_path); } prev_lsn = cont_lsn; } ValueReconstructResult::Missing => { - bail!( - "could not find data for key {} at LSN {}, for request at LSN {}", - key, - cont_lsn, - request_lsn - ) + return layer_traversal_error( + format!( + "could not find data for key {} at LSN {}, for request at LSN {}", + key, cont_lsn, request_lsn + ), + traversal_path, + ); } } @@ -1447,7 +1441,7 @@ impl LayeredTimeline { reconstruct_state, )?; cont_lsn = lsn_floor; - path.push((result, cont_lsn, open_layer.clone())); + traversal_path.push((result, cont_lsn, open_layer.clone())); continue; } } @@ -1462,7 +1456,7 @@ impl LayeredTimeline { reconstruct_state, )?; cont_lsn = lsn_floor; - path.push((result, cont_lsn, frozen_layer.clone())); + traversal_path.push((result, cont_lsn, frozen_layer.clone())); continue 'outer; } } @@ -1477,7 +1471,7 @@ impl LayeredTimeline { reconstruct_state, )?; cont_lsn = lsn_floor; - path.push((result, cont_lsn, layer)); + traversal_path.push((result, cont_lsn, layer)); } else if timeline.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent result = ValueReconstructResult::Continue; @@ -1621,22 +1615,30 @@ impl LayeredTimeline { pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { let last_lsn = self.get_last_record_lsn(); + // Has more than 'checkpoint_distance' of WAL been accumulated? let distance = last_lsn.widening_sub(self.last_freeze_at.load()); if distance >= self.get_checkpoint_distance().into() { + // Yes. Freeze the current in-memory layer. self.freeze_inmem_layer(true); self.last_freeze_at.store(last_lsn); - } - if let Ok(guard) = self.layer_flush_lock.try_lock() { - drop(guard); - let self_clone = Arc::clone(self); - thread_mgr::spawn( - thread_mgr::ThreadKind::LayerFlushThread, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush thread", - false, - move || self_clone.flush_frozen_layers(false), - )?; + + // Launch a thread to flush the frozen layer to disk, unless + // a thread was already running. (If the thread was running + // at the time that we froze the layer, it must've seen the + // the layer we just froze before it exited; see comments + // in flush_frozen_layers()) + if let Ok(guard) = self.layer_flush_lock.try_lock() { + drop(guard); + let self_clone = Arc::clone(self); + thread_mgr::spawn( + thread_mgr::ThreadKind::LayerFlushThread, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush thread", + false, + move || self_clone.flush_frozen_layers(false), + )?; + } } Ok(()) } @@ -1944,41 +1946,87 @@ impl LayeredTimeline { Ok(new_path) } + /// + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as + /// as Level 1 files. + /// fn compact_level0(&self, target_file_size: u64) -> Result<()> { let layers = self.layers.read().unwrap(); - - let level0_deltas = layers.get_level0_deltas()?; - - // We compact or "shuffle" the level-0 delta layers when they've - // accumulated over the compaction threshold. - if level0_deltas.len() < self.get_compaction_threshold() { - return Ok(()); - } + let mut level0_deltas = layers.get_level0_deltas()?; drop(layers); - // FIXME: this function probably won't work correctly if there's overlap - // in the deltas. - let lsn_range = level0_deltas - .iter() - .map(|l| l.get_lsn_range()) - .reduce(|a, b| min(a.start, b.start)..max(a.end, b.end)) - .unwrap(); + // Only compact if enough layers have accumulated. + if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() { + return Ok(()); + } - let all_values_iter = level0_deltas.iter().map(|l| l.iter()).kmerge_by(|a, b| { - if let Ok((a_key, a_lsn, _)) = a { - if let Ok((b_key, b_lsn, _)) = b { - match a_key.cmp(b_key) { - Ordering::Less => true, - Ordering::Equal => a_lsn <= b_lsn, - Ordering::Greater => false, + // Gather the files to compact in this iteration. + // + // Start with the oldest Level 0 delta file, and collect any other + // level 0 files that form a contiguous sequence, such that the end + // LSN of previous file matches the start LSN of the next file. + // + // Note that if the files don't form such a sequence, we might + // "compact" just a single file. That's a bit pointless, but it allows + // us to get rid of the level 0 file, and compact the other files on + // the next iteration. This could probably made smarter, but such + // "gaps" in the sequence of level 0 files should only happen in case + // of a crash, partial download from cloud storage, or something like + // that, so it's not a big deal in practice. + level0_deltas.sort_by_key(|l| l.get_lsn_range().start); + let mut level0_deltas_iter = level0_deltas.iter(); + + let first_level0_delta = level0_deltas_iter.next().unwrap(); + let mut prev_lsn_end = first_level0_delta.get_lsn_range().end; + let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)]; + for l in level0_deltas_iter { + let lsn_range = l.get_lsn_range(); + + if lsn_range.start != prev_lsn_end { + break; + } + deltas_to_compact.push(Arc::clone(l)); + prev_lsn_end = lsn_range.end; + } + let lsn_range = Range { + start: deltas_to_compact.first().unwrap().get_lsn_range().start, + end: deltas_to_compact.last().unwrap().get_lsn_range().end, + }; + + info!( + "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", + lsn_range.start, + lsn_range.end, + deltas_to_compact.len(), + level0_deltas.len() + ); + for l in deltas_to_compact.iter() { + info!("compact includes {}", l.filename().display()); + } + // We don't need the original list of layers anymore. Drop it so that + // we don't accidentally use it later in the function. + drop(level0_deltas); + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. + let all_values_iter = deltas_to_compact + .iter() + .map(|l| l.iter()) + .kmerge_by(|a, b| { + if let Ok((a_key, a_lsn, _)) = a { + if let Ok((b_key, b_lsn, _)) = b { + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + } else { + false } } else { - false + true } - } else { - true - } - }); + }); // Merge the contents of all the input delta layers into a new set // of delta layers, based on the current partitioning. @@ -2044,8 +2092,8 @@ impl LayeredTimeline { // Now that we have reshuffled the data to set of new delta layers, we can // delete the old ones - let mut layer_paths_do_delete = HashSet::with_capacity(level0_deltas.len()); - for l in level0_deltas { + let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len()); + for l in deltas_to_compact { l.delete()?; if let Some(path) = l.local_path() { layer_paths_do_delete.insert(path); @@ -2117,7 +2165,7 @@ impl LayeredTimeline { let gc_info = self.gc_info.read().unwrap(); let retain_lsns = &gc_info.retain_lsns; - let cutoff = gc_info.cutoff; + let cutoff = min(gc_info.cutoff, disk_consistent_lsn); let pitr = gc_info.pitr; // Calculate pitr cutoff point. @@ -2246,12 +2294,20 @@ impl LayeredTimeline { // is 102, then it might not have been fully flushed to disk // before crash. // - // FIXME: This logic is wrong. See https://github.com/zenithdb/zenith/issues/707 - if !layers.newer_image_layer_exists( - &l.get_key_range(), - l.get_lsn_range().end, - disk_consistent_lsn + 1, - )? { + // For example, imagine that the following layers exist: + // + // 1000 - image (A) + // 1000-2000 - delta (B) + // 2000 - image (C) + // 2000-3000 - delta (D) + // 3000 - image (E) + // + // If GC horizon is at 2500, we can remove layers A and B, but + // we cannot remove C, even though it's older than 2500, because + // the delta layer 2000-3000 depends on it. + if !layers + .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? + { debug!( "keeping {} because it is the latest layer", l.filename().display() @@ -2367,6 +2423,32 @@ impl LayeredTimeline { } } +/// Helper function for get_reconstruct_data() to add the path of layers traversed +/// to an error, as anyhow context information. +fn layer_traversal_error( + msg: String, + path: Vec<(ValueReconstructResult, Lsn, Arc)>, +) -> anyhow::Result<()> { + // We want the original 'msg' to be the outermost context. The outermost context + // is the most high-level information, which also gets propagated to the client. + let mut msg_iter = path + .iter() + .map(|(r, c, l)| { + format!( + "layer traversal: result {:?}, cont_lsn {}, layer: {}", + r, + c, + l.filename().display() + ) + }) + .chain(std::iter::once(msg)); + // Construct initial message from the first traversed layer + let err = anyhow!(msg_iter.next().unwrap()); + + // Append all subsequent traversals, and the error message 'msg', as contexts. + Err(msg_iter.fold(err, |err, msg| err.context(msg))) +} + struct LayeredTimelineWriter<'a> { tl: &'a LayeredTimeline, _write_guard: MutexGuard<'a, ()>, @@ -2436,7 +2518,7 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } -fn load_metadata( +pub fn load_metadata( conf: &'static PageServerConf, timeline_id: ZTimelineId, tenant_id: ZTenantId, diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 28071e0caa..9179beb8c8 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -56,6 +56,7 @@ use crate::virtual_file::VirtualFile; use crate::walrecord; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; +use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs; use std::io::{BufWriter, Write}; @@ -532,6 +533,28 @@ impl DeltaLayer { } } + fn temp_path_for( + conf: &PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, + key_start: Key, + lsn_range: &Range, + ) -> PathBuf { + let rand_string: String = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(8) + .map(char::from) + .collect(); + + conf.timeline_path(&timelineid, &tenantid).join(format!( + "{}-XXX__{:016X}-{:016X}.{}.temp", + key_start, + u64::from(lsn_range.start), + u64::from(lsn_range.end), + rand_string + )) + } + /// /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. @@ -746,12 +769,8 @@ impl DeltaLayerWriter { // // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? - let path = conf.timeline_path(&timelineid, &tenantid).join(format!( - "{}-XXX__{:016X}-{:016X}.temp", - key_start, - u64::from(lsn_range.start), - u64::from(lsn_range.end) - )); + let path = DeltaLayer::temp_path_for(conf, timelineid, tenantid, key_start, &lsn_range); + let mut file = VirtualFile::create(&path)?; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64))?; @@ -960,6 +979,8 @@ impl DeltaLayerWriter { }), }; + // fsync the file + file.sync_all()?; // Rename the file to its final name // // Note: This overwrites any existing file. There shouldn't be any. diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/layered_repository/disk_btree.rs index e747192d96..0c9ad75048 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/layered_repository/disk_btree.rs @@ -444,6 +444,13 @@ where /// /// stack[0] is the current root page, stack.last() is the leaf. /// + /// We maintain the length of the stack to be always greater than zero. + /// Two exceptions are: + /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one. + /// So because other methods cannot see the intermediate state invariant still holds. + /// 2. `Self::finish`. It consumes self and does not return it back, + /// which means that this is where the structure is destroyed. + /// Thus stack of zero length cannot be observed by other methods. stack: Vec>, /// Last key that was appended to the tree. Used to sanity check that append @@ -482,7 +489,10 @@ where fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<()> { // Try to append to the current leaf buffer - let last = self.stack.last_mut().unwrap(); + let last = self + .stack + .last_mut() + .expect("should always have at least one item"); let level = last.level; if last.push(key, value) { return Ok(()); @@ -512,19 +522,25 @@ where Ok(()) } + /// Flush the bottommost node in the stack to disk. Appends a downlink to its parent, + /// and recursively flushes the parent too, if it becomes full. If the root page becomes full, + /// creates a new root page, increasing the height of the tree. fn flush_node(&mut self) -> Result<()> { - let last = self.stack.pop().unwrap(); + // Get the current bottommost node in the stack and flush it to disk. + let last = self + .stack + .pop() + .expect("should always have at least one item"); let buf = last.pack(); let downlink_key = last.first_key(); let downlink_ptr = self.writer.write_blk(buf)?; - // Append the downlink to the parent + // Append the downlink to the parent. If there is no parent, ie. this was the root page, + // create a new root page, increasing the height of the tree. if self.stack.is_empty() { self.stack.push(BuildNode::new(last.level + 1)); } - self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr))?; - - Ok(()) + self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr)) } /// @@ -540,7 +556,10 @@ where self.flush_node()?; } - let root = self.stack.first().unwrap(); + let root = self + .stack + .first() + .expect("by the check above we left one item there"); let buf = root.pack(); let root_blknum = self.writer.write_blk(buf)?; diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index b723e53345..33d38d1ba0 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -39,6 +39,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use hex; +use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs; use std::io::Write; @@ -305,6 +306,22 @@ impl ImageLayer { } } + fn temp_path_for( + conf: &PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, + fname: &ImageFileName, + ) -> PathBuf { + let rand_string: String = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(8) + .map(char::from) + .collect(); + + conf.timeline_path(&timelineid, &tenantid) + .join(format!("{}.{}.temp", fname, rand_string)) + } + /// /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. @@ -462,7 +479,7 @@ impl ImageLayer { /// pub struct ImageLayerWriter { conf: &'static PageServerConf, - _path: PathBuf, + path: PathBuf, timelineid: ZTimelineId, tenantid: ZTenantId, key_range: Range, @@ -482,12 +499,10 @@ impl ImageLayerWriter { key_range: &Range, lsn: Lsn, ) -> anyhow::Result { - // Create the file - // - // Note: This overwrites any existing file. There shouldn't be any. - // FIXME: throw an error instead? - let path = ImageLayer::path_for( - &PathOrConf::Conf(conf), + // Create the file initially with a temporary filename. + // We'll atomically rename it to the final name when we're done. + let path = ImageLayer::temp_path_for( + conf, timelineid, tenantid, &ImageFileName { @@ -513,7 +528,7 @@ impl ImageLayerWriter { let writer = ImageLayerWriter { conf, - _path: path, + path, timelineid, tenantid, key_range: key_range.clone(), @@ -611,6 +626,25 @@ impl ImageLayerWriter { index_root_blk, }), }; + + // fsync the file + file.sync_all()?; + + // Rename the file to its final name + // + // Note: This overwrites any existing file. There shouldn't be any. + // FIXME: throw an error instead? + let final_path = ImageLayer::path_for( + &PathOrConf::Conf(self.conf), + self.timelineid, + self.tenantid, + &ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn, + }, + ); + std::fs::rename(self.path, &final_path)?; + trace!("created image layer {}", layer.path().display()); Ok(layer) diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 7491294c03..f7f51bf21f 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -201,18 +201,14 @@ impl LayerMap { NUM_ONDISK_LAYERS.dec(); } - /// Is there a newer image layer for given key-range? + /// Is there a newer image layer for given key- and LSN-range? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. - /// We ignore layers newer than disk_consistent_lsn because they will be removed at restart - /// We also only look at historic layers - //#[allow(dead_code)] - pub fn newer_image_layer_exists( + pub fn image_layer_exists( &self, key_range: &Range, - lsn: Lsn, - disk_consistent_lsn: Lsn, + lsn_range: &Range, ) -> Result { let mut range_remain = key_range.clone(); @@ -225,8 +221,7 @@ impl LayerMap { let img_lsn = l.get_lsn_range().start; if !l.is_incremental() && l.get_key_range().contains(&range_remain.start) - && img_lsn > lsn - && img_lsn < disk_consistent_lsn + && lsn_range.contains(&img_lsn) { made_progress = true; let img_key_end = l.get_key_range().end; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 28d6bf2621..1c07b63072 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -305,7 +305,29 @@ fn page_service_conn_main( let mut conn_handler = PageServerHandler::new(conf, auth); let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?; - pgbackend.run(&mut conn_handler) + match pgbackend.run(&mut conn_handler) { + Ok(()) => { + // we've been requested to shut down + Ok(()) + } + Err(err) => { + let root_cause_io_err_kind = err + .root_cause() + .downcast_ref::() + .map(|e| e.kind()); + + // `ConnectionReset` error happens when the Postgres client closes the connection. + // As this disconnection happens quite often and is expected, + // we decided to downgrade the logging level to `INFO`. + // See: https://github.com/neondatabase/neon/issues/1683. + if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) { + info!("Postgres client disconnected"); + Ok(()) + } else { + Err(err) + } + } + } } #[derive(Debug)] @@ -593,7 +615,8 @@ impl PageServerHandler { /* Send a tarball of the latest layer on the timeline */ { let mut writer = CopyDataSink { pgb }; - let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?; + + let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?; span.record("lsn", &basebackup.lsn.to_string().as_str()); basebackup.send_tarball()?; } @@ -730,7 +753,18 @@ impl postgres_backend::Handler for PageServerHandler { for failpoint in failpoints.split(';') { if let Some((name, actions)) = failpoint.split_once('=') { info!("cfg failpoint: {} {}", name, actions); - fail::cfg(name, actions).unwrap(); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + if actions == "exit" { + fail::cfg_callback(name, || { + info!("Exit requested by failpoint"); + std::process::exit(1); + }) + .unwrap(); + } else { + fail::cfg(name, actions).unwrap(); + } } else { bail!("Invalid failpoints format"); } diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 7755e67c8d..bbebcd1f36 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -9,7 +9,7 @@ //! //! * public API via to interact with the external world: //! * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization -//! * [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] to enqueue a new upload and download tasks, +//! * [`schedule_layer_upload`], [`schedule_layer_download`], and[`schedule_layer_delete`] to enqueue a new task //! to be processed by the async loop //! //! Here's a schematic overview of all interactions backup and the rest of the pageserver perform: @@ -44,8 +44,8 @@ //! query their downloads later if they are accessed. //! //! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata. -//! If the storage sync loop was successfully started before, pageserver schedules the new checkpoint file uploads after every checkpoint. -//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either). +//! If the storage sync loop was successfully started before, pageserver schedules the layer files and the updated metadata file for upload, every time a layer is flushed to disk. +//! The uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either). //! See [`crate::layered_repository`] for the upload calls and the adjacent logic. //! //! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`], @@ -54,7 +54,7 @@ //! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future //! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory //! -//! When the pageserver terminates, the sync loop finishes a current sync task (if any) and exits. +//! When the pageserver terminates, the sync loop finishes current sync task (if any) and exits. //! //! The storage logic considers `image` as a set of local files (layers), fully representing a certain timeline at given moment (identified with `disk_consistent_lsn` from the corresponding `metadata` file). //! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed @@ -66,13 +66,13 @@ //! when the newer image is downloaded //! //! Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure. -//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexShard`], containing the list of remote files. +//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexPart`], containing the list of remote files. //! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download. //! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`], //! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its shard contents, if needed, same as any layer files. //! //! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed. -//! Bulk index data download happens only initially, on pageserer startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only, +//! Bulk index data download happens only initially, on pageserver startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only, //! when a new timeline is scheduled for the download. //! //! NOTES: @@ -89,13 +89,12 @@ //! Synchronization is done with the queue being emptied via separate thread asynchronously, //! attempting to fully store pageserver's local data on the remote storage in a custom format, beneficial for storing. //! -//! A queue is implemented in the [`sync_queue`] module as a pair of sender and receiver channels, to block on zero tasks instead of checking the queue. -//! The pair's shared buffer of a fixed size serves as an implicit queue, holding [`SyncTask`] for local files upload/download operations. +//! A queue is implemented in the [`sync_queue`] module as a VecDeque to hold the tasks, and a condition variable for blocking when the queue is empty. //! //! The queue gets emptied by a single thread with the loop, that polls the tasks in batches of deduplicated tasks. //! A task from the batch corresponds to a single timeline, with its files to sync merged together: given that only one task sync loop step is active at a time, //! timeline uploads and downloads can happen concurrently, in no particular order due to incremental nature of the timeline layers. -//! Deletion happens only after a successful upload only, otherwise the compation output might make the timeline inconsistent until both tasks are fully processed without errors. +//! Deletion happens only after a successful upload only, otherwise the compaction output might make the timeline inconsistent until both tasks are fully processed without errors. //! Upload and download update the remote data (inmemory index and S3 json index part file) only after every layer is successfully synchronized, while the deletion task //! does otherwise: it requires to have the remote data updated first succesfully: blob files will be invisible to pageserver this way. //! @@ -138,8 +137,6 @@ //! NOTE: No real contents or checksum check happens right now and is a subject to improve later. //! //! After the whole timeline is downloaded, [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function is used to update pageserver memory stage for the timeline processed. -//! -//! When pageserver signals shutdown, current sync task gets finished and the loop exists. mod delete; mod download; @@ -153,10 +150,7 @@ use std::{ num::{NonZeroU32, NonZeroUsize}, ops::ControlFlow, path::{Path, PathBuf}, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, - }, + sync::{Arc, Condvar, Mutex}, }; use anyhow::{anyhow, bail, Context}; @@ -167,7 +161,6 @@ use remote_storage::{GenericRemoteStorage, RemoteStorage}; use tokio::{ fs, runtime::Runtime, - sync::mpsc::{self, error::TryRecvError, UnboundedReceiver, UnboundedSender}, time::{Duration, Instant}, }; use tracing::*; @@ -428,6 +421,14 @@ fn collect_timeline_files( entry_path.display() ) })?; + } else if entry_path.extension().and_then(OsStr::to_str) == Some("temp") { + info!("removing temp layer file at {}", entry_path.display()); + std::fs::remove_file(&entry_path).with_context(|| { + format!( + "failed to remove temp layer file at {}", + entry_path.display() + ) + })?; } else { timeline_files.insert(entry_path); } @@ -453,97 +454,77 @@ fn collect_timeline_files( Ok((timeline_id, metadata, timeline_files)) } -/// Wraps mpsc channel bits around into a queue interface. -/// mpsc approach was picked to allow blocking the sync loop if no tasks are present, to avoid meaningless spinning. +/// Global queue of sync tasks. +/// +/// 'queue' is protected by a mutex, and 'condvar' is used to wait for tasks to arrive. struct SyncQueue { - len: AtomicUsize, max_timelines_per_batch: NonZeroUsize, - sender: UnboundedSender<(ZTenantTimelineId, SyncTask)>, + + queue: Mutex>, + condvar: Condvar, } impl SyncQueue { - fn new( - max_timelines_per_batch: NonZeroUsize, - ) -> (Self, UnboundedReceiver<(ZTenantTimelineId, SyncTask)>) { - let (sender, receiver) = mpsc::unbounded_channel(); - ( - Self { - len: AtomicUsize::new(0), - max_timelines_per_batch, - sender, - }, - receiver, - ) + fn new(max_timelines_per_batch: NonZeroUsize) -> Self { + Self { + max_timelines_per_batch, + queue: Mutex::new(VecDeque::new()), + condvar: Condvar::new(), + } } + /// Queue a new task fn push(&self, sync_id: ZTenantTimelineId, new_task: SyncTask) { - match self.sender.send((sync_id, new_task)) { - Ok(()) => { - self.len.fetch_add(1, Ordering::Relaxed); - } - Err(e) => { - error!("failed to push sync task to queue: {e}"); - } + let mut q = self.queue.lock().unwrap(); + + q.push_back((sync_id, new_task)); + if q.len() <= 1 { + self.condvar.notify_one(); } } /// Fetches a task batch, getting every existing entry from the queue, grouping by timelines and merging the tasks for every timeline. - /// A timeline has to care to not to delete cetain layers from the remote storage before the corresponding uploads happen. - /// Otherwise, due to "immutable" nature of the layers, the order of their deletion/uploading/downloading does not matter. + /// A timeline has to care to not to delete certain layers from the remote storage before the corresponding uploads happen. + /// Other than that, due to "immutable" nature of the layers, the order of their deletion/uploading/downloading does not matter. /// Hence, we merge the layers together into single task per timeline and run those concurrently (with the deletion happening only after successful uploading). - async fn next_task_batch( - &self, - // The queue is based on two ends of a channel and has to be accessible statically without blocking for submissions from the sync code. - // Its receiver needs &mut, so we cannot place it in the same container with the other end and get both static and non-blocking access. - // Hence toss this around to use it from the sync loop directly as &mut. - sync_queue_receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - ) -> HashMap { - // request the first task in blocking fashion to do less meaningless work - let (first_sync_id, first_task) = if let Some(first_task) = sync_queue_receiver.recv().await - { - self.len.fetch_sub(1, Ordering::Relaxed); - first_task - } else { - info!("Queue sender part was dropped, aborting"); - return HashMap::new(); - }; + fn next_task_batch(&self) -> (HashMap, usize) { + // Wait for the first task in blocking fashion + let mut q = self.queue.lock().unwrap(); + while q.is_empty() { + q = self + .condvar + .wait_timeout(q, Duration::from_millis(1000)) + .unwrap() + .0; + + if thread_mgr::is_shutdown_requested() { + return (HashMap::new(), q.len()); + } + } + let (first_sync_id, first_task) = q.pop_front().unwrap(); + let mut timelines_left_to_batch = self.max_timelines_per_batch.get() - 1; - let mut tasks_to_process = self.len(); + let tasks_to_process = q.len(); let mut batches = HashMap::with_capacity(tasks_to_process); batches.insert(first_sync_id, SyncTaskBatch::new(first_task)); let mut tasks_to_reenqueue = Vec::with_capacity(tasks_to_process); - // Pull the queue channel until we get all tasks that were there at the beginning of the batch construction. + // Greedily grab as many other tasks that we can. // Yet do not put all timelines in the batch, but only the first ones that fit the timeline limit. - // Still merge the rest of the pulled tasks and reenqueue those for later. - while tasks_to_process > 0 { - match sync_queue_receiver.try_recv() { - Ok((sync_id, new_task)) => { - self.len.fetch_sub(1, Ordering::Relaxed); - tasks_to_process -= 1; - - match batches.entry(sync_id) { - hash_map::Entry::Occupied(mut v) => v.get_mut().add(new_task), - hash_map::Entry::Vacant(v) => { - timelines_left_to_batch = timelines_left_to_batch.saturating_sub(1); - if timelines_left_to_batch == 0 { - tasks_to_reenqueue.push((sync_id, new_task)); - } else { - v.insert(SyncTaskBatch::new(new_task)); - } - } + // Re-enqueue the tasks that don't fit in this batch. + while let Some((sync_id, new_task)) = q.pop_front() { + match batches.entry(sync_id) { + hash_map::Entry::Occupied(mut v) => v.get_mut().add(new_task), + hash_map::Entry::Vacant(v) => { + timelines_left_to_batch = timelines_left_to_batch.saturating_sub(1); + if timelines_left_to_batch == 0 { + tasks_to_reenqueue.push((sync_id, new_task)); + } else { + v.insert(SyncTaskBatch::new(new_task)); } } - Err(TryRecvError::Disconnected) => { - debug!("Sender disconnected, batch collection aborted"); - break; - } - Err(TryRecvError::Empty) => { - debug!("No more data in the sync queue, task batch is not full"); - break; - } } } @@ -553,14 +534,15 @@ impl SyncQueue { tasks_to_reenqueue.len() ); for (id, task) in tasks_to_reenqueue { - self.push(id, task); + q.push_back((id, task)); } - batches + (batches, q.len()) } + #[cfg(test)] fn len(&self) -> usize { - self.len.load(Ordering::Relaxed) + self.queue.lock().unwrap().len() } } @@ -823,7 +805,7 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { debug!("Download task for tenant {tenant_id}, timeline {timeline_id} sent") } -/// Uses a remote storage given to start the storage sync loop. +/// Launch a thread to perform remote storage sync tasks. /// See module docs for loop step description. pub(super) fn spawn_storage_sync_thread( conf: &'static PageServerConf, @@ -836,7 +818,7 @@ where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { - let (sync_queue, sync_queue_receiver) = SyncQueue::new(max_concurrent_timelines_sync); + let sync_queue = SyncQueue::new(max_concurrent_timelines_sync); SYNC_QUEUE .set(sync_queue) .map_err(|_queue| anyhow!("Could not initialize sync queue"))?; @@ -864,7 +846,7 @@ where local_timeline_files, ); - let loop_index = remote_index.clone(); + let remote_index_clone = remote_index.clone(); thread_mgr::spawn( ThreadKind::StorageSync, None, @@ -875,12 +857,7 @@ where storage_sync_loop( runtime, conf, - ( - Arc::new(storage), - loop_index, - sync_queue, - sync_queue_receiver, - ), + (Arc::new(storage), remote_index_clone, sync_queue), max_sync_errors, ); Ok(()) @@ -896,12 +873,7 @@ where fn storage_sync_loop( runtime: Runtime, conf: &'static PageServerConf, - (storage, index, sync_queue, mut sync_queue_receiver): ( - Arc, - RemoteIndex, - &SyncQueue, - UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - ), + (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, ) where P: Debug + Send + Sync + 'static, @@ -909,16 +881,35 @@ fn storage_sync_loop( { info!("Starting remote storage sync loop"); loop { - let loop_index = index.clone(); let loop_storage = Arc::clone(&storage); + + let (batched_tasks, remaining_queue_length) = sync_queue.next_task_batch(); + + if thread_mgr::is_shutdown_requested() { + info!("Shutdown requested, stopping"); + break; + } + + REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); + if remaining_queue_length > 0 || !batched_tasks.is_empty() { + info!("Processing tasks for {} timelines in batch, more tasks left to process: {remaining_queue_length}", batched_tasks.len()); + } else { + debug!("No tasks to process"); + continue; + } + + // Concurrently perform all the tasks in the batch let loop_step = runtime.block_on(async { tokio::select! { - step = loop_step( + step = process_batches( conf, - (loop_storage, loop_index, sync_queue, &mut sync_queue_receiver), max_sync_errors, + loop_storage, + &index, + batched_tasks, + sync_queue, ) - .instrument(info_span!("storage_sync_loop_step")) => step, + .instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step), _ = thread_mgr::shutdown_watcher() => ControlFlow::Break(()), } }); @@ -944,31 +935,18 @@ fn storage_sync_loop( } } -async fn loop_step( +async fn process_batches( conf: &'static PageServerConf, - (storage, index, sync_queue, sync_queue_receiver): ( - Arc, - RemoteIndex, - &SyncQueue, - &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - ), max_sync_errors: NonZeroU32, -) -> ControlFlow<(), HashMap>> + storage: Arc, + index: &RemoteIndex, + batched_tasks: HashMap, + sync_queue: &SyncQueue, +) -> HashMap> where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { - let batched_tasks = sync_queue.next_task_batch(sync_queue_receiver).await; - - let remaining_queue_length = sync_queue.len(); - REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); - if remaining_queue_length > 0 || !batched_tasks.is_empty() { - info!("Processing tasks for {} timelines in batch, more tasks left to process: {remaining_queue_length}", batched_tasks.len()); - } else { - debug!("No tasks to process"); - return ControlFlow::Continue(HashMap::new()); - } - let mut sync_results = batched_tasks .into_iter() .map(|(sync_id, batch)| { @@ -993,6 +971,7 @@ where ZTenantId, HashMap, > = HashMap::new(); + while let Some((sync_id, state_update)) = sync_results.next().await { debug!("Finished storage sync task for sync id {sync_id}"); if let Some(state_update) = state_update { @@ -1003,7 +982,7 @@ where } } - ControlFlow::Continue(new_timeline_states) + new_timeline_states } async fn process_sync_task_batch( @@ -1376,7 +1355,6 @@ where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { - info!("Updating remote index for the timeline"); let updated_remote_timeline = { let mut index_accessor = index.write().await; @@ -1443,7 +1421,7 @@ where IndexPart::from_remote_timeline(&timeline_path, updated_remote_timeline) .context("Failed to create an index part from the updated remote timeline")?; - info!("Uploading remote data for the timeline"); + info!("Uploading remote index for the timeline"); upload_index_part(conf, storage, sync_id, new_index_part) .await .context("Failed to upload new index part") @@ -1685,7 +1663,7 @@ mod tests { #[tokio::test] async fn separate_task_ids_batch() { - let (sync_queue, mut sync_queue_receiver) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); assert_eq!(sync_queue.len(), 0); let sync_id_2 = ZTenantTimelineId { @@ -1720,7 +1698,7 @@ mod tests { let submitted_tasks_count = sync_queue.len(); assert_eq!(submitted_tasks_count, 3); - let mut batch = sync_queue.next_task_batch(&mut sync_queue_receiver).await; + let (mut batch, _) = sync_queue.next_task_batch(); assert_eq!( batch.len(), submitted_tasks_count, @@ -1746,7 +1724,7 @@ mod tests { #[tokio::test] async fn same_task_id_separate_tasks_batch() { - let (sync_queue, mut sync_queue_receiver) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); assert_eq!(sync_queue.len(), 0); let download = LayersDownload { @@ -1769,7 +1747,7 @@ mod tests { let submitted_tasks_count = sync_queue.len(); assert_eq!(submitted_tasks_count, 3); - let mut batch = sync_queue.next_task_batch(&mut sync_queue_receiver).await; + let (mut batch, _) = sync_queue.next_task_batch(); assert_eq!( batch.len(), 1, @@ -1801,7 +1779,7 @@ mod tests { #[tokio::test] async fn same_task_id_same_tasks_batch() { - let (sync_queue, mut sync_queue_receiver) = SyncQueue::new(NonZeroUsize::new(1).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(1).unwrap()); let download_1 = LayersDownload { layers_to_skip: HashSet::from([PathBuf::from("sk1")]), }; @@ -1823,11 +1801,11 @@ mod tests { sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_1.clone())); sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_2.clone())); - sync_queue.push(sync_id_2, SyncTask::download(download_3.clone())); + sync_queue.push(sync_id_2, SyncTask::download(download_3)); sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_4.clone())); assert_eq!(sync_queue.len(), 4); - let mut smallest_batch = sync_queue.next_task_batch(&mut sync_queue_receiver).await; + let (mut smallest_batch, _) = sync_queue.next_task_batch(); assert_eq!( smallest_batch.len(), 1, diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 047ad6c2be..91c618d201 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -119,7 +119,7 @@ mod tests { #[tokio::test] async fn delete_timeline_negative() -> anyhow::Result<()> { let harness = RepoHarness::create("delete_timeline_negative")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = LocalFs::new( tempdir()?.path().to_path_buf(), @@ -152,7 +152,7 @@ mod tests { #[tokio::test] async fn delete_timeline() -> anyhow::Result<()> { let harness = RepoHarness::create("delete_timeline")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "c", "d"]; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 98a0a0e2fc..a28867f27e 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -286,7 +286,7 @@ mod tests { #[tokio::test] async fn download_timeline() -> anyhow::Result<()> { let harness = RepoHarness::create("download_timeline")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; @@ -385,7 +385,7 @@ mod tests { #[tokio::test] async fn download_timeline_negatives() -> anyhow::Result<()> { let harness = RepoHarness::create("download_timeline_negatives")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index f9d606f2b8..625ec7aed6 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -240,7 +240,7 @@ mod tests { #[tokio::test] async fn regular_layer_upload() -> anyhow::Result<()> { let harness = RepoHarness::create("regular_layer_upload")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b"]; @@ -327,7 +327,7 @@ mod tests { #[tokio::test] async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> { let harness = RepoHarness::create("layer_upload_after_local_fs_update")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a1", "b1"]; diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 9bde9a5c4a..cc35d79d16 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -2,7 +2,7 @@ //! page server. use crate::config::PageServerConf; -use crate::layered_repository::LayeredRepository; +use crate::layered_repository::{load_metadata, LayeredRepository}; use crate::pgdatadir_mapping::DatadirTimeline; use crate::repository::{Repository, TimelineSyncStatusUpdate}; use crate::storage_sync::index::RemoteIndex; @@ -22,6 +22,7 @@ use std::collections::HashMap; use std::fmt; use std::sync::Arc; use tracing::*; +use utils::lsn::Lsn; use utils::zid::{ZTenantId, ZTimelineId}; @@ -281,6 +282,7 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> { false, move || crate::tenant_threads::gc_loop(tenant_id), ) + .map(|_thread_id| ()) // update the `Result::Ok` type to match the outer function's return signature .with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}")); if let Err(e) = &gc_spawn_result { @@ -326,8 +328,8 @@ pub fn get_local_timeline_with_load( return Ok(Arc::clone(page_tline)); } - let page_tline = new_local_timeline(&tenant.repo, timeline_id) - .with_context(|| format!("Failed to create new local timeline for tenant {tenant_id}"))?; + let page_tline = load_local_timeline(&tenant.repo, timeline_id) + .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?; tenant .local_timelines .insert(timeline_id, Arc::clone(&page_tline)); @@ -364,7 +366,7 @@ pub fn detach_timeline( Ok(()) } -fn new_local_timeline( +fn load_local_timeline( repo: &RepositoryImpl, timeline_id: ZTimelineId, ) -> anyhow::Result>> { @@ -398,6 +400,26 @@ pub fn list_tenants() -> Vec { .collect() } +/// Check if a given timeline is "broken" \[1\]. +/// The function returns an error if the timeline is "broken". +/// +/// \[1\]: it's not clear now how should we classify a timeline as broken. +/// A timeline is categorized as broken when any of following conditions is true: +/// - failed to load the timeline's metadata +/// - the timeline's disk consistent LSN is zero +fn check_broken_timeline(repo: &LayeredRepository, timeline_id: ZTimelineId) -> anyhow::Result<()> { + let metadata = load_metadata(repo.conf, timeline_id, repo.tenant_id()) + .context("failed to load metadata")?; + + // A timeline with zero disk consistent LSN can happen when the page server + // failed to checkpoint the timeline import data when creating that timeline. + if metadata.disk_consistent_lsn() == Lsn::INVALID { + bail!("Timeline {timeline_id} has a zero disk consistent LSN."); + } + + Ok(()) +} + fn init_local_repository( conf: &'static PageServerConf, tenant_id: ZTenantId, @@ -413,7 +435,13 @@ fn init_local_repository( match init_status { LocalTimelineInitStatus::LocallyComplete => { debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository"); - status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded); + if let Err(err) = check_broken_timeline(&repo, timeline_id) { + info!( + "Found a broken timeline {timeline_id} (err={err:?}), skip registering it in repository" + ); + } else { + status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded); + } } LocalTimelineInitStatus::NeedsSync => { debug!( @@ -457,8 +485,8 @@ fn apply_timeline_remote_sync_status_updates( bail!("Local timeline {timeline_id} already registered") } Entry::Vacant(v) => { - v.insert(new_local_timeline(repo, timeline_id).with_context(|| { - format!("Failed to register new local timeline for tenant {tenant_id}") + v.insert(load_local_timeline(repo, timeline_id).with_context(|| { + format!("Failed to register add local timeline for tenant {tenant_id}") })?); } }, diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index b908f220ee..473cddda58 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -139,7 +139,7 @@ pub fn spawn( name: &str, shutdown_process_on_error: bool, f: F, -) -> std::io::Result<()> +) -> std::io::Result where F: FnOnce() -> anyhow::Result<()> + Send + 'static, { @@ -193,7 +193,7 @@ where drop(jh_guard); // The thread is now running. Nothing more to do here - Ok(()) + Ok(thread_id) } /// This wrapper function runs in a newly-spawned thread. It initializes the diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 7cfd33c40b..9ab063107c 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -45,6 +45,8 @@ pub struct LocalTimelineInfo { #[serde_as(as = "Option")] pub prev_record_lsn: Option, #[serde_as(as = "DisplayFromStr")] + pub latest_gc_cutoff_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] pub disk_consistent_lsn: Lsn, pub current_logical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, @@ -68,6 +70,7 @@ impl LocalTimelineInfo { disk_consistent_lsn: datadir_tline.tline.get_disk_consistent_lsn(), last_record_lsn, prev_record_lsn: Some(datadir_tline.tline.get_prev_record_lsn()), + latest_gc_cutoff_lsn: *datadir_tline.tline.get_latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Loaded, current_logical_size: Some(datadir_tline.get_current_logical_size()), current_logical_size_non_incremental: if include_non_incremental_logical_size { @@ -91,6 +94,7 @@ impl LocalTimelineInfo { disk_consistent_lsn: metadata.disk_consistent_lsn(), last_record_lsn: metadata.disk_consistent_lsn(), prev_record_lsn: metadata.prev_record_lsn(), + latest_gc_cutoff_lsn: metadata.latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Unloaded, current_logical_size: None, current_logical_size_non_incremental: None, @@ -281,7 +285,9 @@ fn bootstrap_timeline( ) -> Result<()> { let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered(); - let initdb_path = conf.tenant_path(&tenantid).join("tmp"); + let initdb_path = conf + .tenant_path(&tenantid) + .join(format!("tmp-timeline-{}", tli)); // Init temporarily repo to get bootstrap data run_initdb(conf, &initdb_path)?; @@ -296,10 +302,15 @@ fn bootstrap_timeline( let timeline = repo.create_empty_timeline(tli, lsn)?; let mut page_tline: DatadirTimeline = DatadirTimeline::new(timeline, u64::MAX); import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?; + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + bail!("failpoint before-checkpoint-new-timeline"); + }); + page_tline.tline.checkpoint(CheckpointConfig::Forced)?; - println!( - "created initial timeline {} timeline.lsn {}", + info!( + "created root timeline {} timeline.lsn {}", tli, page_tline.tline.get_last_record_lsn() ); diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index b7a33364c9..b8f349af8f 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -18,6 +18,8 @@ use lazy_static::lazy_static; use postgres_ffi::waldecoder::*; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use std::cell::Cell; use std::collections::HashMap; use std::str::FromStr; @@ -35,11 +37,19 @@ use utils::{ zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; -// -// We keep one WAL Receiver active per timeline. -// -struct WalReceiverEntry { +/// +/// A WAL receiver's data stored inside the global `WAL_RECEIVERS`. +/// We keep one WAL receiver active per timeline. +/// +#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct WalReceiverEntry { + thread_id: u64, wal_producer_connstr: String, + #[serde_as(as = "Option")] + last_received_msg_lsn: Option, + /// the timestamp (in microseconds) of the last received message + last_received_msg_ts: Option, } lazy_static! { @@ -74,7 +84,7 @@ pub fn launch_wal_receiver( receiver.wal_producer_connstr = wal_producer_connstr.into(); } None => { - thread_mgr::spawn( + let thread_id = thread_mgr::spawn( ThreadKind::WalReceiver, Some(tenantid), Some(timelineid), @@ -88,7 +98,10 @@ pub fn launch_wal_receiver( )?; let receiver = WalReceiverEntry { + thread_id, wal_producer_connstr: wal_producer_connstr.into(), + last_received_msg_lsn: None, + last_received_msg_ts: None, }; receivers.insert((tenantid, timelineid), receiver); @@ -99,15 +112,13 @@ pub fn launch_wal_receiver( Ok(()) } -// Look up current WAL producer connection string in the hash table -fn get_wal_producer_connstr(tenantid: ZTenantId, timelineid: ZTimelineId) -> String { +/// Look up a WAL receiver's data in the global `WAL_RECEIVERS` +pub fn get_wal_receiver_entry( + tenant_id: ZTenantId, + timeline_id: ZTimelineId, +) -> Option { let receivers = WAL_RECEIVERS.lock().unwrap(); - - receivers - .get(&(tenantid, timelineid)) - .unwrap() - .wal_producer_connstr - .clone() + receivers.get(&(tenant_id, timeline_id)).cloned() } // @@ -118,7 +129,18 @@ fn thread_main(conf: &'static PageServerConf, tenant_id: ZTenantId, timeline_id: info!("WAL receiver thread started"); // Look up the current WAL producer address - let wal_producer_connstr = get_wal_producer_connstr(tenant_id, timeline_id); + let wal_producer_connstr = { + match get_wal_receiver_entry(tenant_id, timeline_id) { + Some(e) => e.wal_producer_connstr, + None => { + info!( + "Unable to create the WAL receiver thread: no WAL receiver entry found for tenant {} and timeline {}", + tenant_id, timeline_id + ); + return; + } + } + }; // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server, // and start streaming WAL from it. @@ -318,6 +340,28 @@ fn walreceiver_main( let apply_lsn = u64::from(timeline_remote_consistent_lsn); let ts = SystemTime::now(); + // Update the current WAL receiver's data stored inside the global hash table `WAL_RECEIVERS` + { + let mut receivers = WAL_RECEIVERS.lock().unwrap(); + let entry = match receivers.get_mut(&(tenant_id, timeline_id)) { + Some(e) => e, + None => { + anyhow::bail!( + "no WAL receiver entry found for tenant {} and timeline {}", + tenant_id, + timeline_id + ); + } + }; + + entry.last_received_msg_lsn = Some(last_lsn); + entry.last_received_msg_ts = Some( + ts.duration_since(SystemTime::UNIX_EPOCH) + .expect("Received message time should be before UNIX EPOCH!") + .as_micros(), + ); + } + // Send zenith feedback message. // Regular standby_status_update fields are put into this message. let zenith_status_update = ZenithFeedback { diff --git a/poetry.lock b/poetry.lock index a7cbe0aa3c..6e552d2cd3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -21,9 +21,6 @@ category = "main" optional = false python-versions = ">=3.6" -[package.dependencies] -typing-extensions = {version = ">=3.6.5", markers = "python_version < \"3.8\""} - [[package]] name = "asyncpg" version = "0.24.0" @@ -32,9 +29,6 @@ category = "main" optional = false python-versions = ">=3.6.0" -[package.dependencies] -typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} - [package.extras] dev = ["Cython (>=0.29.24,<0.30.0)", "pytest (>=6.0)", "Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "pycodestyle (>=2.7.0,<2.8.0)", "flake8 (>=3.9.2,<3.10.0)", "uvloop (>=0.15.3)"] docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)"] @@ -125,7 +119,6 @@ python-versions = ">=3.6" [package.dependencies] botocore-stubs = "*" -typing-extensions = {version = "*", markers = "python_version < \"3.9\""} [package.extras] accessanalyzer = ["mypy-boto3-accessanalyzer (>=1.20.0)"] @@ -454,9 +447,6 @@ category = "main" optional = false python-versions = ">=3.6" -[package.dependencies] -typing-extensions = {version = "*", markers = "python_version < \"3.9\""} - [[package]] name = "cached-property" version = "1.5.2" @@ -524,7 +514,6 @@ python-versions = ">=3.6" [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} [[package]] name = "colorama" @@ -605,7 +594,6 @@ optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" [package.dependencies] -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} mccabe = ">=0.6.0,<0.7.0" pycodestyle = ">=2.7.0,<2.8.0" pyflakes = ">=2.3.0,<2.4.0" @@ -664,23 +652,6 @@ category = "main" optional = false python-versions = ">=3.5" -[[package]] -name = "importlib-metadata" -version = "4.10.1" -description = "Read metadata from Python packages" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} -zipp = ">=0.5" - -[package.extras] -docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] -perf = ["ipython"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy", "importlib-resources (>=1.3)"] - [[package]] name = "iniconfig" version = "1.1.1" @@ -759,9 +730,6 @@ category = "main" optional = false python-versions = ">=2.7" -[package.dependencies] -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} - [package.extras] docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-black-multipy", "pytest-cov", "ecdsa", "feedparser", "numpy", "pandas", "pymongo", "scikit-learn", "sqlalchemy", "enum34", "jsonlib"] @@ -785,7 +753,6 @@ python-versions = "*" [package.dependencies] attrs = ">=17.4.0" -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} pyrsistent = ">=0.14.0" six = ">=1.11.0" @@ -822,7 +789,7 @@ python-versions = "*" [[package]] name = "moto" -version = "3.1.7" +version = "3.1.9" description = "A library that allows your python tests to easily mock out the boto library" category = "main" optional = false @@ -840,7 +807,6 @@ flask = {version = "*", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""} -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} Jinja2 = ">=2.10.1" jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} MarkupSafe = "!=2.0.0a1" @@ -868,6 +834,7 @@ ds = ["sshpubkeys (>=3.1.0)"] dynamodb = ["docker (>=2.5.1)"] dynamodb2 = ["docker (>=2.5.1)"] dynamodbstreams = ["docker (>=2.5.1)"] +ebs = ["sshpubkeys (>=3.1.0)"] ec2 = ["sshpubkeys (>=3.1.0)"] efs = ["sshpubkeys (>=3.1.0)"] glue = ["pyparsing (>=3.0.0)"] @@ -889,7 +856,6 @@ python-versions = ">=3.5" [package.dependencies] mypy-extensions = ">=0.4.3,<0.5.0" toml = "*" -typed-ast = {version = ">=1.4.0,<1.5.0", markers = "python_version < \"3.8\""} typing-extensions = ">=3.7.4" [package.extras] @@ -946,13 +912,21 @@ category = "main" optional = false python-versions = ">=3.6" -[package.dependencies] -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} - [package.extras] dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "prometheus-client" +version = "0.14.1" +description = "Python client for the Prometheus monitoring system." +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +twisted = ["twisted"] + [[package]] name = "psycopg2-binary" version = "2.9.3" @@ -1003,7 +977,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [[package]] name = "pyjwt" -version = "2.3.0" +version = "2.4.0" description = "JSON Web Token implementation in Python" category = "main" optional = false @@ -1049,7 +1023,6 @@ python-versions = ">=3.6" atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" @@ -1082,6 +1055,17 @@ python-versions = "*" [package.dependencies] pytest = ">=3.2.5" +[[package]] +name = "pytest-timeout" +version = "2.1.0" +description = "pytest plugin to abort hanging tests" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pytest = ">=5.0.0" + [[package]] name = "pytest-xdist" version = "2.5.0" @@ -1256,14 +1240,6 @@ category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" -[[package]] -name = "typed-ast" -version = "1.4.3" -description = "a fork of Python 2 and 3 ast modules with type comment support" -category = "dev" -optional = false -python-versions = "*" - [[package]] name = "types-psycopg2" version = "2.9.6" @@ -1360,22 +1336,10 @@ category = "dev" optional = false python-versions = "*" -[[package]] -name = "zipp" -version = "3.7.0" -description = "Backport of pathlib-compatible object wrapper for zip files" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.extras] -docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"] - [metadata] lock-version = "1.1" -python-versions = "^3.7" -content-hash = "dc63b6e02d0ceccdc4b5616e9362c149a27fdcc6c54fda63a3b115a5b980c42e" +python-versions = "^3.9" +content-hash = "be9c00bb5081535805824242fea2a03b2f82fa9466856d618e24b3140c7da6a0" [metadata.files] aiopg = [ @@ -1571,10 +1535,6 @@ idna = [ {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, ] -importlib-metadata = [ - {file = "importlib_metadata-4.10.1-py3-none-any.whl", hash = "sha256:899e2a40a8c4a1aec681feef45733de8a6c58f3f6a0dbed2eb6574b4387a77b6"}, - {file = "importlib_metadata-4.10.1.tar.gz", hash = "sha256:951f0d8a5b7260e9db5e41d429285b5f451e928479f19d80818878527d36e95e"}, -] iniconfig = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, @@ -1693,8 +1653,8 @@ mccabe = [ {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, ] moto = [ - {file = "moto-3.1.7-py3-none-any.whl", hash = "sha256:4ab6fb8dd150343e115d75e3dbdb5a8f850fc7236790819d7cef438c11ee6e89"}, - {file = "moto-3.1.7.tar.gz", hash = "sha256:20607a0fd0cf6530e05ffb623ca84d3f45d50bddbcec2a33705a0cf471e71289"}, + {file = "moto-3.1.9-py3-none-any.whl", hash = "sha256:8928ec168e5fd88b1127413b2fa570a80d45f25182cdad793edd208d07825269"}, + {file = "moto-3.1.9.tar.gz", hash = "sha256:ba683e70950b6579189bc12d74c1477aa036c090c6ad8b151a22f5896c005113"}, ] mypy = [ {file = "mypy-0.910-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457"}, @@ -1741,6 +1701,10 @@ pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, ] +prometheus-client = [ + {file = "prometheus_client-0.14.1-py3-none-any.whl", hash = "sha256:522fded625282822a89e2773452f42df14b5a8e84a86433e3f8a189c1d54dc01"}, + {file = "prometheus_client-0.14.1.tar.gz", hash = "sha256:5459c427624961076277fdc6dc50540e2bacb98eebde99886e59ec55ed92093a"}, +] psycopg2-binary = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, @@ -1831,8 +1795,8 @@ pyflakes = [ {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, ] pyjwt = [ - {file = "PyJWT-2.3.0-py3-none-any.whl", hash = "sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"}, - {file = "PyJWT-2.3.0.tar.gz", hash = "sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41"}, + {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, + {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, ] pyparsing = [ {file = "pyparsing-3.0.6-py3-none-any.whl", hash = "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4"}, @@ -1873,6 +1837,10 @@ pytest-lazy-fixture = [ {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, ] +pytest-timeout = [ + {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, + {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, +] pytest-xdist = [ {file = "pytest-xdist-2.5.0.tar.gz", hash = "sha256:4580deca3ff04ddb2ac53eba39d76cb5dd5edeac050cb6fbc768b0dd712b4edf"}, {file = "pytest_xdist-2.5.0-py3-none-any.whl", hash = "sha256:6fe5c74fec98906deb8f2d2b616b5c782022744978e7bd4695d39c8f42d0ce65"}, @@ -1970,38 +1938,6 @@ toml = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] -typed-ast = [ - {file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6"}, - {file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075"}, - {file = "typed_ast-1.4.3-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528"}, - {file = "typed_ast-1.4.3-cp35-cp35m-win32.whl", hash = "sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428"}, - {file = "typed_ast-1.4.3-cp35-cp35m-win_amd64.whl", hash = "sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3"}, - {file = "typed_ast-1.4.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f"}, - {file = "typed_ast-1.4.3-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341"}, - {file = "typed_ast-1.4.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace"}, - {file = "typed_ast-1.4.3-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f"}, - {file = "typed_ast-1.4.3-cp36-cp36m-win32.whl", hash = "sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363"}, - {file = "typed_ast-1.4.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7"}, - {file = "typed_ast-1.4.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266"}, - {file = "typed_ast-1.4.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e"}, - {file = "typed_ast-1.4.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04"}, - {file = "typed_ast-1.4.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899"}, - {file = "typed_ast-1.4.3-cp37-cp37m-win32.whl", hash = "sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c"}, - {file = "typed_ast-1.4.3-cp37-cp37m-win_amd64.whl", hash = "sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805"}, - {file = "typed_ast-1.4.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a"}, - {file = "typed_ast-1.4.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff"}, - {file = "typed_ast-1.4.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41"}, - {file = "typed_ast-1.4.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39"}, - {file = "typed_ast-1.4.3-cp38-cp38-win32.whl", hash = "sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927"}, - {file = "typed_ast-1.4.3-cp38-cp38-win_amd64.whl", hash = "sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40"}, - {file = "typed_ast-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3"}, - {file = "typed_ast-1.4.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4"}, - {file = "typed_ast-1.4.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0"}, - {file = "typed_ast-1.4.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3"}, - {file = "typed_ast-1.4.3-cp39-cp39-win32.whl", hash = "sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808"}, - {file = "typed_ast-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c"}, - {file = "typed_ast-1.4.3.tar.gz", hash = "sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"}, -] types-psycopg2 = [ {file = "types-psycopg2-2.9.6.tar.gz", hash = "sha256:753b50b38da0e61bc8f89d149f2c4420c7e18535a87963d17b72343eb98f7c32"}, {file = "types_psycopg2-2.9.6-py3-none-any.whl", hash = "sha256:2cfd855e1562ebb5da595ee9401da93a308d69121ccd359cb8341f94ba4b6d1c"}, @@ -2092,7 +2028,3 @@ yapf = [ {file = "yapf-0.31.0-py2.py3-none-any.whl", hash = "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"}, {file = "yapf-0.31.0.tar.gz", hash = "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d"}, ] -zipp = [ - {file = "zipp-3.7.0-py3-none-any.whl", hash = "sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"}, - {file = "zipp-3.7.0.tar.gz", hash = "sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d"}, -] diff --git a/proxy/src/auth_backend/link.rs b/proxy/src/auth_backend/link.rs index 9bdb9e21c4..8e5fcb32a9 100644 --- a/proxy/src/auth_backend/link.rs +++ b/proxy/src/auth_backend/link.rs @@ -5,12 +5,9 @@ use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; fn hello_message(redirect_uri: &str, session_id: &str) -> String { format!( concat![ - "☀️ Welcome to Neon!\n", - "To proceed with database creation, open the following link:\n\n", + "Welcome to Neon!\n", + "Authenticate by visiting:\n", " {redirect_uri}{session_id}\n\n", - "It needs to be done once and we will send you '.pgpass' file,\n", - "which will allow you to access or create ", - "databases without opening your web browser." ], redirect_uri = redirect_uri, session_id = session_id, diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 077a07beb9..6f1b56bfe4 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -61,7 +61,8 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result anyhow::Result<()> { - metrics::set_common_metrics_prefix("zenith_proxy"); let arg_matches = App::new("Neon proxy/router") .version(GIT_VERSION) .arg( diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 821ce377f5..642e50c2c1 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -5,7 +5,7 @@ use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; use futures::TryFutureExt; use lazy_static::lazy_static; -use metrics::{new_common_metric_name, register_int_counter, IntCounter}; +use metrics::{register_int_counter, IntCounter}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeMessage as Be, *}; @@ -15,17 +15,17 @@ const ERR_PROTO_VIOLATION: &str = "protocol violation"; lazy_static! { static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!( - new_common_metric_name("num_connections_accepted"), + "proxy_accepted_connections_total", "Number of TCP client connections accepted." ) .unwrap(); static ref NUM_CONNECTIONS_CLOSED_COUNTER: IntCounter = register_int_counter!( - new_common_metric_name("num_connections_closed"), + "proxy_closed_connections_total", "Number of TCP client connections closed." ) .unwrap(); static ref NUM_BYTES_PROXIED_COUNTER: IntCounter = register_int_counter!( - new_common_metric_name("num_bytes_proxied"), + "proxy_io_bytes_total", "Number of bytes sent/received between any client and backend." ) .unwrap(); diff --git a/pyproject.toml b/pyproject.toml index 335c6d61d8..c965535049 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "" authors = [] [tool.poetry.dependencies] -python = "^3.7" +python = "^3.9" pytest = "^6.2.5" psycopg2-binary = "^2.9.1" typing-extensions = "^3.10.0" @@ -23,6 +23,8 @@ boto3-stubs = "^1.20.40" moto = {version = "^3.0.0", extras = ["server"]} backoff = "^1.11.1" pytest-lazy-fixture = "^0.6.3" +prometheus-client = "^0.14.1" +pytest-timeout = "^2.1.0" [tool.poetry.dev-dependencies] yapf = "==0.31.0" diff --git a/pytest.ini b/pytest.ini index abc69b765b..da9ab8c12f 100644 --- a/pytest.ini +++ b/pytest.ini @@ -9,3 +9,4 @@ minversion = 6.0 log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s log_date_format = %Y-%m-%d %H:%M:%S log_cli = true +timeout = 300 diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 417cf58cd5..373108c61b 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -30,6 +30,10 @@ const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-util = { version = "0.7", features = ["io"] } git-version = "0.3.5" +async-trait = "0.1" +once_cell = "1.10.0" +futures = "0.3.13" +toml_edit = { version = "0.13", features = ["easy"] } postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } diff --git a/safekeeper/README.md b/safekeeper/README.md index 3f097d0c24..a4bb260932 100644 --- a/safekeeper/README.md +++ b/safekeeper/README.md @@ -1,6 +1,6 @@ # WAL service -The zenith WAL service acts as a holding area and redistribution +The neon WAL service acts as a holding area and redistribution center for recently generated WAL. The primary Postgres server streams the WAL to the WAL safekeeper, and treats it like a (synchronous) replica. A replication slot is used in the primary to prevent the @@ -94,7 +94,7 @@ Q: What if the compute node evicts a page, needs it back, but the page is yet A: If the compute node has evicted a page, changes to it have been WAL-logged (that's why it is called Write Ahead logging; there are some exceptions like index builds, but these are exceptions). These WAL records will eventually - reach the Page Server. The Page Server notes that the compute note requests + reach the Page Server. The Page Server notes that the compute node requests pages with a very recent LSN and will not respond to the compute node until a corresponding WAL is received from WAL safekeepers. diff --git a/safekeeper/README_PROTO.md b/safekeeper/README_PROTO.md index 5d79f8c2d3..6b2ae50254 100644 --- a/safekeeper/README_PROTO.md +++ b/safekeeper/README_PROTO.md @@ -151,7 +151,7 @@ It is assumed that in case of loosing local data by some safekeepers, it should * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. * `NodeID`: pair (term,UUID) -* `Pager`: Zenith component restoring pages from WAL stream +* `Pager`: Neon component restoring pages from WAL stream * `Replica`: read-only computatio node * `VCL`: the largerst LSN for which we can guarantee availablity of all prior records. diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 6955d2aa5c..a7628482d9 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -6,33 +6,37 @@ use clap::{App, Arg}; use const_format::formatcp; use daemonize::Daemonize; use fs2::FileExt; +use remote_storage::RemoteStorageConfig; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; use std::thread; use tokio::sync::mpsc; +use toml_edit::Document; use tracing::*; use url::{ParseError, Url}; use safekeeper::control_file::{self}; -use safekeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR}; +use safekeeper::defaults::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, +}; +use safekeeper::http; use safekeeper::remove_wal; use safekeeper::timeline::GlobalTimelines; +use safekeeper::wal_backup; use safekeeper::wal_service; use safekeeper::SafeKeeperConf; use safekeeper::{broker, callmemaybe}; -use safekeeper::{http, s3_offload}; use utils::{ http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, - zid::ZNodeId, + zid::NodeId, }; const LOCK_FILE_NAME: &str = "safekeeper.lock"; const ID_FILE_NAME: &str = "safekeeper.id"; project_git_version!(GIT_VERSION); -fn main() -> Result<()> { - metrics::set_common_metrics_prefix("safekeeper"); +fn main() -> anyhow::Result<()> { let arg_matches = App::new("Zenith safekeeper") .about("Store WAL stream to local file system and push it to WAL receivers") .version(GIT_VERSION) @@ -72,12 +76,6 @@ fn main() -> Result<()> { .long("pageserver") .takes_value(true), ) - .arg( - Arg::new("ttl") - .long("ttl") - .takes_value(true) - .help("interval for keeping WAL at safekeeper node, after which them will be uploaded to S3 and removed locally"), - ) .arg( Arg::new("recall") .long("recall") @@ -119,12 +117,20 @@ fn main() -> Result<()> { .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"), ) .arg( - Arg::new("enable-s3-offload") - .long("enable-s3-offload") + Arg::new("wal-backup-threads").long("backup-threads").takes_value(true).help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")), + ).arg( + Arg::new("remote-storage") + .long("remote-storage") + .takes_value(true) + .help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. {\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"\", \"bucket_region\":\"\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]//, mirroring structure on the file system.") + ) + .arg( + Arg::new("enable-wal-backup") + .long("enable-wal-backup") .takes_value(true) .default_value("true") .default_missing_value("true") - .help("Enable/disable s3 offloading. When disabled, safekeeper removes WAL ignoring s3 WAL horizon."), + .help("Enable/disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring WAL backup horizon."), ) .get_matches(); @@ -158,17 +164,13 @@ fn main() -> Result<()> { conf.listen_http_addr = addr.to_owned(); } - if let Some(ttl) = arg_matches.value_of("ttl") { - conf.ttl = Some(humantime::parse_duration(ttl)?); - } - if let Some(recall) = arg_matches.value_of("recall") { conf.recall_period = humantime::parse_duration(recall)?; } let mut given_id = None; if let Some(given_id_str) = arg_matches.value_of("id") { - given_id = Some(ZNodeId( + given_id = Some(NodeId( given_id_str .parse() .context("failed to parse safekeeper id")?, @@ -177,15 +179,27 @@ fn main() -> Result<()> { if let Some(addr) = arg_matches.value_of("broker-endpoints") { let collected_ep: Result, ParseError> = addr.split(',').map(Url::parse).collect(); - conf.broker_endpoints = Some(collected_ep?); + conf.broker_endpoints = collected_ep.context("Failed to parse broker endpoint urls")?; } if let Some(prefix) = arg_matches.value_of("broker-etcd-prefix") { conf.broker_etcd_prefix = prefix.to_string(); } + if let Some(backup_threads) = arg_matches.value_of("wal-backup-threads") { + conf.backup_runtime_threads = backup_threads + .parse() + .with_context(|| format!("Failed to parse backup threads {}", backup_threads))?; + } + if let Some(storage_conf) = arg_matches.value_of("remote-storage") { + // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse + let storage_conf_toml = format!("remote_storage = {}", storage_conf); + let parsed_toml = storage_conf_toml.parse::()?; // parse + let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again + conf.remote_storage = Some(RemoteStorageConfig::from_toml(storage_conf_parsed_toml)?); + } // Seems like there is no better way to accept bool values explicitly in clap. - conf.s3_offload_enabled = arg_matches - .value_of("enable-s3-offload") + conf.wal_backup_enabled = arg_matches + .value_of("enable-wal-backup") .unwrap() .parse() .context("failed to parse bool enable-s3-offload bool")?; @@ -193,7 +207,7 @@ fn main() -> Result<()> { start_safekeeper(conf, given_id, arg_matches.is_present("init")) } -fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { +fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { let log_file = logging::init("safekeeper.log", conf.daemonize)?; info!("version: {GIT_VERSION}"); @@ -246,14 +260,15 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b // Otherwise, the coverage data will be damaged. match daemonize.exit_action(|| exit_now(0)).start() { Ok(_) => info!("Success, daemonized"), - Err(e) => error!("Error, {}", e), + Err(err) => bail!("Error: {err}. could not daemonize. bailing."), } } let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (callmemaybe_tx, callmemaybe_rx) = mpsc::unbounded_channel(); - GlobalTimelines::set_callmemaybe_tx(callmemaybe_tx); + let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); + GlobalTimelines::init(callmemaybe_tx, wal_backup_launcher_tx); let conf_ = conf.clone(); threads.push( @@ -271,17 +286,6 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b })?, ); - if conf.ttl.is_some() { - let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("S3 offload thread".into()) - .spawn(|| { - s3_offload::thread_main(conf_); - })?, - ); - } - let conf_cloned = conf.clone(); let safekeeper_thread = thread::Builder::new() .name("Safekeeper thread".into()) @@ -309,7 +313,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b .unwrap(); threads.push(callmemaybe_thread); - if conf.broker_endpoints.is_some() { + if !conf.broker_endpoints.is_empty() { let conf_ = conf.clone(); threads.push( thread::Builder::new() @@ -318,6 +322,8 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b broker::thread_main(conf_); })?, ); + } else { + warn!("No broker endpoints providing, starting without node sync") } let conf_ = conf.clone(); @@ -329,6 +335,15 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b })?, ); + let conf_ = conf.clone(); + threads.push( + thread::Builder::new() + .name("wal backup launcher thread".into()) + .spawn(move || { + wal_backup::wal_backup_launcher_thread_main(conf_, wal_backup_launcher_rx); + })?, + ); + // TODO: put more thoughts into handling of failed threads // We probably should restart them. @@ -344,14 +359,14 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b } /// Determine safekeeper id and set it in config. -fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { +fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { let id_file_path = conf.workdir.join(ID_FILE_NAME); - let my_id: ZNodeId; + let my_id: NodeId; // If ID exists, read it in; otherwise set one passed match fs::read(&id_file_path) { Ok(id_serialized) => { - my_id = ZNodeId( + my_id = NodeId( std::str::from_utf8(&id_serialized) .context("failed to parse safekeeper id")? .parse() diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index d9c60c9db0..5bcb197205 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,5 +1,6 @@ //! Communication with etcd, providing safekeeper peers and pageserver coordination. +use anyhow::anyhow; use anyhow::Context; use anyhow::Error; use anyhow::Result; @@ -7,12 +8,14 @@ use etcd_broker::Client; use etcd_broker::PutOptions; use etcd_broker::SkTimelineSubscriptionKind; use std::time::Duration; +use tokio::spawn; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; +use url::Url; use crate::{timeline::GlobalTimelines, SafeKeeperConf}; -use utils::zid::{ZNodeId, ZTenantTimelineId}; +use utils::zid::{NodeId, ZTenantTimelineId}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; @@ -34,19 +37,131 @@ pub fn thread_main(conf: SafeKeeperConf) { /// Key to per timeline per safekeeper data. fn timeline_safekeeper_path( - broker_prefix: String, + broker_etcd_prefix: String, zttid: ZTenantTimelineId, - sk_id: ZNodeId, + sk_id: NodeId, ) -> String { format!( "{}/{sk_id}", - SkTimelineSubscriptionKind::timeline(broker_prefix, zttid).watch_key() + SkTimelineSubscriptionKind::timeline(broker_etcd_prefix, zttid).watch_key() ) } +pub struct Election { + pub election_name: String, + pub candidate_name: String, + pub broker_endpoints: Vec, +} + +impl Election { + pub fn new(election_name: String, candidate_name: String, broker_endpoints: Vec) -> Self { + Self { + election_name, + candidate_name, + broker_endpoints, + } + } +} + +pub struct ElectionLeader { + client: Client, + keep_alive: JoinHandle>, +} + +impl ElectionLeader { + pub async fn check_am_i( + &mut self, + election_name: String, + candidate_name: String, + ) -> Result { + let resp = self.client.leader(election_name).await?; + + let kv = resp.kv().ok_or(anyhow!("failed to get leader response"))?; + let leader = kv.value_str()?; + + Ok(leader == candidate_name) + } + + pub async fn give_up(self) { + self.keep_alive.abort(); + // TODO: it'll be wise to resign here but it'll happen after lease expiration anyway + // should we await for keep alive termination? + let _ = self.keep_alive.await; + } +} + +pub async fn get_leader(req: &Election) -> Result { + let mut client = Client::connect(req.broker_endpoints.clone(), None) + .await + .context("Could not connect to etcd")?; + + let lease = client + .lease_grant(LEASE_TTL_SEC, None) + .await + .context("Could not acquire a lease"); + + let lease_id = lease.map(|l| l.id()).unwrap(); + + let keep_alive = spawn::<_>(lease_keep_alive(client.clone(), lease_id)); + + if let Err(e) = client + .campaign( + req.election_name.clone(), + req.candidate_name.clone(), + lease_id, + ) + .await + { + keep_alive.abort(); + let _ = keep_alive.await; + return Err(e.into()); + } + + Ok(ElectionLeader { client, keep_alive }) +} + +async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> { + let (mut keeper, mut ka_stream) = client + .lease_keep_alive(lease_id) + .await + .context("failed to create keepalive stream")?; + + loop { + let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); + + keeper + .keep_alive() + .await + .context("failed to send LeaseKeepAliveRequest")?; + + ka_stream + .message() + .await + .context("failed to receive LeaseKeepAliveResponse")?; + + sleep(push_interval).await; + } +} + +pub fn get_campaign_name( + election_name: String, + broker_prefix: String, + timeline_id: &ZTenantTimelineId, +) -> String { + return format!( + "{}/{}", + SkTimelineSubscriptionKind::timeline(broker_prefix, *timeline_id).watch_key(), + election_name + ); +} + +pub fn get_candiate_name(system_id: NodeId) -> String { + format!("id_{}", system_id) +} + /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { - let mut client = Client::connect(&conf.broker_endpoints.as_ref().unwrap(), None).await?; + let mut client = Client::connect(&conf.broker_endpoints, None).await?; // Get and maintain lease to automatically delete obsolete data let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; @@ -59,7 +174,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // sensitive and there is no risk of deadlock as we don't await while // lock is held. for zttid in GlobalTimelines::get_active_timelines() { - if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { + if let Some(tli) = GlobalTimelines::get_loaded(zttid) { let sk_info = tli.get_public_info(&conf)?; let put_opts = PutOptions::new().with_lease(lease.id()); client @@ -91,7 +206,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { /// Subscribe and fetch all the interesting data from the broker. async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { - let mut client = Client::connect(&conf.broker_endpoints.as_ref().unwrap(), None).await?; + let mut client = Client::connect(&conf.broker_endpoints, None).await?; let mut subscription = etcd_broker::subscribe_to_safekeeper_timeline_updates( &mut client, @@ -99,7 +214,6 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { ) .await .context("failed to subscribe for safekeeper info")?; - loop { match subscription.fetch_data().await { Some(new_info) => { @@ -107,12 +221,13 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { // note: there are blocking operations below, but it's considered fine for now if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { for (safekeeper_id, info) in sk_info { - tli.record_safekeeper_info(&info, safekeeper_id)? + tli.record_safekeeper_info(&info, safekeeper_id).await? } } } } None => { + // XXX it means we lost connection with etcd, error is consumed inside sub object debug!("timeline updates sender closed, aborting the pull loop"); return Ok(()); } @@ -143,11 +258,12 @@ async fn main_loop(conf: SafeKeeperConf) { }, res = async { pull_handle.as_mut().unwrap().await }, if pull_handle.is_some() => { // was it panic or normal error? - let err = match res { - Ok(res_internal) => res_internal.unwrap_err(), - Err(err_outer) => err_outer.into(), + match res { + Ok(res_internal) => if let Err(err_inner) = res_internal { + warn!("pull task failed: {:?}", err_inner); + } + Err(err_outer) => { warn!("pull task panicked: {:?}", err_outer) } }; - warn!("pull task failed: {:?}", err); pull_handle = None; }, _ = ticker.tick() => { diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 22716de1a0..8d36472540 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -165,7 +165,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), @@ -188,7 +188,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), @@ -211,7 +211,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), @@ -234,7 +234,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn::INVALID, peer_horizon_lsn: oldstate.peer_horizon_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), diff --git a/safekeeper/src/http/models.rs b/safekeeper/src/http/models.rs index ca18e64096..77efc0cc21 100644 --- a/safekeeper/src/http/models.rs +++ b/safekeeper/src/http/models.rs @@ -1,9 +1,9 @@ use serde::{Deserialize, Serialize}; -use utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; +use utils::zid::{NodeId, ZTenantId, ZTimelineId}; #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, - pub peer_ids: Vec, + pub peer_ids: Vec, } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 62fbd2ff2f..b0197a9a2a 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -20,14 +20,14 @@ use utils::{ RequestExt, RouterBuilder, }, lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use super::models::TimelineCreateRequest; #[derive(Debug, Serialize)] struct SafekeeperStatus { - id: ZNodeId, + id: NodeId, } /// Healthcheck handler. @@ -70,19 +70,19 @@ struct TimelineStatus { timeline_id: ZTimelineId, acceptor_state: AcceptorStateStatus, #[serde(serialize_with = "display_serialize")] + flush_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] timeline_start_lsn: Lsn, #[serde(serialize_with = "display_serialize")] local_start_lsn: Lsn, #[serde(serialize_with = "display_serialize")] commit_lsn: Lsn, #[serde(serialize_with = "display_serialize")] - s3_wal_lsn: Lsn, + backup_lsn: Lsn, #[serde(serialize_with = "display_serialize")] peer_horizon_lsn: Lsn, #[serde(serialize_with = "display_serialize")] remote_consistent_lsn: Lsn, - #[serde(serialize_with = "display_serialize")] - flush_lsn: Lsn, } /// Report info about timeline. @@ -107,13 +107,13 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result, pub recall_period: Duration, - pub my_id: ZNodeId, - pub broker_endpoints: Option>, + pub remote_storage: Option, + pub backup_runtime_threads: usize, + pub wal_backup_enabled: bool, + pub my_id: NodeId, + pub broker_endpoints: Vec, pub broker_etcd_prefix: String, - pub s3_offload_enabled: bool, } impl SafeKeeperConf { @@ -78,12 +81,13 @@ impl Default for SafeKeeperConf { no_sync: false, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - ttl: None, + remote_storage: None, recall_period: defaults::DEFAULT_RECALL_PERIOD, - my_id: ZNodeId(0), - broker_endpoints: None, - broker_etcd_prefix: defaults::DEFAULT_NEON_BROKER_PREFIX.to_string(), - s3_offload_enabled: true, + my_id: NodeId(0), + broker_endpoints: Vec::new(), + broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS, + wal_backup_enabled: true, } } } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 0ef335c9ed..88b7816912 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -85,16 +85,10 @@ impl<'pg> ReceiveWalConn<'pg> { _ => bail!("unexpected message {:?} instead of greeting", next_msg), } - // Register the connection and defer unregister. - spg.timeline - .get() - .on_compute_connect(self.pageserver_connstr.as_ref())?; - let _guard = ComputeConnectionGuard { - timeline: Arc::clone(spg.timeline.get()), - }; - let mut next_msg = Some(next_msg); + let mut first_time_through = true; + let mut _guard: Option = None; loop { if matches!(next_msg, Some(ProposerAcceptorMessage::AppendRequest(_))) { // poll AppendRequest's without blocking and write WAL to disk without flushing, @@ -122,6 +116,18 @@ impl<'pg> ReceiveWalConn<'pg> { self.write_msg(&reply)?; } } + if first_time_through { + // Register the connection and defer unregister. Do that only + // after processing first message, as it sets wal_seg_size, + // wanted by many. + spg.timeline + .get() + .on_compute_connect(self.pageserver_connstr.as_ref())?; + _guard = Some(ComputeConnectionGuard { + timeline: Arc::clone(spg.timeline.get()), + }); + first_time_through = false; + } // blocking wait for the next message if next_msg.is_none() { diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 3278d51bd3..004c0243f9 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -12,7 +12,7 @@ pub fn thread_main(conf: SafeKeeperConf) { let active_tlis = GlobalTimelines::get_active_timelines(); for zttid in &active_tlis { if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { - if let Err(e) = tli.remove_old_wal(conf.s3_offload_enabled) { + if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { warn!( "failed to remove WAL for tenant {} timeline {}: {}", tli.zttid.tenant_id, tli.zttid.timeline_id, e diff --git a/safekeeper/src/s3_offload.rs b/safekeeper/src/s3_offload.rs deleted file mode 100644 index 2851c0b8a0..0000000000 --- a/safekeeper/src/s3_offload.rs +++ /dev/null @@ -1,107 +0,0 @@ -// -// Offload old WAL segments to S3 and remove them locally -// Needs `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to be set -// if no IAM bucket access is used. -// - -use anyhow::{bail, Context}; -use postgres_ffi::xlog_utils::*; -use remote_storage::{ - GenericRemoteStorage, RemoteStorage, RemoteStorageConfig, S3Bucket, S3Config, S3ObjectKey, -}; -use std::collections::HashSet; -use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; -use std::path::Path; -use std::time::SystemTime; -use tokio::fs::{self, File}; -use tokio::io::BufReader; -use tokio::runtime; -use tokio::time::sleep; -use tracing::*; -use walkdir::WalkDir; - -use crate::SafeKeeperConf; - -pub fn thread_main(conf: SafeKeeperConf) { - // Create a new thread pool - // - // FIXME: keep it single-threaded for now, make it easier to debug with gdb, - // and we're not concerned with performance yet. - //let runtime = runtime::Runtime::new().unwrap(); - let runtime = runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - - info!("Starting S3 offload task"); - - runtime.block_on(async { - main_loop(&conf).await.unwrap(); - }); -} - -async fn offload_files( - remote_storage: &S3Bucket, - listing: &HashSet, - dir_path: &Path, - conf: &SafeKeeperConf, -) -> anyhow::Result { - let horizon = SystemTime::now() - conf.ttl.unwrap(); - let mut n: u64 = 0; - for entry in WalkDir::new(dir_path) { - let entry = entry?; - let path = entry.path(); - - if path.is_file() - && IsXLogFileName(entry.file_name().to_str().unwrap()) - && entry.metadata().unwrap().created().unwrap() <= horizon - { - let remote_path = remote_storage.remote_object_id(path)?; - if !listing.contains(&remote_path) { - let file = File::open(&path).await?; - let file_length = file.metadata().await?.len() as usize; - remote_storage - .upload(BufReader::new(file), file_length, &remote_path, None) - .await?; - - fs::remove_file(&path).await?; - n += 1; - } - } - } - Ok(n) -} - -async fn main_loop(conf: &SafeKeeperConf) -> anyhow::Result<()> { - let remote_storage = match GenericRemoteStorage::new( - conf.workdir.clone(), - &RemoteStorageConfig { - max_concurrent_syncs: NonZeroUsize::new(10).unwrap(), - max_sync_errors: NonZeroU32::new(1).unwrap(), - storage: remote_storage::RemoteStorageKind::AwsS3(S3Config { - bucket_name: "zenith-testbucket".to_string(), - bucket_region: env::var("S3_REGION").context("S3_REGION env var is not set")?, - prefix_in_bucket: Some("walarchive/".to_string()), - endpoint: Some(env::var("S3_ENDPOINT").context("S3_ENDPOINT env var is not set")?), - concurrency_limit: NonZeroUsize::new(20).unwrap(), - }), - }, - )? { - GenericRemoteStorage::Local(_) => { - bail!("Unexpected: got local storage for the remote config") - } - GenericRemoteStorage::S3(remote_storage) => remote_storage, - }; - - loop { - let listing = remote_storage - .list() - .await? - .into_iter() - .collect::>(); - let n = offload_files(&remote_storage, &listing, &conf.workdir, conf).await?; - info!("Offload {n} files to S3"); - sleep(conf.ttl.unwrap()).await; - } -} diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index fff1c269b6..0a7adb96b6 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -19,6 +19,7 @@ use lazy_static::lazy_static; use crate::control_file; use crate::send_wal::HotStandbyFeedback; + use crate::wal_storage; use metrics::{register_gauge_vec, Gauge, GaugeVec}; use postgres_ffi::xlog_utils::MAX_SEND_SIZE; @@ -26,7 +27,7 @@ use utils::{ bin_ser::LeSer, lsn::Lsn, pq_proto::{SystemId, ZenithFeedback}, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; pub const SK_MAGIC: u32 = 0xcafeceefu32; @@ -141,7 +142,7 @@ pub struct ServerInfo { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PeerInfo { /// LSN up to which safekeeper offloaded WAL to s3. - s3_wal_lsn: Lsn, + backup_lsn: Lsn, /// Term of the last entry. term: Term, /// LSN of the last record. @@ -153,7 +154,7 @@ pub struct PeerInfo { impl PeerInfo { fn new() -> Self { Self { - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn::INVALID, term: INVALID_TERM, flush_lsn: Lsn(0), commit_lsn: Lsn(0), @@ -164,7 +165,7 @@ impl PeerInfo { // vector-based node id -> peer state map with very limited functionality we // need/ #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Peers(pub Vec<(ZNodeId, PeerInfo)>); +pub struct Peers(pub Vec<(NodeId, PeerInfo)>); /// Persistent information stored on safekeeper node /// On disk data is prefixed by magic and format version and followed by checksum. @@ -193,9 +194,9 @@ pub struct SafeKeeperState { /// Part of WAL acknowledged by quorum and available locally. Always points /// to record boundary. pub commit_lsn: Lsn, - /// First LSN not yet offloaded to s3. Useful to persist to avoid finding - /// out offloading progress on boot. - pub s3_wal_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn /// of last record streamed to everyone). Persisting it helps skipping /// recovery in walproposer, generally we compute it from peers. In @@ -217,14 +218,14 @@ pub struct SafeKeeperState { // are not flushed yet. pub struct SafekeeperMemState { pub commit_lsn: Lsn, - pub s3_wal_lsn: Lsn, // TODO: keep only persistent version + pub backup_lsn: Lsn, pub peer_horizon_lsn: Lsn, pub remote_consistent_lsn: Lsn, pub proposer_uuid: PgUuid, } impl SafeKeeperState { - pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { + pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { SafeKeeperState { tenant_id: zttid.tenant_id, timeline_id: zttid.timeline_id, @@ -241,7 +242,7 @@ impl SafeKeeperState { timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: Lsn(0), - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn::INVALID, peer_horizon_lsn: Lsn(0), remote_consistent_lsn: Lsn(0), peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()), @@ -277,7 +278,7 @@ pub struct ProposerGreeting { #[derive(Debug, Serialize)] pub struct AcceptorGreeting { term: u64, - node_id: ZNodeId, + node_id: NodeId, } /// Vote request sent from proposer to safekeepers @@ -531,7 +532,7 @@ pub struct SafeKeeper { pub wal_store: WAL, - node_id: ZNodeId, // safekeeper's node id + node_id: NodeId, // safekeeper's node id } impl SafeKeeper @@ -544,7 +545,7 @@ where ztli: ZTimelineId, state: CTRL, mut wal_store: WAL, - node_id: ZNodeId, + node_id: NodeId, ) -> Result> { if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id); @@ -559,7 +560,7 @@ where epoch_start_lsn: Lsn(0), inmem: SafekeeperMemState { commit_lsn: state.commit_lsn, - s3_wal_lsn: state.s3_wal_lsn, + backup_lsn: state.backup_lsn, peer_horizon_lsn: state.peer_horizon_lsn, remote_consistent_lsn: state.remote_consistent_lsn, proposer_uuid: state.proposer_uuid, @@ -649,7 +650,6 @@ where self.state.persist(&state)?; } - // pass wal_seg_size to read WAL and find flush_lsn self.wal_store.init_storage(&self.state)?; info!( @@ -731,24 +731,36 @@ where { let mut state = self.state.clone(); - // Remeber point where WAL begins globally, if not yet. + // Here we learn initial LSN for the first time, set fields + // interested in that. + if state.timeline_start_lsn == Lsn(0) { + // Remember point where WAL begins globally. state.timeline_start_lsn = msg.timeline_start_lsn; info!( "setting timeline_start_lsn to {:?}", state.timeline_start_lsn ); - } - // Remember point where WAL begins locally, if not yet. (I doubt the - // second condition is ever possible) - if state.local_start_lsn == Lsn(0) || state.local_start_lsn >= msg.start_streaming_at { state.local_start_lsn = msg.start_streaming_at; info!("setting local_start_lsn to {:?}", state.local_start_lsn); } + // Initializing commit_lsn before acking first flushed record is + // important to let find_end_of_wal skip the whole in the beginning + // of the first segment. + // + // NB: on new clusters, this happens at the same time as + // timeline_start_lsn initialization, it is taken outside to provide + // upgrade. + self.global_commit_lsn = max(self.global_commit_lsn, state.timeline_start_lsn); + self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn); + self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); + + // Initalizing backup_lsn is useful to avoid making backup think it should upload 0 segment. + self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); - self.state.persist(&state)?; + self.persist_control_file(state)?; } info!("start receiving WAL since {:?}", msg.start_streaming_at); @@ -772,25 +784,16 @@ where // that we receive new epoch_start_lsn, and we still need to sync // control file in this case. if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn { - self.persist_control_file()?; - } - - // We got our first commit_lsn, which means we should sync - // everything to disk, to initialize the state. - if self.state.commit_lsn == Lsn(0) && commit_lsn > Lsn(0) { - self.wal_store.flush_wal()?; - self.persist_control_file()?; + self.persist_control_file(self.state.clone())?; } Ok(()) } - /// Persist in-memory state to the disk. - fn persist_control_file(&mut self) -> Result<()> { - let mut state = self.state.clone(); - + /// Persist in-memory state to the disk, taking other data from state. + fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { state.commit_lsn = self.inmem.commit_lsn; - state.s3_wal_lsn = self.inmem.s3_wal_lsn; + state.backup_lsn = self.inmem.backup_lsn; state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; state.remote_consistent_lsn = self.inmem.remote_consistent_lsn; state.proposer_uuid = self.inmem.proposer_uuid; @@ -823,13 +826,6 @@ where // do the job if !msg.wal_data.is_empty() { self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?; - - // If this was the first record we ever received, initialize - // commit_lsn to help find_end_of_wal skip the hole in the - // beginning. - if self.global_commit_lsn == Lsn(0) { - self.global_commit_lsn = msg.h.begin_lsn; - } } // flush wal to the disk, if required @@ -852,7 +848,7 @@ where if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) < self.inmem.peer_horizon_lsn { - self.persist_control_file()?; + self.persist_control_file(self.state.clone())?; } trace!( @@ -898,11 +894,11 @@ where self.update_commit_lsn()?; } } - if let Some(s3_wal_lsn) = sk_info.s3_wal_lsn { - let new_s3_wal_lsn = max(s3_wal_lsn, self.inmem.s3_wal_lsn); + if let Some(backup_lsn) = sk_info.backup_lsn { + let new_backup_lsn = max(backup_lsn, self.inmem.backup_lsn); sync_control_file |= - self.state.s3_wal_lsn + (self.state.server.wal_seg_size as u64) < new_s3_wal_lsn; - self.inmem.s3_wal_lsn = new_s3_wal_lsn; + self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; + self.inmem.backup_lsn = new_backup_lsn; } if let Some(remote_consistent_lsn) = sk_info.remote_consistent_lsn { let new_remote_consistent_lsn = @@ -920,7 +916,7 @@ where self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; } if sync_control_file { - self.persist_control_file()?; + self.persist_control_file(self.state.clone())?; } Ok(()) } @@ -930,29 +926,23 @@ where /// offloading. /// While it is safe to use inmem values for determining horizon, /// we use persistent to make possible normal states less surprising. - pub fn get_horizon_segno(&self, s3_offload_enabled: bool) -> XLogSegNo { - let s3_offload_horizon = if s3_offload_enabled { - self.state.s3_wal_lsn - } else { - Lsn(u64::MAX) - }; - let horizon_lsn = min( - min( - self.state.remote_consistent_lsn, - self.state.peer_horizon_lsn, - ), - s3_offload_horizon, + pub fn get_horizon_segno(&self, wal_backup_enabled: bool) -> XLogSegNo { + let mut horizon_lsn = min( + self.state.remote_consistent_lsn, + self.state.peer_horizon_lsn, ); + if wal_backup_enabled { + horizon_lsn = min(horizon_lsn, self.state.backup_lsn); + } horizon_lsn.segment_number(self.state.server.wal_seg_size as usize) } } #[cfg(test)] mod tests { - use std::ops::Deref; - use super::*; use crate::wal_storage::Storage; + use std::ops::Deref; // fake storage for tests struct InMemoryState { @@ -1013,7 +1003,8 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap(); + + let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -1028,7 +1019,8 @@ mod tests { let storage = InMemoryState { persisted_state: state, }; - sk = SafeKeeper::new(ztli, storage, sk.wal_store, ZNodeId(0)).unwrap(); + + sk = SafeKeeper::new(ztli, storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -1045,7 +1037,8 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap(); + + let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index d52dd6ea57..a89ed18071 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -315,7 +315,7 @@ impl ReplicationConn { } else { // TODO: also check once in a while whether we are walsender // to right pageserver. - if spg.timeline.get().check_deactivate(replica_id)? { + if spg.timeline.get().stop_walsender(replica_id)? { // Shut down, timeline is suspended. // TODO create proper error type for this bail!("end streaming to {:?}", spg.appname); diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 84ad53d72d..74a61410fd 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -8,6 +8,7 @@ use lazy_static::lazy_static; use postgres_ffi::xlog_utils::XLogSegNo; use serde::Serialize; +use tokio::sync::watch; use std::cmp::{max, min}; use std::collections::HashMap; @@ -15,23 +16,23 @@ use std::fs::{self}; use std::sync::{Arc, Condvar, Mutex, MutexGuard}; use std::time::Duration; -use tokio::sync::mpsc::UnboundedSender; +use tokio::sync::mpsc::{Sender, UnboundedSender}; use tracing::*; use utils::{ lsn::Lsn, pq_proto::ZenithFeedback, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId}, }; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; - use crate::control_file; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, SafekeeperMemState, }; use crate::send_wal::HotStandbyFeedback; + use crate::wal_storage; use crate::wal_storage::Storage as wal_storage_iface; use crate::SafeKeeperConf; @@ -81,10 +82,14 @@ struct SharedState { notified_commit_lsn: Lsn, /// State of replicas replicas: Vec>, - /// Inactive clusters shouldn't occupy any resources, so timeline is - /// activated whenever there is a compute connection or pageserver is not - /// caughtup (it must have latest WAL for new compute start) and suspended - /// otherwise. + /// True when WAL backup launcher oversees the timeline, making sure WAL is + /// offloaded, allows to bother launcher less. + wal_backup_active: bool, + /// True whenever there is at least some pending activity on timeline: live + /// compute connection, pageserver is not caughtup (it must have latest WAL + /// for new compute start) or WAL backuping is not finished. Practically it + /// means safekeepers broadcast info to peers about the timeline, old WAL is + /// trimmed. /// /// TODO: it might be better to remove tli completely from GlobalTimelines /// when tli is inactive instead of having this flag. @@ -99,10 +104,11 @@ impl SharedState { fn create( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result { let state = SafeKeeperState::new(zttid, peer_ids); let control_store = control_file::FileStorage::create_new(zttid, conf, state)?; + let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?; @@ -110,6 +116,7 @@ impl SharedState { notified_commit_lsn: Lsn(0), sk, replicas: Vec::new(), + wal_backup_active: false, active: false, num_computes: 0, pageserver_connstr: None, @@ -129,15 +136,62 @@ impl SharedState { notified_commit_lsn: Lsn(0), sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?, replicas: Vec::new(), + wal_backup_active: false, active: false, num_computes: 0, pageserver_connstr: None, last_removed_segno: 0, }) } + fn is_active(&self) -> bool { + self.is_wal_backup_required() + // FIXME: add tracking of relevant pageservers and check them here individually, + // otherwise migration won't work (we suspend too early). + || self.sk.inmem.remote_consistent_lsn <= self.sk.inmem.commit_lsn + } - /// Activate the timeline: start/change walsender (via callmemaybe). - fn activate( + /// Mark timeline active/inactive and return whether s3 offloading requires + /// start/stop action. + fn update_status(&mut self) -> bool { + self.active = self.is_active(); + self.is_wal_backup_action_pending() + } + + /// Should we run s3 offloading in current state? + fn is_wal_backup_required(&self) -> bool { + let seg_size = self.get_wal_seg_size(); + self.num_computes > 0 || + // Currently only the whole segment is offloaded, so compare segment numbers. + (self.sk.inmem.commit_lsn.segment_number(seg_size) > + self.sk.inmem.backup_lsn.segment_number(seg_size)) + } + + /// Is current state of s3 offloading is not what it ought to be? + fn is_wal_backup_action_pending(&self) -> bool { + let res = self.wal_backup_active != self.is_wal_backup_required(); + if res { + let action_pending = if self.is_wal_backup_required() { + "start" + } else { + "stop" + }; + trace!( + "timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}", + self.sk.state.timeline_id, action_pending, self.num_computes, self.sk.inmem.commit_lsn, self.sk.inmem.backup_lsn + ); + } + res + } + + /// Returns whether s3 offloading is required and sets current status as + /// matching. + fn wal_backup_attend(&mut self) -> bool { + self.wal_backup_active = self.is_wal_backup_required(); + self.wal_backup_active + } + + /// start/change walsender (via callmemaybe). + fn callmemaybe_sub( &mut self, zttid: &ZTenantTimelineId, pageserver_connstr: Option<&String>, @@ -179,42 +233,42 @@ impl SharedState { ); } self.pageserver_connstr = pageserver_connstr.map(|c| c.to_owned()); - self.active = true; Ok(()) } /// Deactivate the timeline: stop callmemaybe. - fn deactivate( + fn callmemaybe_unsub( &mut self, zttid: &ZTenantTimelineId, callmemaybe_tx: &UnboundedSender, ) -> Result<()> { - if self.active { - if let Some(ref pageserver_connstr) = self.pageserver_connstr { - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - callmemaybe_tx - .send(CallmeEvent::Unsubscribe(subscription_key)) - .unwrap_or_else(|e| { - error!( - "failed to send Unsubscribe request to callmemaybe thread {}", - e - ); - }); - info!( - "timeline {} is unsubscribed from callmemaybe to {}", - zttid.timeline_id, - self.pageserver_connstr.as_ref().unwrap() - ); - } - self.active = false; + if let Some(ref pageserver_connstr) = self.pageserver_connstr { + let subscription_key = SubscriptionStateKey::new( + zttid.tenant_id, + zttid.timeline_id, + pageserver_connstr.to_owned(), + ); + callmemaybe_tx + .send(CallmeEvent::Unsubscribe(subscription_key)) + .unwrap_or_else(|e| { + error!( + "failed to send Unsubscribe request to callmemaybe thread {}", + e + ); + }); + info!( + "timeline {} is unsubscribed from callmemaybe to {}", + zttid.timeline_id, + self.pageserver_connstr.as_ref().unwrap() + ); } Ok(()) } + fn get_wal_seg_size(&self) -> usize { + self.sk.state.server.wal_seg_size as usize + } + /// Get combined state of all alive replicas pub fn get_replicas_state(&self) -> ReplicaState { let mut acc = ReplicaState::new(); @@ -278,6 +332,13 @@ impl SharedState { pub struct Timeline { pub zttid: ZTenantTimelineId, pub callmemaybe_tx: UnboundedSender, + /// Sending here asks for wal backup launcher attention (start/stop + /// offloading). Sending zttid instead of concrete command allows to do + /// sending without timeline lock. + wal_backup_launcher_tx: Sender, + commit_lsn_watch_tx: watch::Sender, + /// For breeding receivers. + commit_lsn_watch_rx: watch::Receiver, mutex: Mutex, /// conditional variable used to notify wal senders cond: Condvar, @@ -287,11 +348,17 @@ impl Timeline { fn new( zttid: ZTenantTimelineId, callmemaybe_tx: UnboundedSender, + wal_backup_launcher_tx: Sender, shared_state: SharedState, ) -> Timeline { + let (commit_lsn_watch_tx, commit_lsn_watch_rx) = + watch::channel(shared_state.sk.inmem.commit_lsn); Timeline { zttid, callmemaybe_tx, + wal_backup_launcher_tx, + commit_lsn_watch_tx, + commit_lsn_watch_rx, mutex: Mutex::new(shared_state), cond: Condvar::new(), } @@ -301,13 +368,21 @@ impl Timeline { /// not running yet. /// Can fail only if channel to a static thread got closed, which is not normal at all. pub fn on_compute_connect(&self, pageserver_connstr: Option<&String>) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.num_computes += 1; - // FIXME: currently we always adopt latest pageserver connstr, but we - // should have kind of generations assigned by compute to distinguish - // the latest one or even pass it through consensus to reliably deliver - // to all safekeepers. - shared_state.activate(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?; + let is_wal_backup_action_pending: bool; + { + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.num_computes += 1; + is_wal_backup_action_pending = shared_state.update_status(); + // FIXME: currently we always adopt latest pageserver connstr, but we + // should have kind of generations assigned by compute to distinguish + // the latest one or even pass it through consensus to reliably deliver + // to all safekeepers. + shared_state.callmemaybe_sub(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?; + } + // Wake up wal backup launcher, if offloading not started yet. + if is_wal_backup_action_pending { + self.wal_backup_launcher_tx.blocking_send(self.zttid)?; + } Ok(()) } @@ -315,38 +390,43 @@ impl Timeline { /// pageserver doesn't need catchup. /// Can fail only if channel to a static thread got closed, which is not normal at all. pub fn on_compute_disconnect(&self) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.num_computes -= 1; - // If there is no pageserver, can suspend right away; otherwise let - // walsender do that. - if shared_state.num_computes == 0 && shared_state.pageserver_connstr.is_none() { - shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + let is_wal_backup_action_pending: bool; + { + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.num_computes -= 1; + is_wal_backup_action_pending = shared_state.update_status(); + } + // Wake up wal backup launcher, if it is time to stop the offloading. + if is_wal_backup_action_pending { + self.wal_backup_launcher_tx.blocking_send(self.zttid)?; } Ok(()) } - /// Deactivate tenant if there is no computes and pageserver is caughtup, - /// assuming the pageserver status is in replica_id. - /// Returns true if deactivated. - pub fn check_deactivate(&self, replica_id: usize) -> Result { + /// Whether we still need this walsender running? + /// TODO: check this pageserver is actually interested in this timeline. + pub fn stop_walsender(&self, replica_id: usize) -> Result { let mut shared_state = self.mutex.lock().unwrap(); - if !shared_state.active { - // already suspended - return Ok(true); - } if shared_state.num_computes == 0 { let replica_state = shared_state.replicas[replica_id].unwrap(); - let deactivate = shared_state.notified_commit_lsn == Lsn(0) || // no data at all yet - (replica_state.last_received_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. - replica_state.last_received_lsn >= shared_state.sk.inmem.commit_lsn); - if deactivate { - shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + let stop = shared_state.notified_commit_lsn == Lsn(0) || // no data at all yet + (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. + replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); + if stop { + shared_state.callmemaybe_unsub(&self.zttid, &self.callmemaybe_tx)?; return Ok(true); } } Ok(false) } + /// Returns whether s3 offloading is required and sets current status as + /// matching it. + pub fn wal_backup_attend(&self) -> bool { + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.wal_backup_attend() + } + /// Deactivates the timeline, assuming it is being deleted. /// Returns whether the timeline was already active. /// @@ -354,10 +434,14 @@ impl Timeline { /// will stop by themselves eventually (possibly with errors, but no panics). There should be no /// compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but /// we're deleting the timeline anyway. - pub fn deactivate_for_delete(&self) -> Result { - let mut shared_state = self.mutex.lock().unwrap(); - let was_active = shared_state.active; - shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + pub async fn deactivate_for_delete(&self) -> Result { + let was_active: bool; + { + let mut shared_state = self.mutex.lock().unwrap(); + was_active = shared_state.active; + shared_state.callmemaybe_unsub(&self.zttid, &self.callmemaybe_tx)?; + } + self.wal_backup_launcher_tx.send(self.zttid).await?; Ok(was_active) } @@ -391,6 +475,7 @@ impl Timeline { } // Notify caught-up WAL senders about new WAL data received + // TODO: replace-unify it with commit_lsn_watch. fn notify_wal_senders(&self, shared_state: &mut MutexGuard) { if shared_state.notified_commit_lsn < shared_state.sk.inmem.commit_lsn { shared_state.notified_commit_lsn = shared_state.sk.inmem.commit_lsn; @@ -398,12 +483,17 @@ impl Timeline { } } + pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { + self.commit_lsn_watch_rx.clone() + } + /// Pass arrived message to the safekeeper. pub fn process_msg( &self, msg: &ProposerAcceptorMessage, ) -> Result> { let mut rmsg: Option; + let commit_lsn: Lsn; { let mut shared_state = self.mutex.lock().unwrap(); rmsg = shared_state.sk.process_msg(msg)?; @@ -419,15 +509,31 @@ impl Timeline { // Ping wal sender that new data might be available. self.notify_wal_senders(&mut shared_state); + commit_lsn = shared_state.sk.inmem.commit_lsn; } + self.commit_lsn_watch_tx.send(commit_lsn)?; Ok(rmsg) } + pub fn get_wal_seg_size(&self) -> usize { + self.mutex.lock().unwrap().get_wal_seg_size() + } + pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { let shared_state = self.mutex.lock().unwrap(); (shared_state.sk.inmem.clone(), shared_state.sk.state.clone()) } + pub fn get_wal_backup_lsn(&self) -> Lsn { + self.mutex.lock().unwrap().sk.inmem.backup_lsn + } + + pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) { + self.mutex.lock().unwrap().sk.inmem.backup_lsn = backup_lsn; + // we should check whether to shut down offloader, but this will be done + // soon by peer communication anyway. + } + /// Prepare public safekeeper info for reporting. pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result { let shared_state = self.mutex.lock().unwrap(); @@ -436,7 +542,6 @@ impl Timeline { flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()), // note: this value is not flushed to control file yet and can be lost commit_lsn: Some(shared_state.sk.inmem.commit_lsn), - s3_wal_lsn: Some(shared_state.sk.inmem.s3_wal_lsn), // TODO: rework feedbacks to avoid max here remote_consistent_lsn: Some(max( shared_state.get_replicas_state().remote_consistent_lsn, @@ -444,14 +549,35 @@ impl Timeline { )), peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), safekeeper_connection_string: Some(conf.listen_pg_addr.clone()), + backup_lsn: Some(shared_state.sk.inmem.backup_lsn), }) } /// Update timeline state with peer safekeeper data. - pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: ZNodeId) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.sk.record_safekeeper_info(sk_info)?; - self.notify_wal_senders(&mut shared_state); + pub async fn record_safekeeper_info( + &self, + sk_info: &SkTimelineInfo, + _sk_id: NodeId, + ) -> Result<()> { + let is_wal_backup_action_pending: bool; + let commit_lsn: Lsn; + { + let mut shared_state = self.mutex.lock().unwrap(); + // WAL seg size not initialized yet (no message from compute ever + // received), can't do much without it. + if shared_state.get_wal_seg_size() == 0 { + return Ok(()); + } + shared_state.sk.record_safekeeper_info(sk_info)?; + self.notify_wal_senders(&mut shared_state); + is_wal_backup_action_pending = shared_state.update_status(); + commit_lsn = shared_state.sk.inmem.commit_lsn; + } + self.commit_lsn_watch_tx.send(commit_lsn)?; + // Wake up wal backup launcher, if it is time to stop the offloading. + if is_wal_backup_action_pending { + self.wal_backup_launcher_tx.send(self.zttid).await?; + } Ok(()) } @@ -476,16 +602,16 @@ impl Timeline { shared_state.sk.wal_store.flush_lsn() } - pub fn remove_old_wal(&self, s3_offload_enabled: bool) -> Result<()> { + pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { let horizon_segno: XLogSegNo; let remover: Box Result<(), anyhow::Error>>; { let shared_state = self.mutex.lock().unwrap(); // WAL seg size not initialized yet, no WAL exists. - if shared_state.sk.state.server.wal_seg_size == 0 { + if shared_state.get_wal_seg_size() == 0 { return Ok(()); } - horizon_segno = shared_state.sk.get_horizon_segno(s3_offload_enabled); + horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { return Ok(()); @@ -522,12 +648,14 @@ impl TimelineTools for Option> { struct GlobalTimelinesState { timelines: HashMap>, callmemaybe_tx: Option>, + wal_backup_launcher_tx: Option>, } lazy_static! { static ref TIMELINES_STATE: Mutex = Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), - callmemaybe_tx: None + callmemaybe_tx: None, + wal_backup_launcher_tx: None, }); } @@ -541,17 +669,22 @@ pub struct TimelineDeleteForceResult { pub struct GlobalTimelines; impl GlobalTimelines { - pub fn set_callmemaybe_tx(callmemaybe_tx: UnboundedSender) { + pub fn init( + callmemaybe_tx: UnboundedSender, + wal_backup_launcher_tx: Sender, + ) { let mut state = TIMELINES_STATE.lock().unwrap(); assert!(state.callmemaybe_tx.is_none()); state.callmemaybe_tx = Some(callmemaybe_tx); + assert!(state.wal_backup_launcher_tx.is_none()); + state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); } fn create_internal( mut state: MutexGuard, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result> { match state.timelines.get(&zttid) { Some(_) => bail!("timeline {} already exists", zttid), @@ -559,12 +692,14 @@ impl GlobalTimelines { // TODO: check directory existence let dir = conf.timeline_dir(&zttid); fs::create_dir_all(dir)?; + let shared_state = SharedState::create(conf, &zttid, peer_ids) .context("failed to create shared state")?; let new_tli = Arc::new(Timeline::new( zttid, state.callmemaybe_tx.as_ref().unwrap().clone(), + state.wal_backup_launcher_tx.as_ref().unwrap().clone(), shared_state, )); state.timelines.insert(zttid, Arc::clone(&new_tli)); @@ -576,7 +711,7 @@ impl GlobalTimelines { pub fn create( conf: &SafeKeeperConf, zttid: ZTenantTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result> { let state = TIMELINES_STATE.lock().unwrap(); GlobalTimelines::create_internal(state, conf, zttid, peer_ids) @@ -594,8 +729,7 @@ impl GlobalTimelines { match state.timelines.get(&zttid) { Some(result) => Ok(Arc::clone(result)), None => { - let shared_state = - SharedState::restore(conf, &zttid).context("failed to restore shared state"); + let shared_state = SharedState::restore(conf, &zttid); let shared_state = match shared_state { Ok(shared_state) => shared_state, @@ -617,6 +751,7 @@ impl GlobalTimelines { let new_tli = Arc::new(Timeline::new( zttid, state.callmemaybe_tx.as_ref().unwrap().clone(), + state.wal_backup_launcher_tx.as_ref().unwrap().clone(), shared_state, )); state.timelines.insert(zttid, Arc::clone(&new_tli)); @@ -625,6 +760,12 @@ impl GlobalTimelines { } } + /// Get loaded timeline, if it exists. + pub fn get_loaded(zttid: ZTenantTimelineId) -> Option> { + let state = TIMELINES_STATE.lock().unwrap(); + state.timelines.get(&zttid).map(Arc::clone) + } + /// Get ZTenantTimelineIDs of all active timelines. pub fn get_active_timelines() -> Vec { let state = TIMELINES_STATE.lock().unwrap(); @@ -665,43 +806,48 @@ impl GlobalTimelines { /// b) an HTTP GET request about the timeline is made and it's able to restore the current state, or /// c) an HTTP POST request for timeline creation is made after the timeline is already deleted. /// TODO: ensure all of the above never happens. - pub fn delete_force( + pub async fn delete_force( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, ) -> Result { info!("deleting timeline {}", zttid); - let was_active = match TIMELINES_STATE.lock().unwrap().timelines.remove(zttid) { - None => false, - Some(tli) => tli.deactivate_for_delete()?, - }; + let timeline = TIMELINES_STATE.lock().unwrap().timelines.remove(zttid); + let mut was_active = false; + if let Some(tli) = timeline { + was_active = tli.deactivate_for_delete().await?; + } GlobalTimelines::delete_force_internal(conf, zttid, was_active) } /// Deactivates and deletes all timelines for the tenant, see `delete()`. /// Returns map of all timelines which the tenant had, `true` if a timeline was active. - pub fn delete_force_all_for_tenant( + /// There may be a race if new timelines are created simultaneously. + pub async fn delete_force_all_for_tenant( conf: &SafeKeeperConf, tenant_id: &ZTenantId, ) -> Result> { info!("deleting all timelines for tenant {}", tenant_id); - let mut state = TIMELINES_STATE.lock().unwrap(); - let mut deleted = HashMap::new(); - for (zttid, tli) in &state.timelines { - if zttid.tenant_id == *tenant_id { - deleted.insert( - *zttid, - GlobalTimelines::delete_force_internal( - conf, - zttid, - tli.deactivate_for_delete()?, - )?, - ); + let mut to_delete = HashMap::new(); + { + // Keep mutex in this scope. + let timelines = &mut TIMELINES_STATE.lock().unwrap().timelines; + for (&zttid, tli) in timelines.iter() { + if zttid.tenant_id == *tenant_id { + to_delete.insert(zttid, tli.clone()); + } } + // TODO: test that the correct subset of timelines is removed. It's complicated because they are implicitly created currently. + timelines.retain(|zttid, _| !to_delete.contains_key(zttid)); } - // TODO: test that the exact subset of timelines is removed. - state - .timelines - .retain(|zttid, _| !deleted.contains_key(zttid)); + let mut deleted = HashMap::new(); + for (zttid, timeline) in to_delete { + let was_active = timeline.deactivate_for_delete().await?; + deleted.insert( + zttid, + GlobalTimelines::delete_force_internal(conf, &zttid, was_active)?, + ); + } + // There may be inactive timelines, so delete the whole tenant dir as well. match std::fs::remove_dir_all(conf.tenant_dir(tenant_id)) { Ok(_) => (), Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs new file mode 100644 index 0000000000..83dc312d28 --- /dev/null +++ b/safekeeper/src/wal_backup.rs @@ -0,0 +1,417 @@ +use anyhow::{Context, Result}; +use tokio::task::JoinHandle; + +use std::cmp::min; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::Duration; + +use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, PG_TLI}; +use remote_storage::{GenericRemoteStorage, RemoteStorage}; +use tokio::fs::File; +use tokio::runtime::Builder; + +use tokio::select; +use tokio::sync::mpsc::{self, Receiver, Sender}; +use tokio::sync::watch; +use tokio::time::sleep; +use tracing::*; + +use utils::{lsn::Lsn, zid::ZTenantTimelineId}; + +use crate::broker::{Election, ElectionLeader}; +use crate::timeline::{GlobalTimelines, Timeline}; +use crate::{broker, SafeKeeperConf}; + +use once_cell::sync::OnceCell; + +const BACKUP_ELECTION_NAME: &str = "WAL_BACKUP"; + +const BROKER_CONNECTION_RETRY_DELAY_MS: u64 = 1000; + +const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; +const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; + +pub fn wal_backup_launcher_thread_main( + conf: SafeKeeperConf, + wal_backup_launcher_rx: Receiver, +) { + let rt = Builder::new_multi_thread() + .worker_threads(conf.backup_runtime_threads) + .enable_all() + .build() + .expect("failed to create wal backup runtime"); + + rt.block_on(async { + wal_backup_launcher_main_loop(conf, wal_backup_launcher_rx).await; + }); +} + +/// Check whether wal backup is required for timeline and mark that launcher is +/// aware of current status (if timeline exists). +fn is_wal_backup_required(zttid: ZTenantTimelineId) -> bool { + if let Some(tli) = GlobalTimelines::get_loaded(zttid) { + tli.wal_backup_attend() + } else { + false + } +} + +struct WalBackupTaskHandle { + shutdown_tx: Sender<()>, + handle: JoinHandle<()>, +} + +/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup +/// tasks. Having this in separate task simplifies locking, allows to reap +/// panics and separate elections from offloading itself. +async fn wal_backup_launcher_main_loop( + conf: SafeKeeperConf, + mut wal_backup_launcher_rx: Receiver, +) { + info!( + "WAL backup launcher: started, remote config {:?}", + conf.remote_storage + ); + + let conf_ = conf.clone(); + REMOTE_STORAGE.get_or_init(|| { + conf_.remote_storage.as_ref().map(|c| { + GenericRemoteStorage::new(conf_.workdir, c).expect("failed to create remote storage") + }) + }); + + let mut tasks: HashMap = HashMap::new(); + + loop { + // channel is never expected to get closed + let zttid = wal_backup_launcher_rx.recv().await.unwrap(); + let is_wal_backup_required = is_wal_backup_required(zttid); + if conf.remote_storage.is_none() || !conf.wal_backup_enabled { + continue; /* just drain the channel and do nothing */ + } + // do we need to do anything at all? + if is_wal_backup_required != tasks.contains_key(&zttid) { + if is_wal_backup_required { + // need to start the task + info!("starting WAL backup task for {}", zttid); + + // TODO: decide who should offload in launcher itself by simply checking current state + let election_name = broker::get_campaign_name( + BACKUP_ELECTION_NAME.to_string(), + conf.broker_etcd_prefix.clone(), + &zttid, + ); + let my_candidate_name = broker::get_candiate_name(conf.my_id); + let election = broker::Election::new( + election_name, + my_candidate_name, + conf.broker_endpoints.clone(), + ); + + let (shutdown_tx, shutdown_rx) = mpsc::channel(1); + let timeline_dir = conf.timeline_dir(&zttid); + + let handle = tokio::spawn( + backup_task_main(zttid, timeline_dir, shutdown_rx, election) + .instrument(info_span!("WAL backup task", zttid = %zttid)), + ); + + tasks.insert( + zttid, + WalBackupTaskHandle { + shutdown_tx, + handle, + }, + ); + } else { + // need to stop the task + info!("stopping WAL backup task for {}", zttid); + + let wb_handle = tasks.remove(&zttid).unwrap(); + // Tell the task to shutdown. Error means task exited earlier, that's ok. + let _ = wb_handle.shutdown_tx.send(()).await; + // Await the task itself. TODO: restart panicked tasks earlier. + // Hm, why I can't await on reference to handle? + if let Err(e) = wb_handle.handle.await { + warn!("WAL backup task for {} panicked: {}", zttid, e); + } + } + } + } +} + +struct WalBackupTask { + timeline: Arc, + timeline_dir: PathBuf, + wal_seg_size: usize, + commit_lsn_watch_rx: watch::Receiver, + leader: Option, + election: Election, +} + +/// Offload single timeline. +async fn backup_task_main( + zttid: ZTenantTimelineId, + timeline_dir: PathBuf, + mut shutdown_rx: Receiver<()>, + election: Election, +) { + info!("started"); + let timeline: Arc = if let Some(tli) = GlobalTimelines::get_loaded(zttid) { + tli + } else { + /* Timeline could get deleted while task was starting, just exit then. */ + info!("no timeline, exiting"); + return; + }; + + let mut wb = WalBackupTask { + wal_seg_size: timeline.get_wal_seg_size(), + commit_lsn_watch_rx: timeline.get_commit_lsn_watch_rx(), + timeline, + timeline_dir, + leader: None, + election, + }; + + // task is spinned up only when wal_seg_size already initialized + assert!(wb.wal_seg_size > 0); + + let mut canceled = false; + select! { + _ = wb.run() => {} + _ = shutdown_rx.recv() => { + canceled = true; + } + } + if let Some(l) = wb.leader { + l.give_up().await; + } + info!("task {}", if canceled { "canceled" } else { "terminated" }); +} + +impl WalBackupTask { + async fn run(&mut self) { + let mut backup_lsn = Lsn(0); + + // election loop + loop { + let mut retry_attempt = 0u32; + + if let Some(l) = self.leader.take() { + l.give_up().await; + } + + match broker::get_leader(&self.election).await { + Ok(l) => { + self.leader = Some(l); + } + Err(e) => { + error!("error during leader election {:?}", e); + sleep(Duration::from_millis(BROKER_CONNECTION_RETRY_DELAY_MS)).await; + continue; + } + } + + // offload loop + loop { + if retry_attempt == 0 { + // wait for new WAL to arrive + if let Err(e) = self.commit_lsn_watch_rx.changed().await { + // should never happen, as we hold Arc to timeline. + error!("commit_lsn watch shut down: {:?}", e); + return; + } + } else { + // or just sleep if we errored previously + let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS; + if let Some(backoff_delay) = + UPLOAD_FAILURE_RETRY_MIN_MS.checked_shl(retry_attempt) + { + retry_delay = min(retry_delay, backoff_delay); + } + sleep(Duration::from_millis(retry_delay)).await; + } + + let commit_lsn = *self.commit_lsn_watch_rx.borrow(); + + // Note that backup_lsn can be higher than commit_lsn if we + // don't have much local WAL and others already uploaded + // segments we don't even have. + if backup_lsn.segment_number(self.wal_seg_size) + >= commit_lsn.segment_number(self.wal_seg_size) + { + continue; /* nothing to do, common case as we wake up on every commit_lsn bump */ + } + // Perhaps peers advanced the position, check shmem value. + backup_lsn = self.timeline.get_wal_backup_lsn(); + if backup_lsn.segment_number(self.wal_seg_size) + >= commit_lsn.segment_number(self.wal_seg_size) + { + continue; + } + + if let Some(l) = self.leader.as_mut() { + // Optimization idea for later: + // Avoid checking election leader every time by returning current lease grant expiration time + // Re-check leadership only after expiration time, + // such approach woud reduce overhead on write-intensive workloads + + match l + .check_am_i( + self.election.election_name.clone(), + self.election.candidate_name.clone(), + ) + .await + { + Ok(leader) => { + if !leader { + info!("leader has changed"); + break; + } + } + Err(e) => { + warn!("error validating leader, {:?}", e); + break; + } + } + } + + match backup_lsn_range( + backup_lsn, + commit_lsn, + self.wal_seg_size, + &self.timeline_dir, + ) + .await + { + Ok(backup_lsn_result) => { + backup_lsn = backup_lsn_result; + self.timeline.set_wal_backup_lsn(backup_lsn_result); + retry_attempt = 0; + } + Err(e) => { + error!( + "failed while offloading range {}-{}: {:?}", + backup_lsn, commit_lsn, e + ); + + retry_attempt = min(retry_attempt + 1, u32::MAX); + } + } + } + } + } +} + +pub async fn backup_lsn_range( + start_lsn: Lsn, + end_lsn: Lsn, + wal_seg_size: usize, + timeline_dir: &Path, +) -> Result { + let mut res = start_lsn; + let segments = get_segments(start_lsn, end_lsn, wal_seg_size); + for s in &segments { + backup_single_segment(s, timeline_dir) + .await + .with_context(|| format!("offloading segno {}", s.seg_no))?; + + res = s.end_lsn; + } + info!( + "offloaded segnos {:?} up to {}, previous backup_lsn {}", + segments.iter().map(|&s| s.seg_no).collect::>(), + end_lsn, + start_lsn, + ); + Ok(res) +} + +async fn backup_single_segment(seg: &Segment, timeline_dir: &Path) -> Result<()> { + let segment_file_name = seg.file_path(timeline_dir)?; + + backup_object(&segment_file_name, seg.size()).await?; + debug!("Backup of {} done", segment_file_name.display()); + + Ok(()) +} + +#[derive(Debug, Copy, Clone)] +pub struct Segment { + seg_no: XLogSegNo, + start_lsn: Lsn, + end_lsn: Lsn, +} + +impl Segment { + pub fn new(seg_no: u64, start_lsn: Lsn, end_lsn: Lsn) -> Self { + Self { + seg_no, + start_lsn, + end_lsn, + } + } + + pub fn object_name(self) -> String { + XLogFileName(PG_TLI, self.seg_no, self.size()) + } + + pub fn file_path(self, timeline_dir: &Path) -> Result { + Ok(timeline_dir.join(self.object_name())) + } + + pub fn size(self) -> usize { + (u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize + } +} + +fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { + let first_seg = start.segment_number(seg_size); + let last_seg = end.segment_number(seg_size); + + let res: Vec = (first_seg..last_seg) + .map(|s| { + let start_lsn = XLogSegNoOffsetToRecPtr(s, 0, seg_size); + let end_lsn = XLogSegNoOffsetToRecPtr(s + 1, 0, seg_size); + Segment::new(s, Lsn::from(start_lsn), Lsn::from(end_lsn)) + }) + .collect(); + res +} + +static REMOTE_STORAGE: OnceCell> = OnceCell::new(); + +async fn backup_object(source_file: &Path, size: usize) -> Result<()> { + let storage = REMOTE_STORAGE.get().expect("failed to get remote storage"); + + let file = File::open(&source_file).await?; + + // Storage is initialized by launcher at ths point. + match storage.as_ref().unwrap() { + GenericRemoteStorage::Local(local_storage) => { + let destination = local_storage.remote_object_id(source_file)?; + + debug!( + "local upload about to start from {} to {}", + source_file.display(), + destination.display() + ); + local_storage.upload(file, size, &destination, None).await + } + GenericRemoteStorage::S3(s3_storage) => { + let s3key = s3_storage.remote_object_id(source_file)?; + + debug!( + "S3 upload about to start from {} to {:?}", + source_file.display(), + s3key + ); + s3_storage.upload(file, size, &s3key, None).await + } + }?; + + Ok(()) +} diff --git a/scripts/git-upload b/scripts/git-upload index 4649f6998d..a53987894a 100755 --- a/scripts/git-upload +++ b/scripts/git-upload @@ -80,12 +80,14 @@ class GitRepo: print('No changes detected, quitting') return - run([ + git_with_user = [ 'git', '-c', 'user.name=vipvap', '-c', 'user.email=vipvap@zenith.tech', + ] + run(git_with_user + [ 'commit', '--author="vipvap "', f'--message={message}', @@ -94,7 +96,7 @@ class GitRepo: for _ in range(5): try: run(['git', 'fetch', 'origin', branch]) - run(['git', 'rebase', f'origin/{branch}']) + run(git_with_user + ['rebase', f'origin/{branch}']) run(['git', 'push', 'origin', branch]) return diff --git a/test_runner/README.md b/test_runner/README.md index ee171ae6a0..059bbb83cc 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -51,7 +51,6 @@ Useful environment variables: should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. `ZENITH_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as -`FORCE_MOCK_S3`: inits every test's pageserver with a mock S3 used as a remote storage. `--pageserver-config-override=${value}` parameter values when zenith cli is invoked `RUST_LOG`: logging configuration to pass into Zenith CLI diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index c07b9d6dd1..5dbd6d2e26 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -10,13 +10,6 @@ from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserv # Create ancestor branches off the main branch. # def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): - - # Use safekeeper in this test to avoid a subtle race condition. - # Without safekeeper, walreceiver reconnection can stuck - # because of IO deadlock. - # - # See https://github.com/zenithdb/zenith/issues/1068 - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() # Override defaults, 1M gc_horizon and 4M checkpoint_distance. diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/batch_others/test_backpressure.py index 6658b337ec..81f45b749b 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/batch_others/test_backpressure.py @@ -94,7 +94,6 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv @pytest.mark.skip("See https://github.com/neondatabase/neon/issues/1587") def test_backpressure_received_lsn_lag(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() # Create a branch for us env.zenith_cli.create_branch('test_backpressure') diff --git a/test_runner/batch_others/test_basebackup_error.py b/test_runner/batch_others/test_basebackup_error.py new file mode 100644 index 0000000000..4b8b8a746c --- /dev/null +++ b/test_runner/batch_others/test_basebackup_error.py @@ -0,0 +1,20 @@ +import pytest +from contextlib import closing + +from fixtures.zenith_fixtures import ZenithEnv +from fixtures.log_helper import log + + +# +# Test error handling, if the 'basebackup' command fails in the middle +# of building the tar archive. +# +def test_basebackup_error(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + env.zenith_cli.create_branch("test_basebackup_error", "empty") + + # Introduce failpoint + env.pageserver.safe_psql(f"failpoints basebackup-before-control-file=return") + + with pytest.raises(Exception, match="basebackup-before-control-file"): + pg = env.postgres.create_start('test_basebackup_error') diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py index 17eadb33b4..f0aa44e0a4 100644 --- a/test_runner/batch_others/test_broken_timeline.py +++ b/test_runner/batch_others/test_broken_timeline.py @@ -1,6 +1,7 @@ import pytest +import concurrent.futures from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithEnv from fixtures.log_helper import log import os @@ -78,3 +79,37 @@ def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder): with pytest.raises(Exception, match="Cannot load local timeline") as err: pg.start() log.info(f'compute startup failed as expected: {err}') + + +def test_create_multiple_timelines_parallel(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + + tenant_id, _ = env.zenith_cli.create_tenant() + + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futures = [ + executor.submit(env.zenith_cli.create_timeline, + f"test-create-multiple-timelines-{i}", + tenant_id) for i in range(4) + ] + for future in futures: + future.result() + + +def test_fix_broken_timelines_on_startup(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + + tenant_id, _ = env.zenith_cli.create_tenant() + + # Introduce failpoint when creating a new timeline + env.pageserver.safe_psql(f"failpoints before-checkpoint-new-timeline=return") + with pytest.raises(Exception, match="before-checkpoint-new-timeline"): + _ = env.zenith_cli.create_timeline("test_fix_broken_timelines", tenant_id) + + # Restart the page server + env.zenith_cli.pageserver_stop(immediate=True) + env.zenith_cli.pageserver_start() + + # Check that the "broken" timeline is not loaded + timelines = env.zenith_cli.list_timelines(tenant_id) + assert len(timelines) == 1 diff --git a/test_runner/batch_others/test_next_xid.py b/test_runner/batch_others/test_next_xid.py index 03c27bcd70..1ab1addad3 100644 --- a/test_runner/batch_others/test_next_xid.py +++ b/test_runner/batch_others/test_next_xid.py @@ -6,8 +6,6 @@ from fixtures.zenith_fixtures import ZenithEnvBuilder # Test restarting page server, while safekeeper and compute node keep # running. def test_next_xid(zenith_env_builder: ZenithEnvBuilder): - # One safekeeper is enough for this test. - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() pg = env.postgres.create_start('main') diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 13f6ef358e..7fe3b4dff5 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,6 +1,12 @@ from uuid import uuid4, UUID import pytest -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient +from fixtures.zenith_fixtures import ( + DEFAULT_BRANCH_NAME, + ZenithEnv, + ZenithEnvBuilder, + ZenithPageserverHttpClient, + ZenithPageserverApiException, +) # test that we cannot override node id @@ -48,6 +54,39 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): assert local_timeline_details['timeline_state'] == 'Loaded' +def test_pageserver_http_get_wal_receiver_not_found(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + client = env.pageserver.http_client() + + tenant_id, timeline_id = env.zenith_cli.create_tenant() + + # no PG compute node is running, so no WAL receiver is running + with pytest.raises(ZenithPageserverApiException) as e: + _ = client.wal_receiver_get(tenant_id, timeline_id) + assert "Not Found" in str(e.value) + + +def test_pageserver_http_get_wal_receiver_success(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + client = env.pageserver.http_client() + + tenant_id, timeline_id = env.zenith_cli.create_tenant() + pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) + + res = client.wal_receiver_get(tenant_id, timeline_id) + assert list(res.keys()) == [ + "thread_id", + "wal_producer_connstr", + "last_received_msg_lsn", + "last_received_msg_ts", + ] + + # make a DB modification then expect getting a new WAL receiver's data + pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + res2 = client.wal_receiver_get(tenant_id, timeline_id) + assert res2["last_received_msg_lsn"] > res["last_received_msg_lsn"] + + def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): env = zenith_simple_env client = env.pageserver.http_client() diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py index 20e6f4467e..69f5ea85ce 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/batch_others/test_pageserver_restart.py @@ -5,8 +5,6 @@ from fixtures.log_helper import log # Test restarting page server, while safekeeper and compute node keep # running. def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder): - # One safekeeper is enough for this test. - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() env.zenith_cli.create_branch('test_pageserver_restart') diff --git a/test_runner/batch_others/test_recovery.py b/test_runner/batch_others/test_recovery.py index dbfa943a7a..eb1747efa5 100644 --- a/test_runner/batch_others/test_recovery.py +++ b/test_runner/batch_others/test_recovery.py @@ -45,14 +45,14 @@ def test_pageserver_recovery(zenith_env_builder: ZenithEnvBuilder): # Configure failpoints pscur.execute( - "failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=panic") + "failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=exit") # Do some updates until pageserver is crashed try: while True: cur.execute("update foo set x=x+1") except Exception as err: - log.info(f"Excepted server crash {err}") + log.info(f"Expected server crash {err}") log.info("Wait before server restart") env.pageserver.stop() diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index e205f79957..afbe3c55c7 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -6,7 +6,7 @@ from contextlib import closing from pathlib import Path import time from uuid import UUID -from fixtures.zenith_fixtures import ZenithEnvBuilder, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload +from fixtures.zenith_fixtures import ZenithEnvBuilder, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload from fixtures.log_helper import log from fixtures.utils import lsn_from_hex, lsn_to_hex import pytest @@ -32,7 +32,6 @@ import pytest @pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, storage_type: str): # zenith_env_builder.rust_log_override = 'debug' - zenith_env_builder.num_safekeepers = 1 if storage_type == 'local_fs': zenith_env_builder.enable_local_fs_remote_storage() elif storage_type == 'mock_s3': @@ -110,9 +109,9 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) log.info("waiting for timeline redownload") - wait_for(number_of_iterations=10, - interval=1, - func=lambda: assert_local(client, UUID(tenant_id), UUID(timeline_id))) + wait_until(number_of_iterations=10, + interval=1, + func=lambda: assert_local(client, UUID(tenant_id), UUID(timeline_id))) detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) assert detail['local'] is not None diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 279b3a0a25..91506e120d 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -3,12 +3,14 @@ import os import pathlib import subprocess import threading +import typing from uuid import UUID from fixtures.log_helper import log +from typing import Optional import signal import pytest -from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, ZenithPageserverHttpClient, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload, zenith_binpath, pg_distrib_dir +from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, Etcd, ZenithPageserverHttpClient, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload, zenith_binpath, pg_distrib_dir from fixtures.utils import lsn_from_hex @@ -21,7 +23,8 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, pageserver_bin: pathlib.Path, remote_storage_mock_path: pathlib.Path, pg_port: int, - http_port: int): + http_port: int, + broker: Optional[Etcd]): """ cannot use ZenithPageserver yet because it depends on zenith cli which currently lacks support for multiple pageservers @@ -38,6 +41,9 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", ] + if broker is not None: + cmd.append(f"-c broker_endpoints=['{broker.client_url()}']", ) + subprocess.check_output(cmd, text=True) # actually run new pageserver @@ -103,7 +109,6 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, port_distributor: PortDistributor, with_load: str): - zenith_env_builder.num_safekeepers = 1 zenith_env_builder.enable_local_fs_remote_storage() env = zenith_env_builder.init_start() @@ -180,12 +185,13 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, pageserver_bin, remote_storage_mock_path, new_pageserver_pg_port, - new_pageserver_http_port): + new_pageserver_http_port, + zenith_env_builder.broker): # call to attach timeline to new pageserver new_pageserver_http.timeline_attach(tenant, timeline) # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint - new_timeline_detail = wait_for( + new_timeline_detail = wait_until( number_of_iterations=5, interval=1, func=lambda: assert_local(new_pageserver_http, tenant, timeline)) diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index 1b593cfee3..9ccb8cf196 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -1,8 +1,12 @@ from contextlib import closing - +from datetime import datetime +import os import pytest from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.log_helper import log +from fixtures.metrics import parse_metrics +from fixtures.utils import lsn_to_hex @pytest.mark.parametrize('with_safekeepers', [False, True]) @@ -38,3 +42,79 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_safekeep cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") cur.execute("SELECT sum(key) FROM t") assert cur.fetchone() == (5000050000, ) + + +def test_metrics_normal_work(zenith_env_builder: ZenithEnvBuilder): + zenith_env_builder.num_safekeepers = 3 + + env = zenith_env_builder.init_start() + tenant_1, _ = env.zenith_cli.create_tenant() + tenant_2, _ = env.zenith_cli.create_tenant() + + timeline_1 = env.zenith_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_1) + timeline_2 = env.zenith_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_2) + + pg_tenant1 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_1) + pg_tenant2 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_2) + + for pg in [pg_tenant1, pg_tenant2]: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (5000050000, ) + + collected_metrics = { + "pageserver": env.pageserver.http_client().get_metrics(), + } + for sk in env.safekeepers: + collected_metrics[f'safekeeper{sk.id}'] = sk.http_client().get_metrics_str() + + for name in collected_metrics: + basepath = os.path.join(zenith_env_builder.repo_dir, f'{name}.metrics') + + with open(basepath, 'w') as stdout_f: + print(collected_metrics[name], file=stdout_f, flush=True) + + all_metrics = [parse_metrics(m, name) for name, m in collected_metrics.items()] + ps_metrics = all_metrics[0] + sk_metrics = all_metrics[1:] + + ttids = [{ + 'tenant_id': tenant_1.hex, 'timeline_id': timeline_1.hex + }, { + 'tenant_id': tenant_2.hex, 'timeline_id': timeline_2.hex + }] + + # Test metrics per timeline + for tt in ttids: + log.info(f"Checking metrics for {tt}") + + ps_lsn = int(ps_metrics.query_one("pageserver_last_record_lsn", filter=tt).value) + sk_lsns = [int(sk.query_one("safekeeper_commit_lsn", filter=tt).value) for sk in sk_metrics] + + log.info(f"ps_lsn: {lsn_to_hex(ps_lsn)}") + log.info(f"sk_lsns: {list(map(lsn_to_hex, sk_lsns))}") + + assert ps_lsn <= max(sk_lsns) + assert ps_lsn > 0 + + # Test common metrics + for metrics in all_metrics: + log.info(f"Checking common metrics for {metrics.name}") + + log.info( + f"process_cpu_seconds_total: {metrics.query_one('process_cpu_seconds_total').value}") + log.info(f"process_threads: {int(metrics.query_one('process_threads').value)}") + log.info( + f"process_resident_memory_bytes (MB): {metrics.query_one('process_resident_memory_bytes').value / 1024 / 1024}" + ) + log.info( + f"process_virtual_memory_bytes (MB): {metrics.query_one('process_virtual_memory_bytes').value / 1024 / 1024}" + ) + log.info(f"process_open_fds: {int(metrics.query_one('process_open_fds').value)}") + log.info(f"process_max_fds: {int(metrics.query_one('process_max_fds').value)}") + log.info( + f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}" + ) diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/batch_others/test_tenants_with_remote_storage.py new file mode 100644 index 0000000000..c00f077fcd --- /dev/null +++ b/test_runner/batch_others/test_tenants_with_remote_storage.py @@ -0,0 +1,97 @@ +# +# Little stress test for the checkpointing and remote storage code. +# +# The test creates several tenants, and runs a simple workload on +# each tenant, in parallel. The test uses remote storage, and a tiny +# checkpoint_distance setting so that a lot of layer files are created. +# + +import asyncio +from contextlib import closing +from uuid import UUID + +import pytest + +from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithEnv, Postgres, wait_for_last_record_lsn, wait_for_upload +from fixtures.utils import lsn_from_hex + + +async def tenant_workload(env: ZenithEnv, pg: Postgres): + pageserver_conn = await env.pageserver.connect_async() + + pg_conn = await pg.connect_async() + + tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant") + timeline_id = await pg_conn.fetchval("show zenith.zenith_timeline") + + await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") + for i in range(1, 100): + await pg_conn.execute( + f"INSERT INTO t SELECT {i}*1000 + g, 'payload' from generate_series(1,1000) g") + + # we rely upon autocommit after each statement + # as waiting for acceptors happens there + res = await pg_conn.fetchval("SELECT count(*) FROM t") + assert res == i * 1000 + + +async def all_tenants_workload(env: ZenithEnv, tenants_pgs): + workers = [] + for tenant, pg in tenants_pgs: + worker = tenant_workload(env, pg) + workers.append(asyncio.create_task(worker)) + + # await all workers + await asyncio.gather(*workers) + + +@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) +def test_tenants_many(zenith_env_builder: ZenithEnvBuilder, storage_type: str): + + if storage_type == 'local_fs': + zenith_env_builder.enable_local_fs_remote_storage() + elif storage_type == 'mock_s3': + zenith_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore') + else: + raise RuntimeError(f'Unknown storage type: {storage_type}') + + zenith_env_builder.enable_local_fs_remote_storage() + + env = zenith_env_builder.init_start() + + tenants_pgs = [] + + for i in range(1, 5): + # Use a tiny checkpoint distance, to create a lot of layers quickly + tenant, _ = env.zenith_cli.create_tenant( + conf={ + 'checkpoint_distance': '5000000', + }) + env.zenith_cli.create_timeline(f'test_tenants_many', tenant_id=tenant) + + pg = env.postgres.create_start( + f'test_tenants_many', + tenant_id=tenant, + ) + tenants_pgs.append((tenant, pg)) + + asyncio.run(all_tenants_workload(env, tenants_pgs)) + + # Wait for the remote storage uploads to finish + pageserver_http = env.pageserver.http_client() + for tenant, pg in tenants_pgs: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("show zenith.zenith_tenant") + tenant_id = cur.fetchone()[0] + cur.execute("show zenith.zenith_timeline") + timeline_id = cur.fetchone()[0] + cur.execute("SELECT pg_current_wal_flush_lsn()") + current_lsn = lsn_from_hex(cur.fetchone()[0]) + + # wait until pageserver receives all the data + wait_for_last_record_lsn(pageserver_http, UUID(tenant_id), UUID(timeline_id), current_lsn) + + # run final checkpoint manually to flush all the data to remote storage + env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + wait_for_upload(pageserver_http, UUID(tenant_id), UUID(timeline_id), current_lsn) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index db33493d61..0b33b56df3 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -70,7 +70,6 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() new_timeline_id = env.zenith_cli.create_branch('test_timeline_size_quota') diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 67c9d6070e..fc192c28e8 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -12,8 +12,8 @@ from contextlib import closing from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path -from fixtures.zenith_fixtures import PgBin, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol -from fixtures.utils import etcd_path, get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex +from fixtures.zenith_fixtures import PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol +from fixtures.utils import get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any @@ -22,7 +22,6 @@ from typing import List, Optional, Any # succeed and data is written def test_normal_work(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - zenith_env_builder.broker = True env = zenith_env_builder.init_start() env.zenith_cli.create_branch('test_safekeepers_normal_work') @@ -328,10 +327,8 @@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): # Test that safekeepers push their info to the broker and learn peer status from it -@pytest.mark.skipif(etcd_path() is None, reason="requires etcd which is not present in PATH") def test_broker(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - zenith_env_builder.broker = True zenith_env_builder.enable_local_fs_remote_storage() env = zenith_env_builder.init_start() @@ -371,10 +368,8 @@ def test_broker(zenith_env_builder: ZenithEnvBuilder): # Test that old WAL consumed by peers and pageserver is removed from safekeepers. -@pytest.mark.skipif(etcd_path() is None, reason="requires etcd which is not present in PATH") def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 2 - zenith_env_builder.broker = True # to advance remote_consistent_llsn zenith_env_builder.enable_local_fs_remote_storage() env = zenith_env_builder.init_start() @@ -406,7 +401,7 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): http_cli = env.safekeepers[0].http_client() # Pretend WAL is offloaded to s3. - http_cli.record_safekeeper_info(tenant_id, timeline_id, {'s3_wal_lsn': 'FFFFFFFF/FEFFFFFF'}) + http_cli.record_safekeeper_info(tenant_id, timeline_id, {'backup_lsn': 'FFFFFFFF/FEFFFFFF'}) # wait till first segment is removed on all safekeepers started_at = time.time() @@ -419,6 +414,56 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): time.sleep(0.5) +@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs']) +def test_wal_backup(zenith_env_builder: ZenithEnvBuilder, storage_type: str): + zenith_env_builder.num_safekeepers = 3 + if storage_type == 'local_fs': + zenith_env_builder.enable_local_fs_remote_storage() + elif storage_type == 'mock_s3': + zenith_env_builder.enable_s3_mock_remote_storage('test_safekeepers_wal_backup') + else: + raise RuntimeError(f'Unknown storage type: {storage_type}') + zenith_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER + + env = zenith_env_builder.init_start() + + env.zenith_cli.create_branch('test_safekeepers_wal_backup') + pg = env.postgres.create_start('test_safekeepers_wal_backup') + + # learn zenith timeline from compute + tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] + timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + + pg_conn = pg.connect() + cur = pg_conn.cursor() + cur.execute('create table t(key int, value text)') + + # Shut down subsequently each of safekeepers and fill a segment while sk is + # down; ensure segment gets offloaded by others. + offloaded_seg_end = ['0/2000000', '0/3000000', '0/4000000'] + for victim, seg_end in zip(env.safekeepers, offloaded_seg_end): + victim.stop() + # roughly fills one segment + cur.execute("insert into t select generate_series(1,250000), 'payload'") + live_sk = [sk for sk in env.safekeepers if sk != victim][0] + http_cli = live_sk.http_client() + + started_at = time.time() + while True: + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"live sk status is {tli_status}") + + if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end): + break + elapsed = time.time() - started_at + if elapsed > 20: + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s segment ending at {seg_end} get offloaded") + time.sleep(0.5) + + victim.start() + + class ProposerPostgres(PgProtocol): """Object for running postgres without ZenithEnv""" def __init__(self, @@ -557,8 +602,6 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): - - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() env.zenith_cli.create_branch('test_timeline_status') @@ -599,6 +642,9 @@ class SafekeeperEnv: num_safekeepers: int = 1): self.repo_dir = repo_dir self.port_distributor = port_distributor + self.broker = Etcd(datadir=os.path.join(self.repo_dir, "etcd"), + port=self.port_distributor.get_port(), + peer_port=self.port_distributor.get_port()) self.pg_bin = pg_bin self.num_safekeepers = num_safekeepers self.bin_safekeeper = os.path.join(str(zenith_binpath), 'safekeeper') @@ -645,6 +691,8 @@ class SafekeeperEnv: safekeeper_dir, "--id", str(i), + "--broker-endpoints", + self.broker.client_url(), "--daemonize" ] @@ -698,7 +746,6 @@ def test_safekeeper_without_pageserver(test_output_dir: str, repo_dir, port_distributor, pg_bin, - num_safekeepers=1, ) with env: diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index b0f34f4aae..f4aceac5e8 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -15,7 +15,6 @@ def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, pg_bin: PgBin, test_output_dir, port_distributor: PortDistributor): - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_wal_restore") pg = env.postgres.create_start('test_wal_restore') diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py index bff17fa679..103d51aae5 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_zenith_cli.py @@ -94,8 +94,6 @@ def test_cli_tenant_create(zenith_simple_env: ZenithEnv): def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): - # Start with single sk - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() # Connect to sk port on v4 loopback @@ -111,8 +109,6 @@ def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): def test_cli_start_stop(zenith_env_builder: ZenithEnvBuilder): - # Start with single sk - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() # Stop default ps/sk diff --git a/test_runner/batch_pg_regress/test_isolation.py b/test_runner/batch_pg_regress/test_isolation.py index cde56d9b88..7c99c04fe3 100644 --- a/test_runner/batch_pg_regress/test_isolation.py +++ b/test_runner/batch_pg_regress/test_isolation.py @@ -1,9 +1,12 @@ import os - +import pytest from fixtures.utils import mkdir_if_needed from fixtures.zenith_fixtures import ZenithEnv, base_dir, pg_distrib_dir +# The isolation tests run for a long time, especially in debug mode, +# so use a larger-than-default timeout. +@pytest.mark.timeout(1800) def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys): env = zenith_simple_env diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py index 07d2574f4a..be7776113a 100644 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ b/test_runner/batch_pg_regress/test_pg_regress.py @@ -1,9 +1,12 @@ import os - +import pytest from fixtures.utils import mkdir_if_needed from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content, base_dir, pg_distrib_dir +# The pg_regress tests run for a long time, especially in debug mode, +# so use a larger-than-default timeout. +@pytest.mark.timeout(1800) def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin, capsys): env = zenith_simple_env diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 0735f16d73..5fc6076f51 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -236,14 +236,14 @@ class ZenithBenchmarker: """ Fetch the "cumulative # of bytes written" metric from the pageserver """ - metric_name = r'pageserver_disk_io_bytes{io_operation="write"}' + metric_name = r'libmetrics_disk_io_bytes_total{io_operation="write"}' return self.get_int_counter_value(pageserver, metric_name) def get_peak_mem(self, pageserver) -> int: """ Fetch the "maxrss" metric from the pageserver """ - metric_name = r'pageserver_maxrss_kb' + metric_name = r'libmetrics_maxrss_kb' return self.get_int_counter_value(pageserver, metric_name) def get_int_counter_value(self, pageserver, metric_name) -> int: diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py new file mode 100644 index 0000000000..6fc62c6ea9 --- /dev/null +++ b/test_runner/fixtures/metrics.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass +from prometheus_client.parser import text_string_to_metric_families +from prometheus_client.samples import Sample +from typing import Dict, List +from collections import defaultdict + +from fixtures.log_helper import log + + +class Metrics: + metrics: Dict[str, List[Sample]] + name: str + + def __init__(self, name: str = ""): + self.metrics = defaultdict(list) + self.name = name + + def query_all(self, name: str, filter: Dict[str, str]) -> List[Sample]: + res = [] + for sample in self.metrics[name]: + if all(sample.labels[k] == v for k, v in filter.items()): + res.append(sample) + return res + + def query_one(self, name: str, filter: Dict[str, str] = {}) -> Sample: + res = self.query_all(name, filter) + assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}" + return res[0] + + +def parse_metrics(text: str, name: str = ""): + metrics = Metrics(name) + gen = text_string_to_metric_families(text) + for family in gen: + for sample in family.samples: + metrics.metrics[sample.name].append(sample) + + return metrics diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 7b95e729d9..ba9bc6e113 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,8 +1,9 @@ import os import shutil import subprocess +from pathlib import Path -from typing import Any, List +from typing import Any, List, Optional from fixtures.log_helper import log @@ -80,9 +81,12 @@ def print_gc_result(row): .format_map(row)) -# path to etcd binary or None if not present. -def etcd_path(): - return shutil.which("etcd") +def etcd_path() -> Path: + path_output = shutil.which("etcd") + if path_output is None: + raise RuntimeError('etcd not found in PATH') + else: + return Path(path_output) # Traverse directory to get total size. diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 357db4c16d..a2e8c82d30 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1,6 +1,7 @@ from __future__ import annotations from dataclasses import field +from enum import Flag, auto import textwrap from cached_property import cached_property import asyncpg @@ -34,7 +35,12 @@ from typing_extensions import Literal import requests import backoff # type: ignore -from .utils import (etcd_path, get_self_dir, mkdir_if_needed, subprocess_capture, lsn_from_hex) +from .utils import (etcd_path, + get_self_dir, + mkdir_if_needed, + subprocess_capture, + lsn_from_hex, + lsn_to_hex) from fixtures.log_helper import log """ This file contains pytest fixtures. A fixture is a test resource that can be @@ -61,7 +67,7 @@ DEFAULT_POSTGRES_DIR = 'tmp_install' DEFAULT_BRANCH_NAME = 'main' BASE_PORT = 15000 -WORKER_PORT_NUM = 100 +WORKER_PORT_NUM = 1000 def pytest_addoption(parser): @@ -178,7 +184,7 @@ def shareable_scope(fixture_name, config) -> Literal["session", "function"]: return 'function' if os.environ.get('TEST_SHARED_FIXTURES') is None else 'session' -@pytest.fixture(scope=shareable_scope) +@pytest.fixture(scope='session') def worker_seq_no(worker_id: str): # worker_id is a pytest-xdist fixture # it can be master or gw @@ -189,7 +195,7 @@ def worker_seq_no(worker_id: str): return int(worker_id[2:]) -@pytest.fixture(scope=shareable_scope) +@pytest.fixture(scope='session') def worker_base_port(worker_seq_no: int): # so we divide ports in ranges of 100 ports # so workers have disjoint set of ports for services @@ -242,11 +248,30 @@ class PortDistributor: 'port range configured for test is exhausted, consider enlarging the range') -@pytest.fixture(scope=shareable_scope) +@pytest.fixture(scope='session') def port_distributor(worker_base_port): return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) +@pytest.fixture(scope='session') +def default_broker(request: Any, port_distributor: PortDistributor): + client_port = port_distributor.get_port() + # multiple pytest sessions could get launched in parallel, get them different datadirs + etcd_datadir = os.path.join(get_test_output_dir(request), f"etcd_datadir_{client_port}") + pathlib.Path(etcd_datadir).mkdir(exist_ok=True, parents=True) + + broker = Etcd(datadir=etcd_datadir, port=client_port, peer_port=port_distributor.get_port()) + yield broker + broker.stop() + + +@pytest.fixture(scope='session') +def mock_s3_server(port_distributor: PortDistributor): + mock_s3_server = MockS3Server(port_distributor.get_port()) + yield mock_s3_server + mock_s3_server.kill() + + class PgProtocol: """ Reusable connection logic """ def __init__(self, **kwargs): @@ -369,7 +394,10 @@ class MockS3Server: ): self.port = port - self.subprocess = subprocess.Popen([f'poetry run moto_server s3 -p{port}'], shell=True) + # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. + # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux + # if a process is started from the shell process. + self.subprocess = subprocess.Popen(['poetry', 'run', 'moto_server', 's3', f'-p{port}']) error = None try: return_code = self.subprocess.poll() @@ -379,7 +407,7 @@ class MockS3Server: error = f"expected mock s3 server to start but it failed with exception: {e}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'" if error is not None: log.error(error) - self.subprocess.kill() + self.kill() raise RuntimeError("failed to start s3 mock server") def endpoint(self) -> str: @@ -394,10 +422,51 @@ class MockS3Server: def secret_key(self) -> str: return 'test' + def access_env_vars(self) -> Dict[Any, Any]: + return { + 'AWS_ACCESS_KEY_ID': self.access_key(), + 'AWS_SECRET_ACCESS_KEY': self.secret_key(), + } + def kill(self): self.subprocess.kill() +@dataclass +class LocalFsStorage: + local_path: Path + + +@dataclass +class S3Storage: + bucket_name: str + bucket_region: str + endpoint: Optional[str] + + +RemoteStorage = Union[LocalFsStorage, S3Storage] + + +# serialize as toml inline table +def remote_storage_to_toml_inline_table(remote_storage): + if isinstance(remote_storage, LocalFsStorage): + res = f"local_path='{remote_storage.local_path}'" + elif isinstance(remote_storage, S3Storage): + res = f"bucket_name='{remote_storage.bucket_name}', bucket_region='{remote_storage.bucket_region}'" + if remote_storage.endpoint is not None: + res += f", endpoint='{remote_storage.endpoint}'" + else: + raise Exception(f'Unknown storage configuration {remote_storage}') + else: + raise Exception("invalid remote storage type") + return f"{{{res}}}" + + +class RemoteStorageUsers(Flag): + PAGESERVER = auto() + SAFEKEEPER = auto() + + class ZenithEnvBuilder: """ Builder object to create a Zenith runtime environment @@ -410,31 +479,28 @@ class ZenithEnvBuilder: def __init__(self, repo_dir: Path, port_distributor: PortDistributor, - pageserver_remote_storage: Optional[RemoteStorage] = None, + broker: Etcd, + mock_s3_server: MockS3Server, + remote_storage: Optional[RemoteStorage] = None, + remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, pageserver_config_override: Optional[str] = None, - num_safekeepers: int = 0, + num_safekeepers: int = 1, pageserver_auth_enabled: bool = False, rust_log_override: Optional[str] = None, - default_branch_name=DEFAULT_BRANCH_NAME, - broker: bool = False): + default_branch_name=DEFAULT_BRANCH_NAME): self.repo_dir = repo_dir self.rust_log_override = rust_log_override self.port_distributor = port_distributor - self.pageserver_remote_storage = pageserver_remote_storage + self.remote_storage = remote_storage + self.remote_storage_users = remote_storage_users + self.broker = broker + self.mock_s3_server = mock_s3_server self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers self.pageserver_auth_enabled = pageserver_auth_enabled self.default_branch_name = default_branch_name - self.broker = broker self.env: Optional[ZenithEnv] = None - self.s3_mock_server: Optional[MockS3Server] = None - - if os.getenv('FORCE_MOCK_S3') is not None: - bucket_name = f'{repo_dir.name}_bucket' - log.warning(f'Unconditionally initializing mock S3 server for bucket {bucket_name}') - self.enable_s3_mock_remote_storage(bucket_name) - def init(self) -> ZenithEnv: # Cannot create more than one environment from one builder assert self.env is None, "environment already initialized" @@ -455,9 +521,8 @@ class ZenithEnvBuilder: """ def enable_local_fs_remote_storage(self, force_enable=True): - assert force_enable or self.pageserver_remote_storage is None, "remote storage is enabled already" - self.pageserver_remote_storage = LocalFsStorage( - Path(self.repo_dir / 'local_fs_remote_storage')) + assert force_enable or self.remote_storage is None, "remote storage is enabled already" + self.remote_storage = LocalFsStorage(Path(self.repo_dir / 'local_fs_remote_storage')) """ Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already. @@ -466,22 +531,19 @@ class ZenithEnvBuilder: """ def enable_s3_mock_remote_storage(self, bucket_name: str, force_enable=True): - assert force_enable or self.pageserver_remote_storage is None, "remote storage is enabled already" - if not self.s3_mock_server: - self.s3_mock_server = MockS3Server(self.port_distributor.get_port()) - - mock_endpoint = self.s3_mock_server.endpoint() - mock_region = self.s3_mock_server.region() + assert force_enable or self.remote_storage is None, "remote storage is enabled already" + mock_endpoint = self.mock_s3_server.endpoint() + mock_region = self.mock_s3_server.region() boto3.client( 's3', endpoint_url=mock_endpoint, region_name=mock_region, - aws_access_key_id=self.s3_mock_server.access_key(), - aws_secret_access_key=self.s3_mock_server.secret_key(), + aws_access_key_id=self.mock_s3_server.access_key(), + aws_secret_access_key=self.mock_s3_server.secret_key(), ).create_bucket(Bucket=bucket_name) - self.pageserver_remote_storage = S3Storage(bucket=bucket_name, - endpoint=mock_endpoint, - region=mock_region) + self.remote_storage = S3Storage(bucket_name=bucket_name, + endpoint=mock_endpoint, + bucket_region=mock_region) def __enter__(self): return self @@ -495,10 +557,6 @@ class ZenithEnvBuilder: for sk in self.env.safekeepers: sk.stop(immediate=True) self.env.pageserver.stop(immediate=True) - if self.s3_mock_server: - self.s3_mock_server.kill() - if self.env.broker is not None: - self.env.broker.stop() class ZenithEnv: @@ -537,10 +595,13 @@ class ZenithEnv: self.repo_dir = config.repo_dir self.rust_log_override = config.rust_log_override self.port_distributor = config.port_distributor - self.s3_mock_server = config.s3_mock_server + self.s3_mock_server = config.mock_s3_server self.zenith_cli = ZenithCli(env=self) self.postgres = PostgresFactory(self) self.safekeepers: List[Safekeeper] = [] + self.broker = config.broker + self.remote_storage = config.remote_storage + self.remote_storage_users = config.remote_storage_users # generate initial tenant ID here instead of letting 'zenith init' generate it, # so that we don't need to dig it out of the config file afterwards. @@ -551,14 +612,10 @@ class ZenithEnv: default_tenant_id = '{self.initial_tenant.hex}' """) - self.broker = None - if config.broker: - # keep etcd datadir inside 'repo' - self.broker = Etcd(datadir=os.path.join(self.repo_dir, "etcd"), - port=self.port_distributor.get_port(), - peer_port=self.port_distributor.get_port()) - toml += textwrap.dedent(f""" - broker_endpoints = 'http://127.0.0.1:{self.broker.port}' + toml += textwrap.dedent(f""" + [etcd_broker] + broker_endpoints = ['{self.broker.client_url()}'] + etcd_binary_path = '{self.broker.binary_path}' """) # Create config for pageserver @@ -579,7 +636,6 @@ class ZenithEnv: # Create a corresponding ZenithPageserver object self.pageserver = ZenithPageserver(self, port=pageserver_port, - remote_storage=config.pageserver_remote_storage, config_override=config.pageserver_config_override) # Create config and a Safekeeper object for each safekeeper @@ -594,8 +650,12 @@ class ZenithEnv: id = {id} pg_port = {port.pg} http_port = {port.http} - sync = false # Disable fsyncs to make the tests go faster - """) + sync = false # Disable fsyncs to make the tests go faster""") + if bool(self.remote_storage_users + & RemoteStorageUsers.SAFEKEEPER) and self.remote_storage is not None: + toml += textwrap.dedent(f""" + remote_storage = "{remote_storage_to_toml_inline_table(self.remote_storage)}" + """) safekeeper = Safekeeper(env=self, id=id, port=port) self.safekeepers.append(safekeeper) @@ -603,15 +663,13 @@ class ZenithEnv: self.zenith_cli.init(toml) def start(self): - # Start up the page server, all the safekeepers and the broker + # Start up broker, pageserver and all safekeepers + self.broker.try_start() self.pageserver.start() for safekeeper in self.safekeepers: safekeeper.start() - if self.broker is not None: - self.broker.start() - def get_safekeeper_connstrs(self) -> str: """ Get list of safekeeper endpoints suitable for safekeepers GUC """ return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers]) @@ -624,9 +682,12 @@ class ZenithEnv: @pytest.fixture(scope=shareable_scope) -def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]: +def _shared_simple_env(request: Any, + port_distributor: PortDistributor, + mock_s3_server: MockS3Server, + default_broker: Etcd) -> Iterator[ZenithEnv]: """ - Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES + # Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES is set, this is shared by all tests using `zenith_simple_env`. """ @@ -638,7 +699,8 @@ def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]: repo_dir = os.path.join(str(top_output_dir), "shared_repo") shutil.rmtree(repo_dir, ignore_errors=True) - with ZenithEnvBuilder(Path(repo_dir), port_distributor) as builder: + with ZenithEnvBuilder(Path(repo_dir), port_distributor, default_broker, + mock_s3_server) as builder: env = builder.init_start() # For convenience in tests, create a branch from the freshly-initialized cluster. @@ -660,12 +722,13 @@ def zenith_simple_env(_shared_simple_env: ZenithEnv) -> Iterator[ZenithEnv]: yield _shared_simple_env _shared_simple_env.postgres.stop_all() - if _shared_simple_env.s3_mock_server: - _shared_simple_env.s3_mock_server.kill() @pytest.fixture(scope='function') -def zenith_env_builder(test_output_dir, port_distributor) -> Iterator[ZenithEnvBuilder]: +def zenith_env_builder(test_output_dir, + port_distributor: PortDistributor, + mock_s3_server: MockS3Server, + default_broker: Etcd) -> Iterator[ZenithEnvBuilder]: """ Fixture to create a Zenith environment for test. @@ -683,7 +746,8 @@ def zenith_env_builder(test_output_dir, port_distributor) -> Iterator[ZenithEnvB repo_dir = os.path.join(test_output_dir, "repo") # Return the builder to the caller - with ZenithEnvBuilder(Path(repo_dir), port_distributor) as builder: + with ZenithEnvBuilder(Path(repo_dir), port_distributor, default_broker, + mock_s3_server) as builder: yield builder @@ -786,6 +850,15 @@ class ZenithPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def wal_receiver_get(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/wal_receiver" + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) @@ -798,20 +871,6 @@ class PageserverPort: http: int -@dataclass -class LocalFsStorage: - root: Path - - -@dataclass -class S3Storage: - bucket: str - region: str - endpoint: Optional[str] - - -RemoteStorage = Union[LocalFsStorage, S3Storage] - CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", re.MULTILINE) CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", @@ -971,9 +1030,11 @@ class ZenithCli: cmd = ['init', f'--config={tmp.name}'] if initial_timeline_id: cmd.extend(['--timeline-id', initial_timeline_id.hex]) - append_pageserver_param_overrides(cmd, - self.env.pageserver.remote_storage, - self.env.pageserver.config_override) + append_pageserver_param_overrides( + params_to_update=cmd, + remote_storage=self.env.remote_storage, + remote_storage_users=self.env.remote_storage_users, + pageserver_config_override=self.env.pageserver.config_override) res = self.raw_cli(cmd) res.check_returncode() @@ -994,16 +1055,13 @@ class ZenithCli: def pageserver_start(self, overrides=()) -> 'subprocess.CompletedProcess[str]': start_args = ['pageserver', 'start', *overrides] - append_pageserver_param_overrides(start_args, - self.env.pageserver.remote_storage, - self.env.pageserver.config_override) + append_pageserver_param_overrides( + params_to_update=start_args, + remote_storage=self.env.remote_storage, + remote_storage_users=self.env.remote_storage_users, + pageserver_config_override=self.env.pageserver.config_override) - s3_env_vars = None - if self.env.s3_mock_server: - s3_env_vars = { - 'AWS_ACCESS_KEY_ID': self.env.s3_mock_server.access_key(), - 'AWS_SECRET_ACCESS_KEY': self.env.s3_mock_server.secret_key(), - } + s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None return self.raw_cli(start_args, extra_env_vars=s3_env_vars) def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]': @@ -1015,7 +1073,8 @@ class ZenithCli: return self.raw_cli(cmd) def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]': - return self.raw_cli(['safekeeper', 'start', str(id)]) + s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None + return self.raw_cli(['safekeeper', 'start', str(id)], extra_env_vars=s3_env_vars) def safekeeper_stop(self, id: Optional[int] = None, @@ -1166,16 +1225,11 @@ class ZenithPageserver(PgProtocol): Initializes the repository via `zenith init`. """ - def __init__(self, - env: ZenithEnv, - port: PageserverPort, - remote_storage: Optional[RemoteStorage] = None, - config_override: Optional[str] = None): + def __init__(self, env: ZenithEnv, port: PageserverPort, config_override: Optional[str] = None): super().__init__(host='localhost', port=port.pg, user='zenith_admin') self.env = env self.running = False self.service_port = port - self.remote_storage = remote_storage self.config_override = config_override def start(self, overrides=()) -> 'ZenithPageserver': @@ -1215,23 +1269,14 @@ class ZenithPageserver(PgProtocol): def append_pageserver_param_overrides( params_to_update: List[str], - pageserver_remote_storage: Optional[RemoteStorage], + remote_storage: Optional[RemoteStorage], + remote_storage_users: RemoteStorageUsers, pageserver_config_override: Optional[str] = None, ): - if pageserver_remote_storage is not None: - if isinstance(pageserver_remote_storage, LocalFsStorage): - pageserver_storage_override = f"local_path='{pageserver_remote_storage.root}'" - elif isinstance(pageserver_remote_storage, S3Storage): - pageserver_storage_override = f"bucket_name='{pageserver_remote_storage.bucket}',\ - bucket_region='{pageserver_remote_storage.region}'" - - if pageserver_remote_storage.endpoint is not None: - pageserver_storage_override += f",endpoint='{pageserver_remote_storage.endpoint}'" - - else: - raise Exception(f'Unknown storage configuration {pageserver_remote_storage}') + if bool(remote_storage_users & RemoteStorageUsers.PAGESERVER) and remote_storage is not None: + remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) params_to_update.append( - f'--pageserver-config-override=remote_storage={{{pageserver_storage_override}}}') + f'--pageserver-config-override=remote_storage={remote_storage_toml_table}') env_overrides = os.getenv('ZENITH_PAGESERVER_OVERRIDES') if env_overrides is not None: @@ -1765,8 +1810,9 @@ class Safekeeper: class SafekeeperTimelineStatus: acceptor_epoch: int flush_lsn: str - remote_consistent_lsn: str timeline_start_lsn: str + backup_lsn: str + remote_consistent_lsn: str @dataclass @@ -1791,8 +1837,9 @@ class SafekeeperHttpClient(requests.Session): resj = res.json() return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'], flush_lsn=resj['flush_lsn'], - remote_consistent_lsn=resj['remote_consistent_lsn'], - timeline_start_lsn=resj['timeline_start_lsn']) + timeline_start_lsn=resj['timeline_start_lsn'], + backup_lsn=resj['backup_lsn'], + remote_consistent_lsn=resj['remote_consistent_lsn']) def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body): res = self.post( @@ -1815,10 +1862,13 @@ class SafekeeperHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def get_metrics(self) -> SafekeeperMetrics: + def get_metrics_str(self) -> str: request_result = self.get(f"http://localhost:{self.port}/metrics") request_result.raise_for_status() - all_metrics_text = request_result.text + return request_result.text + + def get_metrics(self) -> SafekeeperMetrics: + all_metrics_text = self.get_metrics_str() metrics = SafekeeperMetrics() for match in re.finditer( @@ -1840,26 +1890,40 @@ class Etcd: datadir: str port: int peer_port: int + binary_path: Path = etcd_path() handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon + def client_url(self): + return f'http://127.0.0.1:{self.port}' + def check_status(self): s = requests.Session() s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry - s.get(f"http://localhost:{self.port}/health").raise_for_status() + s.get(f"{self.client_url()}/health").raise_for_status() + + def try_start(self): + if self.handle is not None: + log.debug(f'etcd is already running on port {self.port}') + return - def start(self): pathlib.Path(self.datadir).mkdir(exist_ok=True) - etcd_full_path = etcd_path() - if etcd_full_path is None: - raise Exception('etcd not found') + if not self.binary_path.is_file(): + raise RuntimeError(f"etcd broker binary '{self.binary_path}' is not a file") + + client_url = self.client_url() + log.info(f'Starting etcd to listen incoming connections at "{client_url}"') with open(os.path.join(self.datadir, "etcd.log"), "wb") as log_file: args = [ - etcd_full_path, + self.binary_path, f"--data-dir={self.datadir}", - f"--listen-client-urls=http://localhost:{self.port}", - f"--advertise-client-urls=http://localhost:{self.port}", - f"--listen-peer-urls=http://localhost:{self.peer_port}" + f"--listen-client-urls={client_url}", + f"--advertise-client-urls={client_url}", + f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}", + # Set --quota-backend-bytes to keep the etcd virtual memory + # size smaller. Our test etcd clusters are very small. + # See https://github.com/etcd-io/etcd/issues/7910 + f"--quota-backend-bytes=100000000" ] self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file) @@ -1911,7 +1975,12 @@ def test_output_dir(request: Any) -> str: return test_dir -SKIP_DIRS = frozenset(('pg_wal', 'pg_stat', 'pg_stat_tmp', 'pg_subtrans', 'pg_logical')) +SKIP_DIRS = frozenset(('pg_wal', + 'pg_stat', + 'pg_stat_tmp', + 'pg_subtrans', + 'pg_logical', + 'pg_replslot/wal_proposer_slot')) SKIP_FILES = frozenset(('pg_internal.init', 'pg.log', @@ -2037,7 +2106,11 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos assert (mismatch, error) == ([], []) -def wait_for(number_of_iterations: int, interval: int, func): +def wait_until(number_of_iterations: int, interval: int, func): + """ + Wait until 'func' returns successfully, without exception. Returns the last return value + from the the function. + """ last_exception = None for i in range(number_of_iterations): try: @@ -2064,9 +2137,15 @@ def remote_consistent_lsn(pageserver_http_client: ZenithPageserverHttpClient, timeline: uuid.UUID) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) - lsn_str = detail['remote']['remote_consistent_lsn'] - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) + if detail['remote'] is None: + # No remote information at all. This happens right after creating + # a timeline, before any part of it it has been uploaded to remote + # storage yet. + return 0 + else: + lsn_str = detail['remote']['remote_consistent_lsn'] + assert isinstance(lsn_str, str) + return lsn_from_hex(lsn_str) def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient, @@ -2074,8 +2153,15 @@ def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient, timeline: uuid.UUID, lsn: int): """waits for local timeline upload up to specified lsn""" - - wait_for(10, 1, lambda: remote_consistent_lsn(pageserver_http_client, tenant, timeline) >= lsn) + for i in range(10): + current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) + if current_lsn >= lsn: + return + log.info("waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1)) + time.sleep(1) + raise Exception("timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn))) def last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, @@ -2093,5 +2179,12 @@ def wait_for_last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, timeline: uuid.UUID, lsn: int): """waits for pageserver to catch up to a certain lsn""" - - wait_for(10, 1, lambda: last_record_lsn(pageserver_http_client, tenant, timeline) >= lsn) + for i in range(10): + current_lsn = last_record_lsn(pageserver_http_client, tenant, timeline) + if current_lsn >= lsn: + return + log.info("waiting for last_record_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1)) + time.sleep(1) + raise Exception("timed out while waiting for last_record_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn))) diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index e30912ce32..53b6a3a4fc 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -1,9 +1,11 @@ +import pytest from contextlib import closing - from fixtures.zenith_fixtures import ZenithEnvBuilder from fixtures.benchmark_fixture import ZenithBenchmarker +# This test sometimes runs for longer than the global 5 minute timeout. +@pytest.mark.timeout(600) def test_startup(zenith_env_builder: ZenithEnvBuilder, zenbenchmark: ZenithBenchmarker): zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() diff --git a/vendor/postgres b/vendor/postgres index 1db115cecb..038b2b98e5 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 1db115cecb3dbc2a74c5efa964fdf3a8a341c4d2 +Subproject commit 038b2b98e5c3d6274cbd43e9b822cdd946cb8b91