diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 00a51eb906..0000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,369 +0,0 @@ -version: 2.1 - -executors: - neon-xlarge-executor: - resource_class: xlarge - docker: - # NB: when changed, do not forget to update rust image tag in all Dockerfiles - - image: neondatabase/rust:1.58 - neon-executor: - docker: - - image: neondatabase/rust:1.58 - -jobs: - # A job to build postgres - build-postgres: - executor: neon-xlarge-executor - parameters: - build_type: - type: enum - enum: ["debug", "release"] - environment: - BUILD_TYPE: << parameters.build_type >> - steps: - # Checkout the git repo (circleci doesn't have a flag to enable submodules here) - - checkout - - # Grab the postgres git revision to build a cache key. - # Append makefile as it could change the way postgres is built. - # Note this works even though the submodule hasn't been checkout out yet. - - run: - name: Get postgres cache key - command: | - git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres - cat Makefile >> /tmp/cache-key-postgres - - - restore_cache: - name: Restore postgres cache - keys: - # Restore ONLY if the rev key matches exactly - - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - - # Build postgres if the restore_cache didn't find a build. - # `make` can't figure out whether the cache is valid, since - # it only compares file timestamps. - - run: - name: build postgres - command: | - if [ ! 
-e tmp_install/bin/postgres ]; then - # "depth 1" saves some time by not cloning the whole repo - git submodule update --init --depth 1 - # bail out on any warnings - COPT='-Werror' mold -run make postgres -j$(nproc) - fi - - - save_cache: - name: Save postgres cache - key: v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - paths: - - tmp_install - - # A job to build Neon rust code - build-neon: - executor: neon-xlarge-executor - parameters: - build_type: - type: enum - enum: ["debug", "release"] - environment: - BUILD_TYPE: << parameters.build_type >> - steps: - # Checkout the git repo (without submodules) - - checkout - - # Grab the postgres git revision to build a cache key. - # Append makefile as it could change the way postgres is built. - # Note this works even though the submodule hasn't been checkout out yet. - - run: - name: Get postgres cache key - command: | - git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres - cat Makefile >> /tmp/cache-key-postgres - - - - restore_cache: - name: Restore postgres cache - keys: - # Restore ONLY if the rev key matches exactly - - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - - - restore_cache: - name: Restore rust cache - keys: - # Require an exact match. While an out of date cache might speed up the build, - # there's no way to clean out old packages, so the cache grows every time something - # changes. 
- - v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} - - # Build the rust code, including test binaries - - run: - name: Rust build << parameters.build_type >> - command: | - if [[ $BUILD_TYPE == "debug" ]]; then - CARGO_FLAGS= - elif [[ $BUILD_TYPE == "release" ]]; then - CARGO_FLAGS="--release --features profiling" - fi - - export CARGO_INCREMENTAL=0 - export CACHEPOT_BUCKET=zenith-rust-cachepot - export RUSTC_WRAPPER="" - export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" - export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" - mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests - cachepot -s - - - save_cache: - name: Save rust cache - key: v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} - paths: - - ~/.cargo/registry - - ~/.cargo/git - - target - - # Run rust unit tests - - run: - name: cargo test - command: | - if [[ $BUILD_TYPE == "debug" ]]; then - CARGO_FLAGS= - elif [[ $BUILD_TYPE == "release" ]]; then - CARGO_FLAGS=--release - fi - - cargo test $CARGO_FLAGS - - # Install the rust binaries, for use by test jobs - - run: - name: Install rust binaries - command: | - binaries=$( - cargo metadata --format-version=1 --no-deps | - jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' - ) - - mkdir -p /tmp/zenith/bin - mkdir -p /tmp/zenith/test_bin - mkdir -p /tmp/zenith/etc - - # Install target binaries - for bin in $binaries; do - SRC=target/$BUILD_TYPE/$bin - DST=/tmp/zenith/bin/$bin - cp $SRC $DST - done - - # Install the postgres binaries, for use by test jobs - - run: - name: Install postgres binaries - command: | - cp -a tmp_install /tmp/zenith/pg_install - - # Save rust binaries for other jobs in the workflow - - persist_to_workspace: - root: /tmp/zenith - paths: - - "*" - - check-codestyle-python: - executor: neon-executor - steps: - - checkout - - restore_cache: - keys: - - v2-python-deps-{{ checksum "poetry.lock" }} - - run: - name: Install deps 
- command: ./scripts/pysync - - save_cache: - key: v2-python-deps-{{ checksum "poetry.lock" }} - paths: - - /home/circleci/.cache/pypoetry/virtualenvs - - run: - name: Print versions - when: always - command: | - poetry run python --version - poetry show - - run: - name: Run yapf to ensure code format - when: always - command: poetry run yapf --recursive --diff . - - run: - name: Run mypy to check types - when: always - command: poetry run mypy . - - run-pytest: - executor: neon-executor - parameters: - # pytest args to specify the tests to run. - # - # This can be a test file name, e.g. 'test_pgbench.py, or a subdirectory, - # or '-k foobar' to run tests containing string 'foobar'. See pytest man page - # section SPECIFYING TESTS / SELECTING TESTS for details. - # - # Select the type of Rust build. Must be "release" or "debug". - build_type: - type: string - default: "debug" - # This parameter is required, to prevent the mistake of running all tests in one job. - test_selection: - type: string - default: "" - # Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr - extra_params: - type: string - default: "" - needs_postgres_source: - type: boolean - default: false - run_in_parallel: - type: boolean - default: true - save_perf_report: - type: boolean - default: false - environment: - BUILD_TYPE: << parameters.build_type >> - steps: - - attach_workspace: - at: /tmp/zenith - - checkout - - when: - condition: << parameters.needs_postgres_source >> - steps: - - run: git submodule update --init --depth 1 - - restore_cache: - keys: - - v2-python-deps-{{ checksum "poetry.lock" }} - - run: - name: Install deps - command: ./scripts/pysync - - save_cache: - key: v2-python-deps-{{ checksum "poetry.lock" }} - paths: - - /home/circleci/.cache/pypoetry/virtualenvs - - run: - name: Run pytest - # pytest doesn't output test logs in real time, so CI job may fail with - # `Too long with no output` error, if a test is running for a long time. 
- # In that case, tests should have internal timeouts that are less than - # no_output_timeout, specified here. - no_output_timeout: 10m - environment: - - NEON_BIN: /tmp/zenith/bin - - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install - - TEST_OUTPUT: /tmp/test_output - # this variable will be embedded in perf test report - # and is needed to distinguish different environments - - PLATFORM: zenith-local-ci - command: | - PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)" - rm -rf $PERF_REPORT_DIR - - TEST_SELECTION="test_runner/<< parameters.test_selection >>" - EXTRA_PARAMS="<< parameters.extra_params >>" - if [ -z "$TEST_SELECTION" ]; then - echo "test_selection must be set" - exit 1 - fi - if << parameters.run_in_parallel >>; then - EXTRA_PARAMS="-n4 $EXTRA_PARAMS" - fi - if << parameters.save_perf_report >>; then - if [[ $CIRCLE_BRANCH == "main" ]]; then - mkdir -p "$PERF_REPORT_DIR" - EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" - fi - fi - - export GITHUB_SHA=$CIRCLE_SHA1 - - # Run the tests. - # - # The junit.xml file allows CircleCI to display more fine-grained test information - # in its "Tests" tab in the results page. - # --verbose prints name of each test (helpful when there are - # multiple tests in one file) - # -rA prints summary in the end - # -n4 uses four processes to run tests via pytest-xdist - # -s is not used to prevent pytest from capturing output, because tests are running - # in parallel and logs are mixed between different tests - ./scripts/pytest \ - --junitxml=$TEST_OUTPUT/junit.xml \ - --tb=short \ - --verbose \ - -m "not remote_cluster" \ - -rA $TEST_SELECTION $EXTRA_PARAMS - - if << parameters.save_perf_report >>; then - if [[ $CIRCLE_BRANCH == "main" ]]; then - export REPORT_FROM="$PERF_REPORT_DIR" - export REPORT_TO=local - scripts/generate_and_push_perf_report.sh - fi - fi - - run: - # CircleCI artifacts are preserved one file at a time, so skipping - # this step isn't a good idea. 
If you want to extract the - # pageserver state, perhaps a tarball would be a better idea. - name: Delete all data but logs - when: always - command: | - du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete - du -sh /tmp/test_output/* - - store_artifacts: - path: /tmp/test_output - # The store_test_results step tells CircleCI where to find the junit.xml file. - - store_test_results: - path: /tmp/test_output - # Save data (if any) - - persist_to_workspace: - root: /tmp/zenith - paths: - - "*" - -workflows: - build_and_test: - jobs: - - check-codestyle-python - - build-postgres: - name: build-postgres-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - - build-neon: - name: build-neon-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - requires: - - build-postgres-<< matrix.build_type >> - - run-pytest: - name: pg_regress-tests-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - test_selection: batch_pg_regress - needs_postgres_source: true - requires: - - build-neon-<< matrix.build_type >> - - run-pytest: - name: other-tests-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - test_selection: batch_others - requires: - - build-neon-<< matrix.build_type >> - - run-pytest: - name: benchmarks - context: PERF_TEST_RESULT_CONNSTR - build_type: release - test_selection: performance - run_in_parallel: false - save_perf_report: true - requires: - - build-neon-release diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index f220be2b12..a956929d92 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -37,26 +37,13 @@ runs: name: neon-${{ runner.os 
}}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact path: ./neon-artifact/ - - name: Get Postgres artifact for restoration - uses: actions/download-artifact@v3 - with: - name: postgres-${{ runner.os }}-${{ inputs.build_type }}-artifact - path: ./pg-artifact/ - - name: Extract Neon artifact shell: bash -ex {0} run: | mkdir -p /tmp/neon/ - tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/ + tar -xf ./neon-artifact/neon.tar.zst -C /tmp/neon/ rm -rf ./neon-artifact/ - - name: Extract Postgres artifact - shell: bash -ex {0} - run: | - mkdir -p /tmp/neon/tmp_install - tar -xf ./pg-artifact/pg.tgz -C /tmp/neon/tmp_install - rm -rf ./pg-artifact/ - - name: Checkout if: inputs.needs_postgres_source == 'true' uses: actions/checkout@v3 @@ -78,7 +65,7 @@ runs: - name: Run pytest env: NEON_BIN: /tmp/neon/bin - POSTGRES_DISTRIB_DIR: /tmp/neon/tmp_install + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install TEST_OUTPUT: /tmp/test_output # this variable will be embedded in perf test report # and is needed to distinguish different environments @@ -112,7 +99,7 @@ runs: # Run the tests. # - # The junit.xml file allows CircleCI to display more fine-grained test information + # The junit.xml file allows CI tools to display more fine-grained test information # in its "Tests" tab in the results page. 
# --verbose prints name of each test (helpful when there are # multiple tests in one file) diff --git a/.github/ansible/production.hosts b/.github/ansible/production.hosts index d22ce0e37e..364e8ed50e 100644 --- a/.github/ansible/production.hosts +++ b/.github/ansible/production.hosts @@ -17,4 +17,4 @@ env_name = prod-1 console_mgmt_base_url = http://console-release.local bucket_name = zenith-storage-oregon bucket_region = us-west-2 -etcd_endpoints = etcd-release.local:2379 +etcd_endpoints = zenith-1-etcd.local:2379 diff --git a/.github/ansible/scripts/init_safekeeper.sh b/.github/ansible/scripts/init_safekeeper.sh index 2297788f59..a9b5025562 100644 --- a/.github/ansible/scripts/init_safekeeper.sh +++ b/.github/ansible/scripts/init_safekeeper.sh @@ -12,10 +12,9 @@ cat <> $GITHUB_ENV + echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV + echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV # Don't include the ~/.cargo/registry/src directory. It contains just # uncompressed versions of the crates in ~/.cargo/registry/cache @@ -110,59 +81,36 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + + - name: Cache postgres build + id: cache_pg + uses: actions/cache@v3 + with: + path: tmp_install/ + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Build postgres + if: steps.cache_pg.outputs.cache-hit != 'true' + run: mold -run make postgres -j$(nproc) - name: Run cargo build run: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage 
"--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) - CARGO_FLAGS= - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - CARGO_FLAGS="--release --features profiling" - fi - - "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests + ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests - name: Run cargo test run: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) - CARGO_FLAGS= - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - CARGO_FLAGS=--release - fi - - "${cov_prefix[@]}" cargo test $CARGO_FLAGS + ${cov_prefix} cargo test $CARGO_FLAGS - name: Install rust binaries run: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - fi - + # Install target binaries + mkdir -p /tmp/neon/bin/ binaries=$( - "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps | + ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' ) - - test_exe_paths=$( - "${cov_prefix[@]}" cargo test --message-format=json --no-run | - jq -r '.executable | select(. 
!= null)' - ) - - mkdir -p /tmp/neon/bin/ - mkdir -p /tmp/neon/test_bin/ - mkdir -p /tmp/neon/etc/ - - # Keep bloated coverage data files away from the rest of the artifact - mkdir -p /tmp/coverage/ - - # Install target binaries for bin in $binaries; do SRC=target/$BUILD_TYPE/$bin DST=/tmp/neon/bin/$bin @@ -171,9 +119,14 @@ jobs: # Install test executables and write list of all binaries (for code coverage) if [[ $BUILD_TYPE == "debug" ]]; then - for bin in $binaries; do - echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list - done + # Keep bloated coverage data files away from the rest of the artifact + mkdir -p /tmp/coverage/ + + mkdir -p /tmp/neon/test_bin/ + test_exe_paths=$( + ${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run | + jq -r '.executable | select(. != null)' + ) for bin in $test_exe_paths; do SRC=$bin DST=/tmp/neon/test_bin/$(basename $bin) @@ -183,10 +136,17 @@ jobs: strip "$SRC" -o "$DST" echo "$DST" >> /tmp/coverage/binaries.list done + + for bin in $binaries; do + echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list + done fi + - name: Install postgres binaries + run: cp -a tmp_install /tmp/neon/pg_install + - name: Prepare neon artifact - run: tar -C /tmp/neon/ -czf ./neon.tgz . + run: ZSTD_NBTHREADS=0 tar -C /tmp/neon/ -cf ./neon.tar.zst --zstd . 
- name: Upload neon binaries uses: actions/upload-artifact@v3 @@ -194,7 +154,7 @@ jobs: retention-days: 7 if-no-files-found: error name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact - path: ./neon.tgz + path: ./neon.tar.zst # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data @@ -308,7 +268,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact for restoration uses: actions/download-artifact@v3 @@ -319,7 +279,7 @@ jobs: - name: Extract Neon artifact run: | mkdir -p /tmp/neon/ - tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/ + tar -xf ./neon-artifact/neon.tar.zst -C /tmp/neon/ rm -rf ./neon-artifact/ - name: Restore coverage data @@ -557,7 +517,7 @@ jobs: if [[ "$GITHUB_REF_NAME" == "main" ]]; then STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}' NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}' - echo "::set-output name=include::[$STAGING, $NEON_STRESS]" + echo "::set-output name=include::[$STAGING]" elif [[ "$GITHUB_REF_NAME" == "release" ]]; then PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}' echo "::set-output name=include::[$PRODUCTION]" diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 89bfffd4b9..8bcaa8f947 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -101,7 +101,7 @@ jobs: !~/.cargo/registry/src 
~/.cargo/git target - key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} + key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} - name: Run cargo clippy run: ./run_clippy.sh diff --git a/Cargo.lock b/Cargo.lock index 4f453678e6..5031ae02e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -467,7 +467,6 @@ dependencies = [ "clap 3.2.12", "env_logger", "hyper", - "libc", "log", "postgres", "regex", @@ -517,7 +516,6 @@ dependencies = [ "tar", "thiserror", "toml", - "url", "utils", "workspace_hack", ] @@ -1604,7 +1602,6 @@ version = "0.1.0" dependencies = [ "lazy_static", "libc", - "once_cell", "prometheus", "workspace_hack", ] @@ -1677,7 +1674,6 @@ dependencies = [ "git-version", "pageserver", "postgres", - "postgres_ffi", "safekeeper", "serde_json", "utils", @@ -1905,7 +1901,6 @@ dependencies = [ "thiserror", "tokio", "tokio-postgres", - "tokio-stream", "toml_edit", "tracing", "url", @@ -2764,7 +2759,6 @@ dependencies = [ "daemonize", "etcd_broker", "fs2", - "futures", "git-version", "hex", "humantime", @@ -2784,12 +2778,10 @@ dependencies = [ "tempfile", "tokio", "tokio-postgres", - "tokio-util", "toml_edit", "tracing", "url", "utils", - "walkdir", "workspace_hack", ] diff --git a/Dockerfile b/Dockerfile index ad85638af3..6f017ac5d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,6 +17,10 @@ RUN set -e \ FROM neondatabase/rust:1.58 AS build ARG GIT_VERSION=local +# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. +# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. +# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. 
+ARG RUSTC_WRAPPER=cachepot ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 71770ae9ed..76cbc2ac30 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,7 +1,11 @@ # First transient image to build compute_tools binaries -# NB: keep in sync with rust image version in .circle/config.yml +# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml FROM neondatabase/rust:1.58 AS rust-build +# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. +# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. +# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. +ARG RUSTC_WRAPPER=cachepot ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 1022438c2e..78b85d0e79 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" edition = "2021" [dependencies] -libc = "0.2" anyhow = "1.0" chrono = "0.4" clap = "3.0" diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 21311eea9a..26bb577636 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -14,7 +14,6 @@ regex = "1" anyhow = "1.0" thiserror = "1" nix = "0.23" -url = "2.2.2" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } pageserver = { path = "../pageserver" } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index c90f36d104..d87be95b82 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -304,10 +304,9 @@ impl SafekeeperNode { Ok(self .http_request( Method::POST, - format!("{}/{}", self.http_base_url, "timeline"), + 
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), ) .json(&TimelineCreateRequest { - tenant_id, timeline_id, peer_ids, }) diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index 8ff5d1d421..2879dfed81 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -7,5 +7,4 @@ edition = "2021" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency libc = "0.2" lazy_static = "1.4" -once_cell = "1.8.0" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 3b5da9f7ff..ea24b3fe7e 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -3,6 +3,9 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. use lazy_static::lazy_static; +use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec}; +pub use prometheus::opts; +pub use prometheus::register; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_gauge, Gauge}; @@ -18,6 +21,17 @@ pub use prometheus::{Encoder, TextEncoder}; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; +pub type UIntGauge = GenericGauge; +pub type UIntGaugeVec = GenericGaugeVec; + +#[macro_export] +macro_rules! register_uint_gauge_vec { + ($NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ + let gauge_vec = UIntGaugeVec::new($crate::opts!($NAME, $HELP), $LABELS_NAMES).unwrap(); + $crate::register(Box::new(gauge_vec.clone())).map(|_| gauge_vec) + }}; +} + /// Gathers all Prometheus metrics and records the I/O stats just before that. 
/// /// Metrics gathering is a relatively simple and standalone operation, so diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index c6df4fc0b0..7db2c20e34 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -49,12 +49,12 @@ fn main() { // Finding the location of C headers for the Postgres server: // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/tmp_install` // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/tmp_install/include/postgresql/server` - let mut pg_install_dir: PathBuf; - if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { - pg_install_dir = postgres_install_dir.into(); + let mut pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") + { + postgres_install_dir.into() } else { - pg_install_dir = PathBuf::from("tmp_install") - } + PathBuf::from("tmp_install") + }; if pg_install_dir.is_relative() { let cwd = env::current_dir().unwrap(); diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index 0a320f123c..3dcae4d0af 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -47,10 +47,12 @@ pub enum FeStartupPacket { StartupMessage { major_version: u32, minor_version: u32, - params: HashMap, + params: StartupMessageParams, }, } +pub type StartupMessageParams = HashMap; + #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct CancelKeyData { pub backend_pid: i32, diff --git a/neon_local/Cargo.toml b/neon_local/Cargo.toml index 8ebd7d5c17..2fc38cfe02 100644 --- a/neon_local/Cargo.toml +++ b/neon_local/Cargo.toml @@ -15,6 +15,5 @@ git-version = "0.3.5" pageserver = { path = "../pageserver" } control_plane = { path = "../control_plane" } safekeeper = { path = "../safekeeper" } -postgres_ffi = { path = "../libs/postgres_ffi" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/pageserver/Cargo.toml 
b/pageserver/Cargo.toml index b7d97a67c0..215fa151a0 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -29,7 +29,6 @@ postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -tokio-stream = "0.1.8" anyhow = { version = "1.0", features = ["backtrace"] } crc32c = "0.6.0" thiserror = "1.0" diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 3ec1ec9243..5837447ce8 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -23,8 +23,7 @@ use tar::{Builder, EntryType, Header}; use tracing::*; use crate::reltag::{RelTag, SlruKind}; -use crate::repository::Timeline; -use crate::DatadirTimelineImpl; +use crate::DatadirTimeline; use postgres_ffi::xlog_utils::*; use postgres_ffi::*; use utils::lsn::Lsn; @@ -32,12 +31,13 @@ use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. -pub struct Basebackup<'a, W> +pub struct Basebackup<'a, W, T> where W: Write, + T: DatadirTimeline, { ar: Builder>, - timeline: &'a Arc, + timeline: &'a Arc, pub lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, @@ -52,17 +52,18 @@ where // * When working without safekeepers. In this situation it is important to match the lsn // we are taking basebackup on with the lsn that is used in pageserver's walreceiver // to start the replication. 
-impl<'a, W> Basebackup<'a, W> +impl<'a, W, T> Basebackup<'a, W, T> where W: Write, + T: DatadirTimeline, { pub fn new( write: W, - timeline: &'a Arc, + timeline: &'a Arc, req_lsn: Option, prev_lsn: Option, full_backup: bool, - ) -> Result> { + ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). We include prev_record_lsn in the @@ -79,13 +80,13 @@ where let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", req_lsn); - timeline.tline.wait_lsn(req_lsn)?; + timeline.wait_lsn(req_lsn)?; // If the requested point is the end of the timeline, we can // provide prev_lsn. (get_last_record_rlsn() might return it as // zero, though, if no WAL has been generated on this timeline // yet.) - let end_of_timeline = timeline.tline.get_last_record_rlsn(); + let end_of_timeline = timeline.get_last_record_rlsn(); if req_lsn == end_of_timeline.last { (end_of_timeline.prev, req_lsn) } else { @@ -93,7 +94,7 @@ where } } else { // Backup was requested at end of the timeline. - let end_of_timeline = timeline.tline.get_last_record_rlsn(); + let end_of_timeline = timeline.get_last_record_rlsn(); (end_of_timeline.prev, end_of_timeline.last) }; @@ -371,7 +372,7 @@ where // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.tline.get_ancestor_lsn() { + if self.lsn == self.timeline.get_ancestor_lsn() { write!(zenith_signal, "PREV LSN: none")?; } else { write!(zenith_signal, "PREV LSN: invalid")?; @@ -402,9 +403,10 @@ where } } -impl<'a, W> Drop for Basebackup<'a, W> +impl<'a, W, T> Drop for Basebackup<'a, W, T> where W: Write, + T: DatadirTimeline, { /// If the basebackup was not finished, prevent the Archive::drop() from /// writing the end-of-archive marker. 
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 2775a27e0f..46305a4855 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -78,6 +78,11 @@ paths: schema: type: string description: Controls calculation of current_logical_size_non_incremental + - name: include-non-incremental-physical-size + in: query + schema: + type: string + description: Controls calculation of current_physical_size_non_incremental get: description: Get timelines for tenant responses: @@ -136,6 +141,11 @@ paths: schema: type: string description: Controls calculation of current_logical_size_non_incremental + - name: include-non-incremental-physical-size + in: query + schema: + type: string + description: Controls calculation of current_physical_size_non_incremental responses: "200": description: TimelineInfo @@ -671,8 +681,12 @@ components: format: hex current_logical_size: type: integer + current_physical_size: + type: integer current_logical_size_non_incremental: type: integer + current_physical_size_non_incremental: + type: integer WalReceiverEntry: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 236415cf58..8ac3faca7a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -113,10 +113,17 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); + let include_non_incremental_logical_size = + query_param_present(&request, "include-non-incremental-logical-size"); + let include_non_incremental_physical_size = + query_param_present(&request, "include-non-incremental-physical-size"); let local_timeline_infos = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant 
= %tenant_id).entered(); - crate::timelines::get_local_timelines(tenant_id, include_non_incremental_logical_size) + crate::timelines::get_local_timelines( + tenant_id, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) }) .await .map_err(ApiError::from_err)??; @@ -145,17 +152,15 @@ async fn timeline_list_handler(request: Request) -> Result, json_response(StatusCode::OK, response_data) } -// Gate non incremental logical size calculation behind a flag -// after pgbench -i -s100 calculation took 28ms so if multiplied by the number of timelines -// and tenants it can take noticeable amount of time. Also the value currently used only in tests -fn get_include_non_incremental_logical_size(request: &Request) -> bool { +/// Checks if a query param is present in the request's URL +fn query_param_present(request: &Request, param: &str) -> bool { request .uri() .query() .map(|v| { url::form_urlencoded::parse(v.as_bytes()) .into_owned() - .any(|(param, _)| param == "include-non-incremental-logical-size") + .any(|(p, _)| p == param) }) .unwrap_or(false) } @@ -165,7 +170,10 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result( +pub fn import_timeline_from_postgres_datadir( path: &Path, - tline: &mut DatadirTimeline, + tline: &T, lsn: Lsn, ) -> Result<()> { let mut pg_control: Option = None; // TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn) // Then fishing out pg_control would be unnecessary - let mut modification = tline.begin_modification(lsn); + let mut modification = tline.begin_modification(); modification.init_empty()?; // Import all but pg_wal @@ -57,12 +56,12 @@ pub fn import_timeline_from_postgres_datadir( if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? { pg_control = Some(control_file); } - modification.flush()?; + modification.flush(lsn)?; } } // We're done importing all the data files. 
- modification.commit()?; + modification.commit(lsn)?; // We expect the Postgres server to be shut down cleanly. let pg_control = pg_control.context("pg_control file not found")?; @@ -89,8 +88,8 @@ pub fn import_timeline_from_postgres_datadir( } // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. -fn import_rel( - modification: &mut DatadirModification, +fn import_rel( + modification: &mut DatadirModification, path: &Path, spcoid: Oid, dboid: Oid, @@ -169,8 +168,8 @@ fn import_rel( /// Import an SLRU segment file /// -fn import_slru( - modification: &mut DatadirModification, +fn import_slru( + modification: &mut DatadirModification, slru: SlruKind, path: &Path, mut reader: Reader, @@ -225,9 +224,9 @@ fn import_slru( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. -fn import_wal( +fn import_wal( walpath: &Path, - tline: &mut DatadirTimeline, + tline: &T, startpoint: Lsn, endpoint: Lsn, ) -> Result<()> { @@ -268,9 +267,11 @@ fn import_wal( waldecoder.feed_bytes(&buf); let mut nrecords = 0; + let mut modification = tline.begin_modification(); + let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - walingest.ingest_record(tline, recdata, lsn)?; + walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; last_lsn = lsn; nrecords += 1; @@ -294,13 +295,13 @@ fn import_wal( Ok(()) } -pub fn import_basebackup_from_tar( - tline: &mut DatadirTimeline, +pub fn import_basebackup_from_tar( + tline: &T, reader: Reader, base_lsn: Lsn, ) -> Result<()> { info!("importing base at {}", base_lsn); - let mut modification = tline.begin_modification(base_lsn); + let mut modification = tline.begin_modification(); modification.init_empty()?; let mut pg_control: Option = None; @@ -318,7 +319,7 @@ pub fn import_basebackup_from_tar( // We found the pg_control file. 
pg_control = Some(res); } - modification.flush()?; + modification.flush(base_lsn)?; } tar::EntryType::Directory => { debug!("directory {:?}", file_path); @@ -332,12 +333,12 @@ pub fn import_basebackup_from_tar( // sanity check: ensure that pg_control is loaded let _pg_control = pg_control.context("pg_control file not found")?; - modification.commit()?; + modification.commit(base_lsn)?; Ok(()) } -pub fn import_wal_from_tar( - tline: &mut DatadirTimeline, +pub fn import_wal_from_tar( + tline: &T, reader: Reader, start_lsn: Lsn, end_lsn: Lsn, @@ -384,9 +385,11 @@ pub fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); + let mut modification = tline.begin_modification(); + let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - walingest.ingest_record(tline, recdata, lsn)?; + walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); @@ -415,8 +418,8 @@ pub fn import_wal_from_tar( Ok(()) } -pub fn import_file( - modification: &mut DatadirModification, +pub fn import_file( + modification: &mut DatadirModification, file_path: &Path, reader: Reader, len: usize, @@ -535,7 +538,7 @@ pub fn import_file( // zenith.signal is not necessarily the last file, that we handle // but it is ok to call `finish_write()`, because final `modification.commit()` // will update lsn once more to the final one. - let writer = modification.tline.tline.writer(); + let writer = modification.tline.writer(); writer.finish_write(prev_lsn); debug!("imported zenith signal {}", prev_lsn); diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 6459e802f4..c500b05e66 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -11,52 +11,35 @@ //! parent timeline, and the last LSN that has been written to disk. //! 
-use anyhow::{anyhow, bail, ensure, Context, Result}; -use bytes::Bytes; -use fail::fail_point; -use itertools::Itertools; -use lazy_static::lazy_static; +use anyhow::{bail, ensure, Context, Result}; use tracing::*; -use std::cmp::{max, min, Ordering}; +use std::cmp::min; use std::collections::hash_map::Entry; +use std::collections::BTreeSet; use std::collections::HashMap; -use std::collections::{BTreeSet, HashSet}; use std::fs; -use std::fs::{File, OpenOptions}; -use std::io::Write; +use std::fs::File; use std::num::NonZeroU64; -use std::ops::{Bound::Included, Deref, Range}; -use std::path::{Path, PathBuf}; -use std::sync::atomic::{self, AtomicBool}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError}; -use std::time::{Duration, Instant, SystemTime}; +use std::ops::Bound::Included; +use std::path::Path; +use std::sync::{Arc, Mutex, RwLock}; +use std::time::{Duration, Instant}; -use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; +use self::metadata::{metadata_path, TimelineMetadata}; use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline, TimelineWriter}; -use crate::repository::{Key, Value}; -use crate::tenant_mgr; +use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline}; use crate::thread_mgr; -use crate::virtual_file::VirtualFile; -use crate::walreceiver::IS_WAL_RECEIVER; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; -use crate::{page_cache, storage_sync}; -use metrics::{ - register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, - Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, -}; use toml_edit; use utils::{ crashsafe_dir, - lsn::{AtomicLsn, Lsn, RecordLsn}, - seqwait::SeqWait, + lsn::{Lsn, RecordLsn}, 
zid::{ZTenantId, ZTimelineId}, }; @@ -73,78 +56,16 @@ pub mod metadata; mod par_fsync; mod storage_layer; -use crate::pgdatadir_mapping::LsnForTimestamp; -use delta_layer::{DeltaLayer, DeltaLayerWriter}; -use ephemeral_file::is_ephemeral_file; -use filename::{DeltaFileName, ImageFileName}; -use image_layer::{ImageLayer, ImageLayerWriter}; -use inmemory_layer::InMemoryLayer; -use layer_map::LayerMap; -use layer_map::SearchResult; -use postgres_ffi::xlog_utils::to_pg_timestamp; -use storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +mod timeline; + +use storage_layer::Layer; +use timeline::{LayeredTimeline, LayeredTimelineEntry}; // re-export this function so that page_cache.rs can use it. pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; -// Metrics collected on operations on the storage repository. -lazy_static! { - static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( - "pageserver_storage_operations_seconds", - "Time spent on storage operations", - &["operation", "tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - -// Metrics collected on operations on the storage repository. -lazy_static! { - static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( - "pageserver_getpage_reconstruct_seconds", - "Time spent in reconstruct_value", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - -lazy_static! { - static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!( - "pageserver_materialized_cache_hits_total", - "Number of cache hits from materialized page cache", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); - static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!( - "pageserver_wait_lsn_seconds", - "Time spent waiting for WAL to arrive", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - -lazy_static! 
{ - static ref LAST_RECORD_LSN: IntGaugeVec = register_int_gauge_vec!( - "pageserver_last_record_lsn", - "Last record LSN grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - -// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, -// or in testing they estimate how much we would upload if we did. -lazy_static! { - static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!( - "pageserver_created_persistent_files_total", - "Number of files created that are meant to be uploaded to cloud storage", - ) - .expect("failed to define a metric"); - static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!( - "pageserver_written_persistent_bytes_total", - "Total bytes written that are meant to be uploaded to cloud storage", - ) - .expect("failed to define a metric"); -} +// re-export for use in storage_sync.rs +pub use crate::layered_repository::timeline::save_metadata; /// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; @@ -248,7 +169,7 @@ impl Repository for LayeredRepository { crashsafe_dir::create_dir_all(timeline_path)?; let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - Self::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; + timeline::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; let timeline = LayeredTimeline::new( self.conf, @@ -281,12 +202,22 @@ impl Repository for LayeredRepository { // concurrently removes data that is needed by the new timeline. let _gc_cs = self.gc_cs.lock().unwrap(); + // In order for the branch creation task to not wait for GC/compaction, + // we need to make sure that the starting LSN of the child branch is not out of scope midway by + // + // 1. holding the GC lock to prevent overwritting timeline's GC data + // 2. 
checking both the latest GC cutoff LSN and latest GC info of the source timeline + // + // Step 2 is to avoid initializing the new branch using data removed by past GC iterations + // or in-queue GC iterations. + let mut timelines = self.timelines.lock().unwrap(); let src_timeline = self .get_timeline_load_internal(src, &mut timelines) // message about timeline being remote is one .context up in the stack .context("failed to load timeline for branching")? .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {}", &src))?; + let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN @@ -296,9 +227,23 @@ impl Repository for LayeredRepository { lsn }); + // Check if the starting LSN is out of scope because it is less than + // 1. the latest GC cutoff LSN or + // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. src_timeline .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) - .context("invalid branch start lsn")?; + .context(format!( + "invalid branch start lsn: less than latest GC cutoff {latest_gc_cutoff_lsn}" + ))?; + { + let gc_info = src_timeline.gc_info.read().unwrap(); + let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + if start_lsn < cutoff { + bail!(format!( + "invalid branch start lsn: less than planned GC cutoff {cutoff}" + )); + } + } // Determine prev-LSN for the new timeline. We can only determine it if // the timeline was branched at the current end of the source timeline. 
@@ -328,7 +273,7 @@ impl Repository for LayeredRepository { src_timeline.initdb_lsn, ); crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; - Self::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; + timeline::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata }); info!("branched timeline {} from {} at {}", dst, src, start_lsn); @@ -350,7 +295,7 @@ impl Repository for LayeredRepository { .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); - STORAGE_TIME + timeline::STORAGE_TIME .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) .observe_closure_duration(|| { self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc) @@ -440,13 +385,7 @@ impl Repository for LayeredRepository { Entry::Vacant(_) => bail!("timeline not found"), }; - // try to acquire gc and compaction locks to prevent errors from missing files - let _gc_guard = self - .gc_cs - .try_lock() - .map_err(|e| anyhow::anyhow!("cannot acquire gc lock {e}"))?; - - let compaction_guard = timeline_entry.get().compaction_guard()?; + let layer_removal_guard = timeline_entry.get().layer_removal_guard()?; let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { @@ -457,7 +396,7 @@ impl Repository for LayeredRepository { })?; info!("detach removed files"); - drop(compaction_guard); + drop(layer_removal_guard); timeline_entry.remove(); Ok(()) @@ -471,7 +410,7 @@ impl Repository for LayeredRepository { // we need to get metadata of a timeline, another option is to pass it along with Downloaded status let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; // finally we make newly downloaded timeline visible to repository - entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, 
metadata, }) + entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata }) }, }; Ok(()) @@ -482,72 +421,6 @@ impl Repository for LayeredRepository { } } -#[derive(Clone)] -enum LayeredTimelineEntry { - Loaded(Arc), - Unloaded { - id: ZTimelineId, - metadata: TimelineMetadata, - }, -} - -impl LayeredTimelineEntry { - fn timeline_id(&self) -> ZTimelineId { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline.timeline_id, - LayeredTimelineEntry::Unloaded { id, .. } => *id, - } - } - - fn ancestor_timeline_id(&self) -> Option { - match self { - LayeredTimelineEntry::Loaded(timeline) => { - timeline.ancestor_timeline.as_ref().map(|t| t.timeline_id()) - } - LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_timeline(), - } - } - - fn ancestor_lsn(&self) -> Lsn { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline.ancestor_lsn, - LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_lsn(), - } - } - - fn ensure_loaded(&self) -> anyhow::Result<&Arc> { - match self { - LayeredTimelineEntry::Loaded(timeline) => Ok(timeline), - LayeredTimelineEntry::Unloaded { .. } => { - anyhow::bail!("timeline is unloaded") - } - } - } - - fn compaction_guard(&self) -> Result>, anyhow::Error> { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline - .compaction_cs - .try_lock() - .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) - .map(Some), - - LayeredTimelineEntry::Unloaded { .. } => Ok(None), - } - } -} - -impl From for RepositoryTimeline { - fn from(entry: LayeredTimelineEntry) -> Self { - match entry { - LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _), - LayeredTimelineEntry::Unloaded { metadata, .. 
} => { - RepositoryTimeline::Unloaded { metadata } - } - } - } -} - /// Private functions impl LayeredRepository { pub fn get_checkpoint_distance(&self) -> u64 { @@ -809,42 +682,6 @@ impl LayeredRepository { }) } - /// Save timeline metadata to file - pub fn save_metadata( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - data: &TimelineMetadata, - first_save: bool, - ) -> Result<()> { - let _enter = info_span!("saving metadata").entered(); - let path = metadata_path(conf, timelineid, tenantid); - // use OpenOptions to ensure file presence is consistent with first_save - let mut file = VirtualFile::open_with_options( - &path, - OpenOptions::new().write(true).create_new(first_save), - )?; - - let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; - - if file.write(&metadata_bytes)? != metadata_bytes.len() { - bail!("Could not write all the metadata bytes in a single call"); - } - file.sync_all()?; - - // fsync the parent directory to ensure the directory entry is durable - if first_save { - let timeline_dir = File::open( - &path - .parent() - .expect("Metadata should always have a parent dir"), - )?; - timeline_dir.sync_all()?; - } - - Ok(()) - } - // // How garbage collection works: // @@ -883,54 +720,53 @@ impl LayeredRepository { let now = Instant::now(); // grab mutex to prevent new timelines from being created here. - let _gc_cs = self.gc_cs.lock().unwrap(); + let gc_cs = self.gc_cs.lock().unwrap(); + + let timelines = self.timelines.lock().unwrap(); // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. 
let mut all_branchpoints: BTreeSet<(ZTimelineId, Lsn)> = BTreeSet::new(); - let mut timeline_ids = Vec::new(); - let mut timelines = self.timelines.lock().unwrap(); + let timeline_ids = { + if let Some(target_timeline_id) = target_timeline_id.as_ref() { + if timelines.get(target_timeline_id).is_none() { + bail!("gc target timeline does not exist") + } + }; - if let Some(target_timeline_id) = target_timeline_id.as_ref() { - if timelines.get(target_timeline_id).is_none() { - bail!("gc target timeline does not exist") - } - }; - - for (timeline_id, timeline_entry) in timelines.iter() { - timeline_ids.push(*timeline_id); - - // This is unresolved question for now, how to do gc in presence of remote timelines - // especially when this is combined with branching. - // Somewhat related: https://github.com/neondatabase/neon/issues/999 - if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { - // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timelineid) = target_timeline_id { - if ancestor_timeline_id == &timelineid { - all_branchpoints - .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); + timelines + .iter() + .map(|(timeline_id, timeline_entry)| { + // This is unresolved question for now, how to do gc in presence of remote timelines + // especially when this is combined with branching. 
+ // Somewhat related: https://github.com/zenithdb/zenith/issues/999 + if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { + // If target_timeline is specified, we only need to know branchpoints of its children + if let Some(timelineid) = target_timeline_id { + if ancestor_timeline_id == &timelineid { + all_branchpoints + .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); + } + } + // Collect branchpoints for all timelines + else { + all_branchpoints + .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); + } } - } - // Collect branchpoints for all timelines - else { - all_branchpoints.insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); - } - } - } + + *timeline_id + }) + .collect::>() + }; + drop(timelines); // Ok, we now know all the branch points. - // Perform GC for each timeline. - for timeline_id in timeline_ids.into_iter() { - if thread_mgr::is_shutdown_requested() { - // We were requested to shut down. Stop and return with the progress we - // made. - break; - } - + // Update the GC information for each timeline. + let mut gc_timelines = Vec::with_capacity(timeline_ids.len()); + for timeline_id in timeline_ids { // Timeline is known to be local and loaded. - let timeline = self - .get_timeline_load_internal(timeline_id, &mut *timelines)? 
- .expect("checked above that timeline is local and loaded"); + let timeline = self.get_timeline_load(timeline_id)?; // If target_timeline is specified, ignore all other timelines if let Some(target_timelineid) = target_timeline_id { @@ -940,7 +776,6 @@ impl LayeredRepository { } if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { - drop(timelines); let branchpoints: Vec = all_branchpoints .range(( Included((timeline_id, Lsn(0))), @@ -948,21 +783,44 @@ impl LayeredRepository { )) .map(|&x| x.1) .collect(); + timeline.update_gc_info(branchpoints, cutoff, pitr)?; - // If requested, force flush all in-memory layers to disk first, - // so that they too can be garbage collected. That's - // used in tests, so we want as deterministic results as possible. - if checkpoint_before_gc { - timeline.checkpoint(CheckpointConfig::Forced)?; - info!("timeline {} checkpoint_before_gc done", timeline_id); - } - timeline.update_gc_info(branchpoints, cutoff, pitr); - let result = timeline.gc()?; - - totals += result; - timelines = self.timelines.lock().unwrap(); + gc_timelines.push(timeline); } } + drop(gc_cs); + + // Perform GC for each timeline. + // + // Note that we don't hold the GC lock here because we don't want + // to delay the branch creation task, which requires the GC lock. + // A timeline GC iteration can be slow because it may need to wait for + // compaction (both require `layer_removal_cs` lock), + // but the GC iteration can run concurrently with branch creation. + // + // See comments in [`LayeredRepository::branch_timeline`] for more information + // about why branch creation task can run concurrently with timeline's GC iteration. + for timeline in gc_timelines { + if thread_mgr::is_shutdown_requested() { + // We were requested to shut down. Stop and return with the progress we + // made. + break; + } + + // If requested, force flush all in-memory layers to disk first, + // so that they too can be garbage collected. 
That's + // used in tests, so we want as deterministic results as possible. + if checkpoint_before_gc { + timeline.checkpoint(CheckpointConfig::Forced)?; + info!( + "timeline {} checkpoint_before_gc done", + timeline.timeline_id + ); + } + + let result = timeline.gc()?; + totals += result; + } totals.elapsed = now.elapsed(); Ok(totals) @@ -973,1655 +831,6 @@ impl LayeredRepository { } } -pub struct LayeredTimeline { - conf: &'static PageServerConf, - tenant_conf: Arc>, - - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - - layers: RwLock, - - last_freeze_at: AtomicLsn, - - // WAL redo manager - walredo_mgr: Arc, - - // What page versions do we hold in the repository? If we get a - // request > last_record_lsn, we need to wait until we receive all - // the WAL up to the request. The SeqWait provides functions for - // that. TODO: If we get a request for an old LSN, such that the - // versions have already been garbage collected away, we should - // throw an error, but we don't track that currently. - // - // last_record_lsn.load().last points to the end of last processed WAL record. - // - // We also remember the starting point of the previous record in - // 'last_record_lsn.load().prev'. It's used to set the xl_prev pointer of the - // first WAL record when the node is started up. But here, we just - // keep track of it. - last_record_lsn: SeqWait, - - // All WAL records have been processed and stored durably on files on - // local disk, up to this LSN. On crash and restart, we need to re-process - // the WAL starting from this point. - // - // Some later WAL records might have been processed and also flushed to disk - // already, so don't be surprised to see some, but there's no guarantee on - // them yet. - disk_consistent_lsn: AtomicLsn, - - // Parent timeline that this timeline was branched from, and the LSN - // of the branch point. 
- ancestor_timeline: Option, - ancestor_lsn: Lsn, - - // Metrics - reconstruct_time_histo: Histogram, - materialized_page_cache_hit_counter: IntCounter, - flush_time_histo: Histogram, - compact_time_histo: Histogram, - create_images_time_histo: Histogram, - last_record_gauge: IntGauge, - wait_lsn_time_histo: Histogram, - - /// If `true`, will backup its files that appear after each checkpointing to the remote storage. - upload_layers: AtomicBool, - - /// Ensures layers aren't frozen by checkpointer between - /// [`LayeredTimeline::get_layer_for_write`] and layer reads. - /// Locked automatically by [`LayeredTimelineWriter`] and checkpointer. - /// Must always be acquired before the layer map/individual layer lock - /// to avoid deadlock. - write_lock: Mutex<()>, - - /// Used to ensure that there is only one thread - layer_flush_lock: Mutex<()>, - - // Prevent concurrent compactions. - // Compactions are normally performed by one thread. But compaction can also be manually - // requested by admin (that's used in tests). These forced compactions run in a different - // thread and could be triggered at the same time as a normal, timed compaction. - compaction_cs: Mutex<()>, - - // Needed to ensure that we can't create a branch at a point that was already garbage collected - latest_gc_cutoff_lsn: RwLock, - - // List of child timelines and their branch points. This is needed to avoid - // garbage collecting data that is still needed by the child timelines. - gc_info: RwLock, - - // It may change across major versions so for simplicity - // keep it after running initdb for a timeline. - // It is needed in checks when we want to error on some operations - // when they are requested for pre-initdb lsn. - // It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn", - // though lets keep them both for better error visibility. - initdb_lsn: Lsn, -} - -/// -/// Information about how much history needs to be retained, needed by -/// Garbage Collection. 
-/// -struct GcInfo { - /// Specific LSNs that are needed. - /// - /// Currently, this includes all points where child branches have - /// been forked off from. In the future, could also include - /// explicit user-defined snapshot points. - retain_lsns: Vec, - - /// In addition to 'retain_lsns', keep everything newer than this - /// point. - /// - /// This is calculated by subtracting 'gc_horizon' setting from - /// last-record LSN - /// - /// FIXME: is this inclusive or exclusive? - cutoff: Lsn, - - /// In addition to 'retain_lsns', keep everything newer than 'SystemTime::now()' - /// minus 'pitr_interval' - /// - pitr: Duration, -} - -/// Public interface functions -impl Timeline for LayeredTimeline { - fn get_ancestor_lsn(&self) -> Lsn { - self.ancestor_lsn - } - - fn get_ancestor_timeline_id(&self) -> Option { - self.ancestor_timeline - .as_ref() - .map(LayeredTimelineEntry::timeline_id) - } - - /// Wait until WAL has been received up to the given LSN. - fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { - // This should never be called from the WAL receiver thread, because that could lead - // to a deadlock. - ensure!( - !IS_WAL_RECEIVER.with(|c| c.get()), - "wait_lsn called by WAL receiver thread" - ); - - self.wait_lsn_time_histo.observe_closure_duration( - || self.last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .with_context(|| { - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - }))?; - - Ok(()) - } - - fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard { - self.latest_gc_cutoff_lsn.read().unwrap() - } - - /// Look up the value with the given a key - fn get(&self, key: Key, lsn: Lsn) -> Result { - debug_assert!(lsn <= self.get_last_record_lsn()); - - // Check the page cache. We will get back the most recent page with lsn <= `lsn`. 
- // The cached image can be returned directly if there is no WAL between the cached image - // and requested LSN. The cached image can also be used to reduce the amount of WAL needed - // for redo. - let cached_page_img = match self.lookup_cached_page(&key, lsn) { - Some((cached_lsn, cached_img)) => { - match cached_lsn.cmp(&lsn) { - Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image - Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn - } - Some((cached_lsn, cached_img)) - } - None => None, - }; - - let mut reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: cached_page_img, - }; - - self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; - - self.reconstruct_time_histo - .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) - } - - /// Public entry point for checkpoint(). All the logic is in the private - /// checkpoint_internal function, this public facade just wraps it for - /// metrics collection. - fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { - match cconf { - CheckpointConfig::Flush => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers(true) - } - CheckpointConfig::Forced => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers(true)?; - self.compact() - } - } - } - - /// - /// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn. 
- /// - fn check_lsn_is_in_scope( - &self, - lsn: Lsn, - latest_gc_cutoff_lsn: &RwLockReadGuard, - ) -> Result<()> { - ensure!( - lsn >= **latest_gc_cutoff_lsn, - "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", - lsn, - **latest_gc_cutoff_lsn, - ); - Ok(()) - } - - fn get_last_record_lsn(&self) -> Lsn { - self.last_record_lsn.load().last - } - - fn get_prev_record_lsn(&self) -> Lsn { - self.last_record_lsn.load().prev - } - - fn get_last_record_rlsn(&self) -> RecordLsn { - self.last_record_lsn.load() - } - - fn get_disk_consistent_lsn(&self) -> Lsn { - self.disk_consistent_lsn.load() - } - - fn writer<'a>(&'a self) -> Box { - Box::new(LayeredTimelineWriter { - tl: self, - _write_guard: self.write_lock.lock().unwrap(), - }) - } -} - -impl LayeredTimeline { - fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .checkpoint_distance - .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) - } - - fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .compaction_target_size - .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) - } - - fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .compaction_threshold - .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) - } - - fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .image_creation_threshold - .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) - } - - /// Open a Timeline handle. - /// - /// Loads the metadata for the timeline into memory, but not the layer map. 
- #[allow(clippy::too_many_arguments)] - fn new( - conf: &'static PageServerConf, - tenant_conf: Arc>, - metadata: TimelineMetadata, - ancestor: Option, - timeline_id: ZTimelineId, - tenant_id: ZTenantId, - walredo_mgr: Arc, - upload_layers: bool, - ) -> LayeredTimeline { - let reconstruct_time_histo = RECONSTRUCT_TIME - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let flush_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "layer flush", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let compact_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "compact", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let create_images_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "create images", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let last_record_gauge = LAST_RECORD_LSN - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let wait_lsn_time_histo = WAIT_LSN_TIME - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - - LayeredTimeline { - conf, - tenant_conf, - timeline_id, - tenant_id, - layers: RwLock::new(LayerMap::default()), - - walredo_mgr, - - // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. 
- last_record_lsn: SeqWait::new(RecordLsn { - last: metadata.disk_consistent_lsn(), - prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), - }), - disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), - - last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0), - - ancestor_timeline: ancestor, - ancestor_lsn: metadata.ancestor_lsn(), - - reconstruct_time_histo, - materialized_page_cache_hit_counter, - flush_time_histo, - compact_time_histo, - create_images_time_histo, - last_record_gauge, - wait_lsn_time_histo, - - upload_layers: AtomicBool::new(upload_layers), - - write_lock: Mutex::new(()), - layer_flush_lock: Mutex::new(()), - compaction_cs: Mutex::new(()), - - gc_info: RwLock::new(GcInfo { - retain_lsns: Vec::new(), - cutoff: Lsn(0), - pitr: Duration::ZERO, - }), - - latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), - initdb_lsn: metadata.initdb_lsn(), - } - } - - /// - /// Scan the timeline directory to populate the layer map. - /// Returns all timeline-related files that were found and loaded. - /// - fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { - let mut layers = self.layers.write().unwrap(); - let mut num_layers = 0; - - // Scan timeline directory and create ImageFileName and DeltaFilename - // structs representing all files on disk - let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); - - for direntry in fs::read_dir(timeline_path)? { - let direntry = direntry?; - let fname = direntry.file_name(); - let fname = fname.to_string_lossy(); - - if let Some(imgfilename) = ImageFileName::parse_str(&fname) { - // create an ImageLayer struct for each image file. 
- if imgfilename.lsn > disk_consistent_lsn { - warn!( - "found future image layer {} on timeline {} disk_consistent_lsn is {}", - imgfilename, self.timeline_id, disk_consistent_lsn - ); - - rename_to_backup(direntry.path())?; - continue; - } - - let layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); - - trace!("found layer {}", layer.filename().display()); - layers.insert_historic(Arc::new(layer)); - num_layers += 1; - } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { - // Create a DeltaLayer struct for each delta file. - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. - if deltafilename.lsn_range.end > disk_consistent_lsn + 1 { - warn!( - "found future delta layer {} on timeline {} disk_consistent_lsn is {}", - deltafilename, self.timeline_id, disk_consistent_lsn - ); - - rename_to_backup(direntry.path())?; - continue; - } - - let layer = - DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); - - trace!("found layer {}", layer.filename().display()); - layers.insert_historic(Arc::new(layer)); - num_layers += 1; - } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { - // ignore these - } else if is_ephemeral_file(&fname) { - // Delete any old ephemeral files - trace!("deleting old ephemeral file in timeline dir: {}", fname); - fs::remove_file(direntry.path())?; - } else { - warn!("unrecognized filename in timeline dir: {}", fname); - } - } - - layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); - - info!( - "loaded layer map with {} layers at {}", - num_layers, disk_consistent_lsn - ); - - Ok(()) - } - - /// - /// Get a handle to a Layer for reading. 
- /// - /// The returned Layer might be from an ancestor timeline, if the - /// segment hasn't been updated on this timeline yet. - /// - /// This function takes the current timeline's locked LayerMap as an argument, - /// so callers can avoid potential race conditions. - fn get_reconstruct_data( - &self, - key: Key, - request_lsn: Lsn, - reconstruct_state: &mut ValueReconstructState, - ) -> anyhow::Result<()> { - // Start from the current timeline. - let mut timeline_owned; - let mut timeline = self; - - // For debugging purposes, collect the path of layers that we traversed - // through. It's included in the error message if we fail to find the key. - let mut traversal_path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); - - let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { - *cached_lsn - } else { - Lsn(0) - }; - - // 'prev_lsn' tracks the last LSN that we were at in our search. It's used - // to check that each iteration make some progress, to break infinite - // looping if something goes wrong. - let mut prev_lsn = Lsn(u64::MAX); - - let mut result = ValueReconstructResult::Continue; - let mut cont_lsn = Lsn(request_lsn.0 + 1); - - 'outer: loop { - // The function should have updated 'state' - //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); - match result { - ValueReconstructResult::Complete => return Ok(()), - ValueReconstructResult::Continue => { - // If we reached an earlier cached page image, we're done. - if cont_lsn == cached_lsn + 1 { - self.materialized_page_cache_hit_counter.inc_by(1); - return Ok(()); - } - if prev_lsn <= cont_lsn { - // Didn't make any progress in last iteration. Error out to avoid - // getting stuck in the loop. 
- return layer_traversal_error(format!( - "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", - key, - Lsn(cont_lsn.0 - 1), - request_lsn, - timeline.ancestor_lsn - ), traversal_path); - } - prev_lsn = cont_lsn; - } - ValueReconstructResult::Missing => { - return layer_traversal_error( - format!( - "could not find data for key {} at LSN {}, for request at LSN {}", - key, cont_lsn, request_lsn - ), - traversal_path, - ); - } - } - - // Recurse into ancestor if needed - if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( - "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn - ); - let ancestor = timeline.get_ancestor_timeline()?; - timeline_owned = ancestor; - timeline = &*timeline_owned; - prev_lsn = Lsn(u64::MAX); - continue; - } - - let layers = timeline.layers.read().unwrap(); - - // Check the open and frozen in-memory layers first, in order from newest - // to oldest. - if let Some(open_layer) = &layers.open_layer { - let start_lsn = open_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. 
- let lsn_floor = max(cached_lsn + 1, start_lsn); - result = open_layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - )?; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, open_layer.clone())); - continue; - } - } - for frozen_layer in layers.frozen_layers.iter().rev() { - let start_lsn = frozen_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); - let lsn_floor = max(cached_lsn + 1, start_lsn); - result = frozen_layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - )?; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, frozen_layer.clone())); - continue 'outer; - } - } - - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? { - //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); - - let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - )?; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, layer)); - } else if timeline.ancestor_timeline.is_some() { - // Nothing on this timeline. Traverse to parent - result = ValueReconstructResult::Continue; - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); - } else { - // Nothing found - result = ValueReconstructResult::Missing; - } - } - } - - fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> { - let cache = page_cache::get(); - - // FIXME: It's pointless to check the cache for things that are not 8kB pages. 
- // We should look at the key to determine if it's a cacheable object - let (lsn, read_guard) = - cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?; - let img = Bytes::from(read_guard.to_vec()); - Some((lsn, img)) - } - - fn get_ancestor_timeline(&self) -> Result> { - let ancestor = self - .ancestor_timeline - .as_ref() - .with_context(|| { - format!( - "Ancestor is missing. Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })? - .ensure_loaded() - .with_context(|| { - format!( - "Ancestor timeline is not loaded. Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })?; - Ok(Arc::clone(ancestor)) - } - - /// - /// Get a handle to the latest layer for appending. - /// - fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { - let mut layers = self.layers.write().unwrap(); - - ensure!(lsn.is_aligned()); - - let last_record_lsn = self.get_last_record_lsn(); - ensure!( - lsn > last_record_lsn, - "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", - lsn, - last_record_lsn, - ); - - // Do we have a layer open for writing already? - let layer; - if let Some(open_layer) = &layers.open_layer { - if open_layer.get_lsn_range().start > lsn { - bail!("unexpected open layer in the future"); - } - - layer = Arc::clone(open_layer); - } else { - // No writeable layer yet. Create one. 
- let start_lsn = layers.next_open_layer_at.unwrap(); - - trace!( - "creating layer for write at {}/{} for record at {}", - self.timeline_id, - start_lsn, - lsn - ); - let new_layer = - InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?; - let layer_rc = Arc::new(new_layer); - - layers.open_layer = Some(Arc::clone(&layer_rc)); - layers.next_open_layer_at = None; - - layer = layer_rc; - } - Ok(layer) - } - - fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { - //info!("PUT: key {} at {}", key, lsn); - let layer = self.get_layer_for_write(lsn)?; - layer.put_value(key, lsn, val)?; - Ok(()) - } - - fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> Result<()> { - let layer = self.get_layer_for_write(lsn)?; - layer.put_tombstone(key_range, lsn)?; - - Ok(()) - } - - fn finish_write(&self, new_lsn: Lsn) { - assert!(new_lsn.is_aligned()); - - self.last_record_gauge.set(new_lsn.0 as i64); - self.last_record_lsn.advance(new_lsn); - } - - fn freeze_inmem_layer(&self, write_lock_held: bool) { - // Freeze the current open in-memory layer. It will be written to disk on next - // iteration. - let _write_guard = if write_lock_held { - None - } else { - Some(self.write_lock.lock().unwrap()) - }; - let mut layers = self.layers.write().unwrap(); - if let Some(open_layer) = &layers.open_layer { - let open_layer_rc = Arc::clone(open_layer); - // Does this layer need freezing? - let end_lsn = Lsn(self.get_last_record_lsn().0 + 1); - open_layer.freeze(end_lsn); - - // The layer is no longer open, update the layer map to reflect this. - // We will replace it with on-disk historics below. - layers.frozen_layers.push_back(open_layer_rc); - layers.open_layer = None; - layers.next_open_layer_at = Some(end_lsn); - self.last_freeze_at.store(end_lsn); - } - drop(layers); - } - - /// - /// Check if more than 'checkpoint_distance' of WAL has been accumulated - /// in the in-memory layer, and initiate flushing it if so. 
- /// - pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { - let last_lsn = self.get_last_record_lsn(); - - // Has more than 'checkpoint_distance' of WAL been accumulated? - let distance = last_lsn.widening_sub(self.last_freeze_at.load()); - if distance >= self.get_checkpoint_distance().into() { - // Yes. Freeze the current in-memory layer. - self.freeze_inmem_layer(true); - self.last_freeze_at.store(last_lsn); - - // Launch a thread to flush the frozen layer to disk, unless - // a thread was already running. (If the thread was running - // at the time that we froze the layer, it must've seen the - // the layer we just froze before it exited; see comments - // in flush_frozen_layers()) - if let Ok(guard) = self.layer_flush_lock.try_lock() { - drop(guard); - let self_clone = Arc::clone(self); - thread_mgr::spawn( - thread_mgr::ThreadKind::LayerFlushThread, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush thread", - false, - move || self_clone.flush_frozen_layers(false), - )?; - } - } - Ok(()) - } - - /// Flush all frozen layers to disk. - /// - /// Only one thread at a time can be doing layer-flushing for a - /// given timeline. If 'wait' is true, and another thread is - /// currently doing the flushing, this function will wait for it - /// to finish. If 'wait' is false, this function will return - /// immediately instead. 
- fn flush_frozen_layers(&self, wait: bool) -> Result<()> { - let flush_lock_guard = if wait { - self.layer_flush_lock.lock().unwrap() - } else { - match self.layer_flush_lock.try_lock() { - Ok(guard) => guard, - Err(TryLockError::WouldBlock) => return Ok(()), - Err(TryLockError::Poisoned(err)) => panic!("{:?}", err), - } - }; - - let timer = self.flush_time_histo.start_timer(); - - loop { - let layers = self.layers.read().unwrap(); - if let Some(frozen_layer) = layers.frozen_layers.front() { - let frozen_layer = Arc::clone(frozen_layer); - drop(layers); // to allow concurrent reads and writes - self.flush_frozen_layer(frozen_layer)?; - } else { - // Drop the 'layer_flush_lock' *before* 'layers'. That - // way, if you freeze a layer, and then call - // flush_frozen_layers(false), it is guaranteed that - // if another thread was busy flushing layers and the - // call therefore returns immediately, the other - // thread will have seen the newly-frozen layer and - // will flush that too (assuming no errors). - drop(flush_lock_guard); - drop(layers); - break; - } - } - - timer.stop_and_record(); - - Ok(()) - } - - /// Flush one frozen in-memory layer to disk, as a new delta layer. - fn flush_frozen_layer(&self, frozen_layer: Arc) -> Result<()> { - // As a special case, when we have just imported an image into the repository, - // instead of writing out a L0 delta layer, we directly write out image layer - // files instead. This is possible as long as *all* the data imported into the - // repository have the same LSN. - let lsn_range = frozen_layer.get_lsn_range(); - let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn - && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) - { - let pgdir = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)?; - let (partitioning, _lsn) = - pgdir.repartition(self.initdb_lsn, self.get_compaction_target_size())?; - self.create_image_layers(&partitioning, self.initdb_lsn, true)? 
- } else { - // normal case, write out a L0 delta layer file. - let delta_path = self.create_delta_layer(&frozen_layer)?; - HashSet::from([delta_path]) - }; - - fail_point!("flush-frozen-before-sync"); - - // The new on-disk layers are now in the layer map. We can remove the - // in-memory layer from the map now. - { - let mut layers = self.layers.write().unwrap(); - let l = layers.frozen_layers.pop_front(); - - // Only one thread may call this function at a time (for this - // timeline). If two threads tried to flush the same frozen - // layer to disk at the same time, that would not work. - assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); - - // release lock on 'layers' - } - - fail_point!("checkpoint-after-sync"); - - // Update the metadata file, with new 'disk_consistent_lsn' - // - // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing - // *all* the layers, to avoid fsyncing the file multiple times. - let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); - self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?; - - Ok(()) - } - - /// Update metadata file - fn update_disk_consistent_lsn( - &self, - disk_consistent_lsn: Lsn, - layer_paths_to_upload: HashSet, - ) -> Result<()> { - // If we were able to advance 'disk_consistent_lsn', save it the metadata file. - // After crash, we will restart WAL streaming and processing from that point. - let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); - if disk_consistent_lsn != old_disk_consistent_lsn { - assert!(disk_consistent_lsn > old_disk_consistent_lsn); - - // We can only save a valid 'prev_record_lsn' value on disk if we - // flushed *all* in-memory changes to disk. We only track - // 'prev_record_lsn' in memory for the latest processed record, so we - // don't remember what the correct value that corresponds to some old - // LSN is. But if we flush everything, then the value corresponding - // current 'last_record_lsn' is correct and we can store it on disk. 
- let RecordLsn { - last: last_record_lsn, - prev: prev_record_lsn, - } = self.last_record_lsn.load(); - let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { - Some(prev_record_lsn) - } else { - None - }; - - let ancestor_timelineid = self - .ancestor_timeline - .as_ref() - .map(LayeredTimelineEntry::timeline_id); - - let metadata = TimelineMetadata::new( - disk_consistent_lsn, - ondisk_prev_record_lsn, - ancestor_timelineid, - self.ancestor_lsn, - *self.latest_gc_cutoff_lsn.read().unwrap(), - self.initdb_lsn, - ); - - fail_point!("checkpoint-before-saving-metadata", |x| bail!( - "{}", - x.unwrap() - )); - - LayeredRepository::save_metadata( - self.conf, - self.timeline_id, - self.tenant_id, - &metadata, - false, - )?; - - if self.upload_layers.load(atomic::Ordering::Relaxed) { - storage_sync::schedule_layer_upload( - self.tenant_id, - self.timeline_id, - layer_paths_to_upload, - Some(metadata), - ); - } - - // Also update the in-memory copy - self.disk_consistent_lsn.store(disk_consistent_lsn); - } - - Ok(()) - } - - // Write out the given frozen in-memory layer as a new L0 delta file - fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result { - // Write it out - let new_delta = frozen_layer.write_to_disk()?; - let new_delta_path = new_delta.path(); - - // Sync it to disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // TODO: If we're running inside 'flush_frozen_layers' and there are multiple - // files to flush, it might be better to first write them all, and then fsync - // them all in parallel. 
- par_fsync::par_fsync(&[ - new_delta_path.clone(), - self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - ])?; - - // Add it to the layer map - { - let mut layers = self.layers.write().unwrap(); - layers.insert_historic(Arc::new(new_delta)); - } - - NUM_PERSISTENT_FILES_CREATED.inc_by(1); - PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); - - Ok(new_delta_path) - } - - pub fn compact(&self) -> Result<()> { - // - // High level strategy for compaction / image creation: - // - // 1. First, calculate the desired "partitioning" of the - // currently in-use key space. The goal is to partition the - // key space into roughly fixed-size chunks, but also take into - // account any existing image layers, and try to align the - // chunk boundaries with the existing image layers to avoid - // too much churn. Also try to align chunk boundaries with - // relation boundaries. In principle, we don't know about - // relation boundaries here, we just deal with key-value - // pairs, and the code in pgdatadir_mapping.rs knows how to - // map relations into key-value pairs. But in practice we know - // that 'field6' is the block number, and the fields 1-5 - // identify a relation. This is just an optimization, - // though. - // - // 2. Once we know the partitioning, for each partition, - // decide if it's time to create a new image layer. The - // criteria is: there has been too much "churn" since the last - // image layer? The "churn" is fuzzy concept, it's a - // combination of too many delta files, or too much WAL in - // total in the delta file. Or perhaps: if creating an image - // file would allow to delete some older files. - // - // 3. After that, we compact all level0 delta files if there - // are too many of them. While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. - // - // TODO: This high level strategy hasn't been implemented yet. 
- // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. Rewrite it. - let _compaction_cs = self.compaction_cs.lock().unwrap(); - - let target_file_size = self.get_checkpoint_distance(); - - // Define partitioning schema if needed - if let Ok(pgdir) = - tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) - { - // 2. Create new image layers for partitions that have been modified - // "enough". - let (partitioning, lsn) = pgdir.repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - )?; - let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; - if !layer_paths_to_upload.is_empty() - && self.upload_layers.load(atomic::Ordering::Relaxed) - { - storage_sync::schedule_layer_upload( - self.tenant_id, - self.timeline_id, - HashSet::from_iter(layer_paths_to_upload), - None, - ); - } - - // 3. Compact - let timer = self.compact_time_histo.start_timer(); - self.compact_level0(target_file_size)?; - timer.stop_and_record(); - } else { - debug!("Could not compact because no partitioning specified yet"); - } - - Ok(()) - } - - // Is it time to create a new image layer for the given partition? - fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result { - let layers = self.layers.read().unwrap(); - - for part_range in &partition.ranges { - let image_coverage = layers.image_coverage(part_range, lsn)?; - for (img_range, last_img) in image_coverage { - let img_lsn = if let Some(last_img) = last_img { - last_img.get_lsn_range().end - } else { - Lsn(0) - }; - // Let's consider an example: - // - // delta layer with LSN range 71-81 - // delta layer with LSN range 81-91 - // delta layer with LSN range 91-101 - // image layer at LSN 100 - // - // If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer, - // there's no need to create a new one. 
We check this case explicitly, to avoid passing - // a bogus range to count_deltas below, with start > end. It's even possible that there - // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed - // after we read last_record_lsn, which is passed here in the 'lsn' argument. - if img_lsn < lsn { - let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; - - debug!( - "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", - img_range.start, img_range.end, num_deltas, img_lsn, lsn - ); - if num_deltas >= self.get_image_creation_threshold() { - return Ok(true); - } - } - } - } - - Ok(false) - } - - fn create_image_layers( - &self, - partitioning: &KeyPartitioning, - lsn: Lsn, - force: bool, - ) -> Result> { - let timer = self.create_images_time_histo.start_timer(); - let mut image_layers: Vec = Vec::new(); - let mut layer_paths_to_upload = HashSet::new(); - for partition in partitioning.parts.iter() { - if force || self.time_for_new_image_layer(partition, lsn)? { - let img_range = - partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_id, - &img_range, - lsn, - )?; - - for range in &partition.ranges { - let mut key = range.start; - while key < range.end { - let img = self.get(key, lsn)?; - image_layer_writer.put_image(key, &img)?; - key = key.next(); - } - } - let image_layer = image_layer_writer.finish()?; - layer_paths_to_upload.insert(image_layer.path()); - image_layers.push(image_layer); - } - } - - // Sync the new layer to disk before adding it to the layer map, to make sure - // we don't garbage collect something based on the new layer, before it has - // reached the disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // Compaction creates multiple image layers. 
It would be better to create them all - // and fsync them all in parallel. - let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone()); - all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); - par_fsync::par_fsync(&all_paths)?; - - let mut layers = self.layers.write().unwrap(); - for l in image_layers { - layers.insert_historic(Arc::new(l)); - } - drop(layers); - timer.stop_and_record(); - - Ok(layer_paths_to_upload) - } - - /// - /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as - /// as Level 1 files. - /// - fn compact_level0(&self, target_file_size: u64) -> Result<()> { - let layers = self.layers.read().unwrap(); - let mut level0_deltas = layers.get_level0_deltas()?; - drop(layers); - - // Only compact if enough layers have accumulated. - if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() { - return Ok(()); - } - - // Gather the files to compact in this iteration. - // - // Start with the oldest Level 0 delta file, and collect any other - // level 0 files that form a contiguous sequence, such that the end - // LSN of previous file matches the start LSN of the next file. - // - // Note that if the files don't form such a sequence, we might - // "compact" just a single file. That's a bit pointless, but it allows - // us to get rid of the level 0 file, and compact the other files on - // the next iteration. This could probably made smarter, but such - // "gaps" in the sequence of level 0 files should only happen in case - // of a crash, partial download from cloud storage, or something like - // that, so it's not a big deal in practice. 
- level0_deltas.sort_by_key(|l| l.get_lsn_range().start); - let mut level0_deltas_iter = level0_deltas.iter(); - - let first_level0_delta = level0_deltas_iter.next().unwrap(); - let mut prev_lsn_end = first_level0_delta.get_lsn_range().end; - let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)]; - for l in level0_deltas_iter { - let lsn_range = l.get_lsn_range(); - - if lsn_range.start != prev_lsn_end { - break; - } - deltas_to_compact.push(Arc::clone(l)); - prev_lsn_end = lsn_range.end; - } - let lsn_range = Range { - start: deltas_to_compact.first().unwrap().get_lsn_range().start, - end: deltas_to_compact.last().unwrap().get_lsn_range().end, - }; - - info!( - "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", - lsn_range.start, - lsn_range.end, - deltas_to_compact.len(), - level0_deltas.len() - ); - for l in deltas_to_compact.iter() { - info!("compact includes {}", l.filename().display()); - } - // We don't need the original list of layers anymore. Drop it so that - // we don't accidentally use it later in the function. - drop(level0_deltas); - - // This iterator walks through all key-value pairs from all the layers - // we're compacting, in key, LSN order. - let all_values_iter = deltas_to_compact - .iter() - .map(|l| l.iter()) - .kmerge_by(|a, b| { - if let Ok((a_key, a_lsn, _)) = a { - if let Ok((b_key, b_lsn, _)) = b { - match a_key.cmp(b_key) { - Ordering::Less => true, - Ordering::Equal => a_lsn <= b_lsn, - Ordering::Greater => false, - } - } else { - false - } - } else { - true - } - }); - - // Merge the contents of all the input delta layers into a new set - // of delta layers, based on the current partitioning. - // - // TODO: this actually divides the layers into fixed-size chunks, not - // based on the partitioning. - // - // TODO: we should also opportunistically materialize and - // garbage collect what we can. 
- let mut new_layers = Vec::new(); - let mut prev_key: Option = None; - let mut writer: Option = None; - for x in all_values_iter { - let (key, lsn, value) = x?; - - if let Some(prev_key) = prev_key { - if key != prev_key && writer.is_some() { - let size = writer.as_mut().unwrap().size(); - if size > target_file_size { - new_layers.push(writer.take().unwrap().finish(prev_key.next())?); - writer = None; - } - } - } - - if writer.is_none() { - writer = Some(DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_id, - key, - lsn_range.clone(), - )?); - } - - writer.as_mut().unwrap().put_value(key, lsn, value)?; - prev_key = Some(key); - } - if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next())?); - } - - // Sync layers - if !new_layers.is_empty() { - let mut layer_paths: Vec = new_layers.iter().map(|l| l.path()).collect(); - - // also sync the directory - layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); - - // Fsync all the layer files and directory using multiple threads to - // minimize latency. 
- par_fsync::par_fsync(&layer_paths)?; - - layer_paths.pop().unwrap(); - } - - let mut layers = self.layers.write().unwrap(); - let mut new_layer_paths = HashSet::with_capacity(new_layers.len()); - for l in new_layers { - new_layer_paths.insert(l.path()); - layers.insert_historic(Arc::new(l)); - } - - // Now that we have reshuffled the data to set of new delta layers, we can - // delete the old ones - let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len()); - for l in deltas_to_compact { - l.delete()?; - if let Some(path) = l.local_path() { - layer_paths_do_delete.insert(path); - } - layers.remove_historic(l); - } - drop(layers); - - if self.upload_layers.load(atomic::Ordering::Relaxed) { - storage_sync::schedule_layer_upload( - self.tenant_id, - self.timeline_id, - new_layer_paths, - None, - ); - storage_sync::schedule_layer_delete( - self.tenant_id, - self.timeline_id, - layer_paths_do_delete, - ); - } - - Ok(()) - } - - /// Update information about which layer files need to be retained on - /// garbage collection. This is separate from actually performing the GC, - /// and is updated more frequently, so that compaction can remove obsolete - /// page versions more aggressively. - /// - /// TODO: that's wishful thinking, compaction doesn't actually do that - /// currently. - /// - /// The caller specifies how much history is needed with the two arguments: - /// - /// retain_lsns: keep a version of each page at these LSNs - /// cutoff: also keep everything newer than this LSN - /// - /// The 'retain_lsns' list is currently used to prevent removing files that - /// are needed by child timelines. In the future, the user might be able to - /// name additional points in time to retain. The caller is responsible for - /// collecting that information. - /// - /// The 'cutoff' point is used to retain recent versions that might still be - /// needed by read-only nodes. 
(As of this writing, the caller just passes - /// the latest LSN subtracted by a constant, and doesn't do anything smart - /// to figure out what read-only nodes might actually need.) - /// - fn update_gc_info(&self, retain_lsns: Vec, cutoff: Lsn, pitr: Duration) { - let mut gc_info = self.gc_info.write().unwrap(); - gc_info.retain_lsns = retain_lsns; - gc_info.cutoff = cutoff; - gc_info.pitr = pitr; - } - - /// - /// Garbage collect layer files on a timeline that are no longer needed. - /// - /// Currently, we don't make any attempt at removing unneeded page versions - /// within a layer file. We can only remove the whole file if it's fully - /// obsolete. - /// - fn gc(&self) -> Result { - let now = SystemTime::now(); - let mut result: GcResult = Default::default(); - let disk_consistent_lsn = self.get_disk_consistent_lsn(); - - let _compaction_cs = self.compaction_cs.lock().unwrap(); - - let gc_info = self.gc_info.read().unwrap(); - let retain_lsns = &gc_info.retain_lsns; - let cutoff = min(gc_info.cutoff, disk_consistent_lsn); - let pitr = gc_info.pitr; - - // Calculate pitr cutoff point. - // If we cannot determine a cutoff LSN, be conservative and don't GC anything. - let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn(); - - if let Ok(timeline) = - tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) - { - // First, calculate pitr_cutoff_timestamp and then convert it to LSN. - // If we don't have enough data to convert to LSN, - // play safe and don't remove any layers. - if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { - let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - - match timeline.find_lsn_for_timestamp(pitr_timestamp)? 
{ - LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, - LsnForTimestamp::Future(lsn) => { - debug!("future({})", lsn); - pitr_cutoff_lsn = cutoff; - } - LsnForTimestamp::Past(lsn) => { - debug!("past({})", lsn); - } - LsnForTimestamp::NoData(lsn) => { - debug!("nodata({})", lsn); - } - } - debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) - } - } else if cfg!(test) { - // We don't have local timeline in mocked cargo tests. - // So, just ignore pitr_interval setting in this case. - pitr_cutoff_lsn = cutoff; - } - - let new_gc_cutoff = Lsn::min(cutoff, pitr_cutoff_lsn); - - // Nothing to GC. Return early. - if *self.get_latest_gc_cutoff_lsn() >= new_gc_cutoff { - info!( - "Nothing to GC for timeline {}. cutoff_lsn {}", - self.timeline_id, new_gc_cutoff - ); - result.elapsed = now.elapsed()?; - return Ok(result); - } - - let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %cutoff).entered(); - - // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. - // See branch_timeline() for details. - *self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff; - - info!("GC starting"); - - debug!("retain_lsns: {:?}", retain_lsns); - - let mut layers_to_remove = Vec::new(); - - // Scan all on-disk layers in the timeline. - // - // Garbage collect the layer if all conditions are satisfied: - // 1. it is older than cutoff LSN; - // 2. it is older than PITR interval; - // 3. it doesn't need to be retained for 'retain_lsns'; - // 4. newer on-disk image layers cover the layer's whole key range - // - let mut layers = self.layers.write().unwrap(); - 'outer: for l in layers.iter_historic_layers() { - // This layer is in the process of being flushed to disk. - // It will be swapped out of the layer map, replaced with - // on-disk layers containing the same data. - // We can't GC it, as it's not on disk. We can't remove it - // from the layer map yet, as it would make its data - // inaccessible. 
- if l.is_in_memory() { - continue; - } - - result.layers_total += 1; - - // 1. Is it newer than cutoff point? - if l.get_lsn_range().end > cutoff { - debug!( - "keeping {} because it's newer than cutoff {}", - l.filename().display(), - cutoff - ); - result.layers_needed_by_cutoff += 1; - continue 'outer; - } - - // 2. It is newer than PiTR cutoff point? - if l.get_lsn_range().end > pitr_cutoff_lsn { - debug!( - "keeping {} because it's newer than pitr_cutoff_lsn {}", - l.filename().display(), - pitr_cutoff_lsn - ); - result.layers_needed_by_pitr += 1; - continue 'outer; - } - - // 3. Is it needed by a child branch? - // NOTE With that we would keep data that - // might be referenced by child branches forever. - // We can track this in child timeline GC and delete parent layers when - // they are no longer needed. This might be complicated with long inheritance chains. - for retain_lsn in retain_lsns { - // start_lsn is inclusive - if &l.get_lsn_range().start <= retain_lsn { - debug!( - "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", - l.filename().display(), - retain_lsn, - l.is_incremental(), - ); - result.layers_needed_by_branches += 1; - continue 'outer; - } - } - - // 4. Is there a later on-disk layer for this relation? - // - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. - // - // For example, imagine that the following layers exist: - // - // 1000 - image (A) - // 1000-2000 - delta (B) - // 2000 - image (C) - // 2000-3000 - delta (D) - // 3000 - image (E) - // - // If GC horizon is at 2500, we can remove layers A and B, but - // we cannot remove C, even though it's older than 2500, because - // the delta layer 2000-3000 depends on it. 
- if !layers - .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? - { - debug!( - "keeping {} because it is the latest layer", - l.filename().display() - ); - result.layers_not_updated += 1; - continue 'outer; - } - - // We didn't find any reason to keep this file, so remove it. - debug!( - "garbage collecting {} is_dropped: xx is_incremental: {}", - l.filename().display(), - l.is_incremental(), - ); - layers_to_remove.push(Arc::clone(l)); - } - - // Actually delete the layers from disk and remove them from the map. - // (couldn't do this in the loop above, because you cannot modify a collection - // while iterating it. BTreeMap::retain() would be another option) - let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); - for doomed_layer in layers_to_remove { - doomed_layer.delete()?; - if let Some(path) = doomed_layer.local_path() { - layer_paths_to_delete.insert(path); - } - layers.remove_historic(doomed_layer); - result.layers_removed += 1; - } - - if self.upload_layers.load(atomic::Ordering::Relaxed) { - storage_sync::schedule_layer_delete( - self.tenant_id, - self.timeline_id, - layer_paths_to_delete, - ); - } - - result.elapsed = now.elapsed()?; - Ok(result) - } - - /// - /// Reconstruct a value, using the given base image and WAL records in 'data'. - /// - fn reconstruct_value( - &self, - key: Key, - request_lsn: Lsn, - mut data: ValueReconstructState, - ) -> Result { - // Perform WAL redo if needed - data.records.reverse(); - - // If we have a page image, and no WAL, we're all set - if data.records.is_empty() { - if let Some((img_lsn, img)) = &data.img { - trace!( - "found page image for key {} at {}, no WAL redo required", - key, - img_lsn - ); - Ok(img.clone()) - } else { - bail!("base image for {} at {} not found", key, request_lsn); - } - } else { - // We need to do WAL redo. 
- // - // If we don't have a base image, then the oldest WAL record better initialize - // the page - if data.img.is_none() && !data.records.first().unwrap().1.will_init() { - bail!( - "Base image for {} at {} not found, but got {} WAL records", - key, - request_lsn, - data.records.len() - ); - } else { - let base_img = if let Some((_lsn, img)) = data.img { - trace!( - "found {} WAL records and a base image for {} at {}, performing WAL redo", - data.records.len(), - key, - request_lsn - ); - Some(img) - } else { - trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); - None - }; - - let last_rec_lsn = data.records.last().unwrap().0; - - let img = - self.walredo_mgr - .request_redo(key, request_lsn, base_img, data.records)?; - - if img.len() == page_cache::PAGE_SZ { - let cache = page_cache::get(); - cache.memorize_materialized_page( - self.tenant_id, - self.timeline_id, - key, - last_rec_lsn, - &img, - ); - } - - Ok(img) - } - } - } -} - -/// Helper function for get_reconstruct_data() to add the path of layers traversed -/// to an error, as anyhow context information. -fn layer_traversal_error( - msg: String, - path: Vec<(ValueReconstructResult, Lsn, Arc)>, -) -> anyhow::Result<()> { - // We want the original 'msg' to be the outermost context. The outermost context - // is the most high-level information, which also gets propagated to the client. - let mut msg_iter = path - .iter() - .map(|(r, c, l)| { - format!( - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, - c, - l.filename().display() - ) - }) - .chain(std::iter::once(msg)); - // Construct initial message from the first traversed layer - let err = anyhow!(msg_iter.next().unwrap()); - - // Append all subsequent traversals, and the error message 'msg', as contexts. 
- Err(msg_iter.fold(err, |err, msg| err.context(msg))) -} - -struct LayeredTimelineWriter<'a> { - tl: &'a LayeredTimeline, - _write_guard: MutexGuard<'a, ()>, -} - -impl Deref for LayeredTimelineWriter<'_> { - type Target = dyn Timeline; - - fn deref(&self) -> &Self::Target { - self.tl - } -} - -impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { - fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> { - self.tl.put_value(key, lsn, value) - } - - fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()> { - self.tl.put_tombstone(key_range, lsn) - } - - /// - /// Remember the (end of) last valid WAL record remembered in the timeline. - /// - fn finish_write(&self, new_lsn: Lsn) { - self.tl.finish_write(new_lsn); - } -} - /// Dump contents of a layer file to stdout. pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { use std::os::unix::fs::FileExt; @@ -2633,34 +842,18 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { file.read_exact_at(&mut header_buf, 0)?; match u16::from_be_bytes(header_buf) { - crate::IMAGE_FILE_MAGIC => ImageLayer::new_for_path(path, file)?.dump(verbose)?, - crate::DELTA_FILE_MAGIC => DeltaLayer::new_for_path(path, file)?.dump(verbose)?, + crate::IMAGE_FILE_MAGIC => { + image_layer::ImageLayer::new_for_path(path, file)?.dump(verbose)? + } + crate::DELTA_FILE_MAGIC => { + delta_layer::DeltaLayer::new_for_path(path, file)?.dump(verbose)? + } magic => bail!("unrecognized magic identifier: {:?}", magic), } Ok(()) } -/// Add a suffix to a layer file's name: .{num}.old -/// Uses the first available num (starts at 0) -fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { - let filename = path - .file_name() - .ok_or_else(|| anyhow!("Path {} don't have a file name", path.display()))? - .to_string_lossy(); - let mut new_path = path.clone(); - - for i in 0u32.. 
{ - new_path.set_file_name(format!("{}.{}.old", filename, i)); - if !new_path.exists() { - std::fs::rename(&path, &new_path)?; - return Ok(()); - } - } - - bail!("couldn't find an unused backup number for {:?}", path) -} - pub fn load_metadata( conf: &'static PageServerConf, timeline_id: ZTimelineId, @@ -2690,9 +883,11 @@ pub fn load_metadata( /// #[cfg(test)] pub mod tests { + use super::metadata::METADATA_FILE_NAME; use super::*; use crate::keyspace::KeySpaceAccum; use crate::repository::repo_harness::*; + use crate::repository::{Key, Value}; use rand::{thread_rng, Rng}; #[test] @@ -2823,7 +1018,7 @@ pub mod tests { let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO); + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; tline.gc()?; @@ -2893,7 +1088,7 @@ pub mod tests { // Perform a cycle of checkpoint, compaction, and GC println!("checkpointing {}", lsn); let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO); + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; tline.gc()?; @@ -2970,7 +1165,7 @@ pub mod tests { // Perform a cycle of checkpoint, compaction, and GC println!("checkpointing {}", lsn); let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO); + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; tline.gc()?; diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index ed342c0cca..ce5cb57745 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -316,6 +316,18 @@ impl Layer for DeltaLayer { } } + fn key_iter<'a>(&'a self) -> Box + 'a> { + let inner = match self.load() { + Ok(inner) => 
inner, + Err(e) => panic!("Failed to load a delta layer: {e:?}"), + }; + + match DeltaKeyIter::new(inner) { + Ok(iter) => Box::new(iter), + Err(e) => panic!("Layer index is corrupted: {e:?}"), + } + } + fn delete(&self) -> Result<()> { // delete underlying file fs::remove_file(self.path())?; @@ -660,11 +672,21 @@ impl DeltaLayerWriter { /// The values must be appended in key, lsn order. /// pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init()) + } + + pub fn put_value_bytes( + &mut self, + key: Key, + lsn: Lsn, + val: &[u8], + will_init: bool, + ) -> Result<()> { assert!(self.lsn_range.start <= lsn); - let off = self.blob_writer.write_blob(&Value::ser(&val)?)?; + let off = self.blob_writer.write_blob(val)?; - let blob_ref = BlobRef::new(off, val.will_init()); + let blob_ref = BlobRef::new(off, will_init); let delta_key = DeltaKey::from_key_lsn(&key, lsn); self.tree.append(&delta_key.0, blob_ref.0)?; @@ -822,3 +844,75 @@ impl<'a> DeltaValueIter<'a> { } } } +/// +/// Iterator over all keys stored in a delta layer +/// +/// FIXME: This creates a Vector to hold all keys. +/// That takes up quite a lot of memory. Should do this in a more streaming +/// fashion. 
+/// +struct DeltaKeyIter { + all_keys: Vec<(DeltaKey, u64)>, + next_idx: usize, +} + +impl Iterator for DeltaKeyIter { + type Item = (Key, Lsn, u64); + + fn next(&mut self) -> Option { + if self.next_idx < self.all_keys.len() { + let (delta_key, size) = &self.all_keys[self.next_idx]; + + let key = delta_key.key(); + let lsn = delta_key.lsn(); + + self.next_idx += 1; + Some((key, lsn, *size)) + } else { + None + } + } +} + +impl<'a> DeltaKeyIter { + fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result { + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); + + let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new(); + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, value| { + let delta_key = DeltaKey::from_slice(key); + let pos = BlobRef(value).pos(); + if let Some(last) = all_keys.last_mut() { + if last.0.key() == delta_key.key() { + return true; + } else { + // subtract offset of new key BLOB and first blob of this key + // to get total size if values associated with this key + let first_pos = last.1; + last.1 = pos - first_pos; + } + } + all_keys.push((delta_key, pos)); + true + }, + )?; + if let Some(last) = all_keys.last_mut() { + // Last key occupies all space till end of layer + last.1 = std::fs::metadata(&file.file.path)?.len() - last.1; + } + let iter = DeltaKeyIter { + all_keys, + next_idx: 0, + }; + + Ok(iter) + } +} diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/layered_repository/ephemeral_file.rs index cdde9d5d13..299bb4e873 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/layered_repository/ephemeral_file.rs @@ -43,7 +43,7 @@ pub struct EphemeralFile { _timelineid: ZTimelineId, file: Arc, - size: u64, + pub size: u64, } impl EphemeralFile { diff --git a/pageserver/src/layered_repository/inmemory_layer.rs 
b/pageserver/src/layered_repository/inmemory_layer.rs index 87e6877520..5f269a868f 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -15,6 +15,7 @@ use crate::layered_repository::storage_layer::{ use crate::repository::{Key, Value}; use crate::walrecord; use anyhow::{bail, ensure, Result}; +use std::cell::RefCell; use std::collections::HashMap; use tracing::*; use utils::{ @@ -30,6 +31,12 @@ use std::ops::Range; use std::path::PathBuf; use std::sync::RwLock; +thread_local! { + /// A buffer for serializing object during [`InMemoryLayer::put_value`]. + /// This buffer is reused for each serialization to avoid additional malloc calls. + static SER_BUFFER: RefCell> = RefCell::new(Vec::new()); +} + pub struct InMemoryLayer { conf: &'static PageServerConf, tenantid: ZTenantId, @@ -233,6 +240,14 @@ impl Layer for InMemoryLayer { } impl InMemoryLayer { + /// + /// Get layer size on the disk + /// + pub fn size(&self) -> Result { + let inner = self.inner.read().unwrap(); + Ok(inner.file.size) + } + /// /// Create a new, empty, in-memory layer /// @@ -270,10 +285,17 @@ impl InMemoryLayer { pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timelineid, lsn); let mut inner = self.inner.write().unwrap(); - inner.assert_writeable(); - let off = inner.file.write_blob(&Value::ser(val)?)?; + let off = { + SER_BUFFER.with(|x| -> Result<_> { + let mut buf = x.borrow_mut(); + buf.clear(); + val.ser_into(&mut (*buf))?; + let off = inner.file.write_blob(&buf)?; + Ok(off) + })? 
+ }; let vec_map = inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, off).unwrap().0; @@ -342,8 +364,8 @@ impl InMemoryLayer { // Write all page versions for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf)?; - let val = Value::des(&buf)?; - delta_layer_writer.put_value(key, *lsn, val)?; + let will_init = Value::des(&buf)?.will_init(); + delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?; } } diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index f7f51bf21f..be590c88c2 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -10,9 +10,9 @@ //! corresponding files are written to disk. //! +use crate::layered_repository::inmemory_layer::InMemoryLayer; use crate::layered_repository::storage_layer::Layer; use crate::layered_repository::storage_layer::{range_eq, range_overlaps}; -use crate::layered_repository::InMemoryLayer; use crate::repository::Key; use anyhow::Result; use lazy_static::lazy_static; diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index aaf765b83d..e10330bdd3 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -139,6 +139,12 @@ pub trait Layer: Send + Sync { /// Iterate through all keys and values stored in the layer fn iter(&self) -> Box> + '_>; + /// Iterate through all keys stored in the layer. Returns key, lsn and value size + /// It is used only for compaction and so is currently implemented only for DeltaLayer + fn key_iter(&self) -> Box + '_> { + panic!("Not implemented") + } + /// Permanently remove this layer from disk. 
fn delete(&self) -> Result<()>; diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs new file mode 100644 index 0000000000..703e1993e5 --- /dev/null +++ b/pageserver/src/layered_repository/timeline.rs @@ -0,0 +1,2173 @@ +//! + +use anyhow::{anyhow, bail, ensure, Context, Result}; +use bytes::Bytes; +use fail::fail_point; +use itertools::Itertools; +use lazy_static::lazy_static; +use tracing::*; + +use std::cmp::{max, min, Ordering}; +use std::collections::HashSet; +use std::fs; +use std::fs::{File, OpenOptions}; +use std::io::Write; +use std::ops::{Deref, Range}; +use std::path::PathBuf; +use std::sync::atomic::{self, AtomicBool, AtomicIsize, Ordering as AtomicOrdering}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError}; +use std::time::{Duration, SystemTime}; + +use metrics::{ + register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, + register_uint_gauge_vec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, + IntGaugeVec, UIntGauge, UIntGaugeVec, +}; + +use crate::layered_repository::{ + delta_layer::{DeltaLayer, DeltaLayerWriter}, + ephemeral_file::is_ephemeral_file, + filename::{DeltaFileName, ImageFileName}, + image_layer::{ImageLayer, ImageLayerWriter}, + inmemory_layer::InMemoryLayer, + layer_map::{LayerMap, SearchResult}, + metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, + par_fsync, + storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}, +}; + +use crate::config::PageServerConf; +use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::tenant_config::TenantConfOpt; +use crate::DatadirTimeline; + +use postgres_ffi::xlog_utils::to_pg_timestamp; +use utils::{ + lsn::{AtomicLsn, Lsn, RecordLsn}, + seqwait::SeqWait, + zid::{ZTenantId, ZTimelineId}, +}; + +use crate::repository::{GcResult, RepositoryTimeline, Timeline, TimelineWriter}; +use 
crate::repository::{Key, Value}; +use crate::thread_mgr; +use crate::virtual_file::VirtualFile; +use crate::walreceiver::IS_WAL_RECEIVER; +use crate::walredo::WalRedoManager; +use crate::CheckpointConfig; +use crate::{page_cache, storage_sync}; + +// Metrics collected on operations on the storage repository. +lazy_static! { + pub static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( + "pageserver_storage_operations_seconds", + "Time spent on storage operations", + &["operation", "tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + +// Metrics collected on operations on the storage repository. +lazy_static! { + static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( + "pageserver_getpage_reconstruct_seconds", + "Time spent in reconstruct_value", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + +lazy_static! { + static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!( + "pageserver_materialized_cache_hits_total", + "Number of cache hits from materialized page cache", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); + static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!( + "pageserver_wait_lsn_seconds", + "Time spent waiting for WAL to arrive", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + +lazy_static! { + static ref LAST_RECORD_LSN: IntGaugeVec = register_int_gauge_vec!( + "pageserver_last_record_lsn", + "Last record LSN grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + +// Metrics for determining timeline's physical size. +// A layered timeline's physical is defined as the total size of +// (delta/image) layer files on disk. +lazy_static! 
{ + static ref CURRENT_PHYSICAL_SIZE: UIntGaugeVec = register_uint_gauge_vec!( + "pageserver_current_physical_size", + "Current physical size grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + +// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, +// or in testing they estimate how much we would upload if we did. +lazy_static! { + static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!( + "pageserver_created_persistent_files_total", + "Number of files created that are meant to be uploaded to cloud storage", + ) + .expect("failed to define a metric"); + static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!( + "pageserver_written_persistent_bytes_total", + "Total bytes written that are meant to be uploaded to cloud storage", + ) + .expect("failed to define a metric"); +} + +#[derive(Clone)] +pub enum LayeredTimelineEntry { + Loaded(Arc), + Unloaded { + id: ZTimelineId, + metadata: TimelineMetadata, + }, +} + +impl LayeredTimelineEntry { + fn timeline_id(&self) -> ZTimelineId { + match self { + LayeredTimelineEntry::Loaded(timeline) => timeline.timeline_id, + LayeredTimelineEntry::Unloaded { id, .. } => *id, + } + } + + pub fn ancestor_timeline_id(&self) -> Option { + match self { + LayeredTimelineEntry::Loaded(timeline) => { + timeline.ancestor_timeline.as_ref().map(|t| t.timeline_id()) + } + LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_timeline(), + } + } + + pub fn ancestor_lsn(&self) -> Lsn { + match self { + LayeredTimelineEntry::Loaded(timeline) => timeline.ancestor_lsn, + LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_lsn(), + } + } + + fn ensure_loaded(&self) -> anyhow::Result<&Arc> { + match self { + LayeredTimelineEntry::Loaded(timeline) => Ok(timeline), + LayeredTimelineEntry::Unloaded { .. 
} => { + anyhow::bail!("timeline is unloaded") + } + } + } + + pub fn layer_removal_guard(&self) -> Result>, anyhow::Error> { + match self { + LayeredTimelineEntry::Loaded(timeline) => timeline + .layer_removal_cs + .try_lock() + .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) + .map(Some), + + LayeredTimelineEntry::Unloaded { .. } => Ok(None), + } + } +} + +impl From for RepositoryTimeline { + fn from(entry: LayeredTimelineEntry) -> Self { + match entry { + LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _), + LayeredTimelineEntry::Unloaded { metadata, .. } => { + RepositoryTimeline::Unloaded { metadata } + } + } + } +} + +pub struct LayeredTimeline { + conf: &'static PageServerConf, + tenant_conf: Arc>, + + tenant_id: ZTenantId, + pub timeline_id: ZTimelineId, + + pub layers: RwLock, + + last_freeze_at: AtomicLsn, + + // WAL redo manager + walredo_mgr: Arc, + + // What page versions do we hold in the repository? If we get a + // request > last_record_lsn, we need to wait until we receive all + // the WAL up to the request. The SeqWait provides functions for + // that. TODO: If we get a request for an old LSN, such that the + // versions have already been garbage collected away, we should + // throw an error, but we don't track that currently. + // + // last_record_lsn.load().last points to the end of last processed WAL record. + // + // We also remember the starting point of the previous record in + // 'last_record_lsn.load().prev'. It's used to set the xl_prev pointer of the + // first WAL record when the node is started up. But here, we just + // keep track of it. + last_record_lsn: SeqWait, + + // All WAL records have been processed and stored durably on files on + // local disk, up to this LSN. On crash and restart, we need to re-process + // the WAL starting from this point. 
+ // + // Some later WAL records might have been processed and also flushed to disk + // already, so don't be surprised to see some, but there's no guarantee on + // them yet. + disk_consistent_lsn: AtomicLsn, + + // Parent timeline that this timeline was branched from, and the LSN + // of the branch point. + ancestor_timeline: Option, + ancestor_lsn: Lsn, + + // Metrics + reconstruct_time_histo: Histogram, + materialized_page_cache_hit_counter: IntCounter, + flush_time_histo: Histogram, + compact_time_histo: Histogram, + create_images_time_histo: Histogram, + last_record_gauge: IntGauge, + wait_lsn_time_histo: Histogram, + current_physical_size_gauge: UIntGauge, + + /// If `true`, will backup its files that appear after each checkpointing to the remote storage. + upload_layers: AtomicBool, + + /// Ensures layers aren't frozen by checkpointer between + /// [`LayeredTimeline::get_layer_for_write`] and layer reads. + /// Locked automatically by [`LayeredTimelineWriter`] and checkpointer. + /// Must always be acquired before the layer map/individual layer lock + /// to avoid deadlock. + write_lock: Mutex<()>, + + /// Used to ensure that there is only one thread + layer_flush_lock: Mutex<()>, + + /// Layer removal lock. + /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. + /// This lock is acquired in [`LayeredTimeline::gc`], [`LayeredTimeline::compact`], + /// and [`LayeredRepository::delete_timeline`]. + layer_removal_cs: Mutex<()>, + + // Needed to ensure that we can't create a branch at a point that was already garbage collected + pub latest_gc_cutoff_lsn: RwLock, + + // List of child timelines and their branch points. This is needed to avoid + // garbage collecting data that is still needed by the child timelines. + pub gc_info: RwLock, + + // It may change across major versions so for simplicity + // keep it after running initdb for a timeline. 
+ // It is needed in checks when we want to error on some operations + // when they are requested for pre-initdb lsn. + // It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn", + // though lets keep them both for better error visibility. + pub initdb_lsn: Lsn, + + /// When did we last calculate the partitioning? + partitioning: Mutex<(KeyPartitioning, Lsn)>, + + /// Configuration: how often should the partitioning be recalculated. + repartition_threshold: u64, + + /// Current logical size of the "datadir", at the last LSN. + current_logical_size: AtomicIsize, +} + +/// Inherit all the functions from DatadirTimeline, to provide the +/// functionality to store PostgreSQL relations, SLRUs, etc. in a +/// LayeredTimeline. +impl DatadirTimeline for LayeredTimeline {} + +/// +/// Information about how much history needs to be retained, needed by +/// Garbage Collection. +/// +pub struct GcInfo { + /// Specific LSNs that are needed. + /// + /// Currently, this includes all points where child branches have + /// been forked off from. In the future, could also include + /// explicit user-defined snapshot points. + pub retain_lsns: Vec, + + /// In addition to 'retain_lsns', keep everything newer than this + /// point. + /// + /// This is calculated by subtracting 'gc_horizon' setting from + /// last-record LSN + /// + /// FIXME: is this inclusive or exclusive? + pub horizon_cutoff: Lsn, + + /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this + /// point. + /// + /// This is calculated by finding a number such that a record is needed for PITR + /// if only if its LSN is larger than 'pitr_cutoff'. 
+ pub pitr_cutoff: Lsn, +} + +/// Public interface functions +impl Timeline for LayeredTimeline { + fn get_ancestor_lsn(&self) -> Lsn { + self.ancestor_lsn + } + + fn get_ancestor_timeline_id(&self) -> Option { + self.ancestor_timeline + .as_ref() + .map(LayeredTimelineEntry::timeline_id) + } + + /// Wait until WAL has been received up to the given LSN. + fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + // This should never be called from the WAL receiver thread, because that could lead + // to a deadlock. + ensure!( + !IS_WAL_RECEIVER.with(|c| c.get()), + "wait_lsn called by WAL receiver thread" + ); + + self.wait_lsn_time_histo.observe_closure_duration( + || self.last_record_lsn + .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) + .with_context(|| { + format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", + lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() + ) + }))?; + + Ok(()) + } + + fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard { + self.latest_gc_cutoff_lsn.read().unwrap() + } + + /// Look up the value with the given a key + fn get(&self, key: Key, lsn: Lsn) -> Result { + debug_assert!(lsn <= self.get_last_record_lsn()); + + // Check the page cache. We will get back the most recent page with lsn <= `lsn`. + // The cached image can be returned directly if there is no WAL between the cached image + // and requested LSN. The cached image can also be used to reduce the amount of WAL needed + // for redo. 
+ let cached_page_img = match self.lookup_cached_page(&key, lsn) { + Some((cached_lsn, cached_img)) => { + match cached_lsn.cmp(&lsn) { + Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check + Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn + } + Some((cached_lsn, cached_img)) + } + None => None, + }; + + let mut reconstruct_state = ValueReconstructState { + records: Vec::new(), + img: cached_page_img, + }; + + self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; + + self.reconstruct_time_histo + .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) + } + + /// Public entry point for checkpoint(). All the logic is in the private + /// checkpoint_internal function, this public facade just wraps it for + /// metrics collection. + fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { + match cconf { + CheckpointConfig::Flush => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true) + } + CheckpointConfig::Forced => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true)?; + self.compact() + } + } + } + + /// + /// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn. 
+    ///
+    /// Returns an error when `lsn` is older than the GC cutoff behind the
+    /// `latest_gc_cutoff_lsn` guard, and `Ok(())` otherwise.
+    ///
+    /// NOTE(review): only the GC cutoff is compared in this function body;
+    /// `initdb_lsn` is not consulted here.
+    /// NOTE(review): `RwLockReadGuard` (like `Box` and `anyhow::Result` further
+    /// down) appears without generic arguments, presumably lost when this copy
+    /// of the patch was extracted -- confirm against the repository.
+    ///
+    fn check_lsn_is_in_scope(
+        &self,
+        lsn: Lsn,
+        latest_gc_cutoff_lsn: &RwLockReadGuard,
+    ) -> Result<()> {
+        ensure!(
+            lsn >= **latest_gc_cutoff_lsn,
+            "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
+            lsn,
+            **latest_gc_cutoff_lsn,
+        );
+        Ok(())
+    }
+
+    /// Returns the `last` field of the current `last_record_lsn` value.
+    fn get_last_record_lsn(&self) -> Lsn {
+        self.last_record_lsn.load().last
+    }
+
+    /// Returns the `prev` field of the current `last_record_lsn` value.
+    fn get_prev_record_lsn(&self) -> Lsn {
+        self.last_record_lsn.load().prev
+    }
+
+    /// Returns the whole current `last_record_lsn` value (`last` and `prev` together).
+    fn get_last_record_rlsn(&self) -> RecordLsn {
+        self.last_record_lsn.load()
+    }
+
+    /// Returns the in-memory copy of `disk_consistent_lsn`.
+    fn get_disk_consistent_lsn(&self) -> Lsn {
+        self.disk_consistent_lsn.load()
+    }
+
+    /// Creates a boxed `LayeredTimelineWriter` for this timeline.
+    ///
+    /// Acquires `write_lock`; the guard is moved into the writer's
+    /// `_write_guard` field. Panics if `write_lock.lock()` returns an error.
+    fn writer<'a>(&'a self) -> Box {
+        Box::new(LayeredTimelineWriter {
+            tl: self,
+            _write_guard: self.write_lock.lock().unwrap(),
+        })
+    }
+
+    /// Returns the current value of `current_physical_size_gauge`.
+    fn get_physical_size(&self) -> u64 {
+        self.current_physical_size_gauge.get()
+    }
+
+    /// Computes the physical size by scanning the timeline directory instead of
+    /// reading the gauge.
+    ///
+    /// Sums the on-disk length of every directory entry whose file name parses
+    /// as an `ImageFileName` or a `DeltaFileName`; all other entries are ignored.
+    /// Errors from `fs::read_dir`, from reading an entry, and from `metadata()`
+    /// are propagated.
+    fn get_physical_size_non_incremental(&self) -> anyhow::Result {
+        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
+        // total size of layer files in the current timeline directory
+        let mut total_physical_size = 0;
+
+        for direntry in fs::read_dir(timeline_path)? {
+            let direntry = direntry?;
+            let fname = direntry.file_name();
+            let fname = fname.to_string_lossy();
+
+            // Only files that look like layer files count towards the size.
+            if ImageFileName::parse_str(&fname).is_some()
+                || DeltaFileName::parse_str(&fname).is_some()
+            {
+                total_physical_size += direntry.metadata()?.len();
+            }
+        }
+
+        Ok(total_physical_size)
+    }
+}
+
+impl LayeredTimeline {
+    /// Returns the tenant-specific `checkpoint_distance` if one is set,
+    /// otherwise the value from `conf.default_tenant_conf`.
+    fn get_checkpoint_distance(&self) -> u64 {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .checkpoint_distance
+            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
+    }
+
+    /// Returns the tenant-specific `compaction_target_size` if one is set,
+    /// otherwise the value from `conf.default_tenant_conf`.
+    fn get_compaction_target_size(&self) -> u64 {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .compaction_target_size
+            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
+    }
+
+    /// Returns the tenant-specific `compaction_threshold` if one is set,
+    /// otherwise the value from `conf.default_tenant_conf`.
+    fn get_compaction_threshold(&self) -> usize {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .compaction_threshold
+            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
+    }
+
+    /// Returns the tenant-specific `image_creation_threshold` if one is set,
+    /// otherwise the value from `conf.default_tenant_conf`.
+    fn get_image_creation_threshold(&self) -> usize {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .image_creation_threshold
+            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
+    }
+
+    /// Open a Timeline handle.
+    ///
+    /// Loads the metadata for the timeline into memory, but not the layer map.
+    #[allow(clippy::too_many_arguments)]
+    // NOTE(review): `Arc>`, `Option` and `Arc` below appear without their generic
+    // arguments, presumably lost when this copy of the patch was extracted --
+    // confirm the real parameter types against the repository.
+    pub fn new(
+        conf: &'static PageServerConf,
+        tenant_conf: Arc>,
+        metadata: TimelineMetadata,
+        ancestor: Option,
+        timeline_id: ZTimelineId,
+        tenant_id: ZTenantId,
+        walredo_mgr: Arc,
+        upload_layers: bool,
+    ) -> LayeredTimeline {
+        // Look up the metric handles labelled with this tenant and timeline.
+        // Each lookup is unwrapped, so a failed lookup panics.
+        let reconstruct_time_histo = RECONSTRUCT_TIME
+            .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
+            .unwrap();
+        let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
+            .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
+            .unwrap();
+        let flush_time_histo = STORAGE_TIME
+            .get_metric_with_label_values(&[
+                "layer flush",
+                &tenant_id.to_string(),
+                &timeline_id.to_string(),
+            ])
+            .unwrap();
+        let compact_time_histo = STORAGE_TIME
+            .get_metric_with_label_values(&[
+                "compact",
+                &tenant_id.to_string(),
+                &timeline_id.to_string(),
+            ])
+            .unwrap();
+        let create_images_time_histo = STORAGE_TIME
+            .get_metric_with_label_values(&[
+                "create images",
+                &tenant_id.to_string(),
+                &timeline_id.to_string(),
+            ])
+            .unwrap();
+        let last_record_gauge = LAST_RECORD_LSN
+            .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
+            .unwrap();
+        let wait_lsn_time_histo = WAIT_LSN_TIME
+            .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
+            .unwrap();
+        let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
+            .unwrap();
+
+        let mut result = LayeredTimeline {
+            conf,
+            tenant_conf,
+            timeline_id,
+            tenant_id,
+            // The layer map starts out empty; it is filled in by load_layer_map().
+            layers: RwLock::new(LayerMap::default()),
+
+            walredo_mgr,
+
+            // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
+            last_record_lsn: SeqWait::new(RecordLsn {
+                last: metadata.disk_consistent_lsn(),
+                prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)),
+            }),
+            disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0),
+
+            last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0),
+
+            ancestor_timeline: ancestor,
+            ancestor_lsn: metadata.ancestor_lsn(),
+
+            reconstruct_time_histo,
+            materialized_page_cache_hit_counter,
+            flush_time_histo,
+            compact_time_histo,
+            create_images_time_histo,
+            last_record_gauge,
+            wait_lsn_time_histo,
+            current_physical_size_gauge,
+
+            upload_layers: AtomicBool::new(upload_layers),
+
+            write_lock: Mutex::new(()),
+            layer_flush_lock: Mutex::new(()),
+            layer_removal_cs: Mutex::new(()),
+
+            gc_info: RwLock::new(GcInfo {
+                retain_lsns: Vec::new(),
+                horizon_cutoff: Lsn(0),
+                pitr_cutoff: Lsn(0),
+            }),
+
+            latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()),
+            initdb_lsn: metadata.initdb_lsn(),
+
+            current_logical_size: AtomicIsize::new(0),
+            partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
+            // Placeholder; the real value is computed right below.
+            repartition_threshold: 0,
+        };
+        // get_checkpoint_distance() is a method on the struct, so the threshold can
+        // only be derived once 'result' exists.
+        result.repartition_threshold = result.get_checkpoint_distance() / 10;
+        result
+    }
+
+    ///
+    /// Scan the timeline directory to populate the layer map.
+    ///
+    /// Every image and delta layer file that is not newer than
+    /// 'disk_consistent_lsn' is inserted into the layer map as a historic layer.
+    /// Layer files from the "future" are passed to rename_to_backup() and skipped,
+    /// ephemeral files are deleted, and the metadata file and "*.old" files are
+    /// ignored. Afterwards 'next_open_layer_at' is set to
+    /// 'disk_consistent_lsn' + 1 and the physical size gauge is set to the total
+    /// size of the loaded layer files.
+    ///
+    /// Returns nothing on success; I/O errors are propagated.
+    ///
+    pub fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
+        let mut layers = self.layers.write().unwrap();
+        let mut num_layers = 0;
+
+        // Scan timeline directory and create ImageFileName and DeltaFilename
+        // structs representing all files on disk
+        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
+        // total size of layer files in the current timeline directory
+        let mut total_physical_size = 0;
+
+        for direntry in fs::read_dir(timeline_path)? {
+            let direntry = direntry?;
+            let fname = direntry.file_name();
+            let fname = fname.to_string_lossy();
+
+            if let Some(imgfilename) = ImageFileName::parse_str(&fname) {
+                // create an ImageLayer struct for each image file.
+                if imgfilename.lsn > disk_consistent_lsn {
+                    warn!(
+                        "found future image layer {} on timeline {} disk_consistent_lsn is {}",
+                        imgfilename, self.timeline_id, disk_consistent_lsn
+                    );
+
+                    rename_to_backup(direntry.path())?;
+                    continue;
+                }
+
+                let layer =
+                    ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename);
+
+                trace!("found layer {}", layer.filename().display());
+                total_physical_size += layer.path().metadata()?.len();
+                layers.insert_historic(Arc::new(layer));
+                num_layers += 1;
+            } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
+                // Create a DeltaLayer struct for each delta file.
+                // The end-LSN is exclusive, while disk_consistent_lsn is
+                // inclusive. For example, if disk_consistent_lsn is 100, it is
+                // OK for a delta layer to have end LSN 101, but if the end LSN
+                // is 102, then it might not have been fully flushed to disk
+                // before crash.
+                if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
+                    warn!(
+                        "found future delta layer {} on timeline {} disk_consistent_lsn is {}",
+                        deltafilename, self.timeline_id, disk_consistent_lsn
+                    );
+
+                    rename_to_backup(direntry.path())?;
+                    continue;
+                }
+
+                let layer =
+                    DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename);
+
+                trace!("found layer {}", layer.filename().display());
+                total_physical_size += layer.path().metadata()?.len();
+                layers.insert_historic(Arc::new(layer));
+                num_layers += 1;
+            } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
+                // ignore these
+            } else if is_ephemeral_file(&fname) {
+                // Delete any old ephemeral files
+                trace!("deleting old ephemeral file in timeline dir: {}", fname);
+                fs::remove_file(direntry.path())?;
+            } else {
+                warn!("unrecognized filename in timeline dir: {}", fname);
+            }
+        }
+
+        // The next in-memory layer will start right after the last LSN known to be on disk.
+        layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1);
+
+        info!(
+            "loaded layer map with {} layers at {}, total physical size: {}",
+            num_layers, disk_consistent_lsn, total_physical_size
+        );
+        self.current_physical_size_gauge.set(total_physical_size);
+
+        Ok(())
+    }
+
+    /// (Re-)calculate the logical size of the database at the latest LSN.
+    ///
+    /// This can be a slow operation.
+    ///
+    /// The result is stored in 'current_logical_size'; nothing is returned on
+    /// success.
+    pub fn init_logical_size(&self) -> Result<()> {
+        // Try a fast-path first:
+        // Copy logical size from ancestor timeline if there has been no changes on this
+        // branch, and no changes on the ancestor branch since the branch point.
+        if self.get_ancestor_lsn() == self.get_last_record_lsn() && self.ancestor_timeline.is_some()
+        {
+            let ancestor = self.get_ancestor_timeline()?;
+            let ancestor_logical_size = ancestor.get_current_logical_size();
+            // Check LSN after getting logical size to exclude race condition
+            // when ancestor timeline is concurrently updated.
+            //
+            // Logical size 0 means that it was not initialized, so don't believe that.
+            if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn {
+                self.current_logical_size
+                    .store(ancestor_logical_size as isize, AtomicOrdering::SeqCst);
+                debug!(
+                    "logical size copied from ancestor: {}",
+                    ancestor_logical_size
+                );
+                return Ok(());
+            }
+        }
+
+        // Have to calculate it the hard way
+        let last_lsn = self.get_last_record_lsn();
+        let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?;
+        self.current_logical_size
+            .store(logical_size as isize, AtomicOrdering::SeqCst);
+        debug!("calculated logical size the hard way: {}", logical_size);
+        Ok(())
+    }
+
+    /// Retrieve current logical size of the timeline
+    ///
+    /// NOTE: counted incrementally, includes ancestors.
+    ///
+    /// If the stored value does not fit into a `usize` (i.e. it is negative),
+    /// an error is logged and 0 is returned.
+    pub fn get_current_logical_size(&self) -> usize {
+        let current_logical_size = self.current_logical_size.load(AtomicOrdering::Acquire);
+        match usize::try_from(current_logical_size) {
+            Ok(sz) => sz,
+            Err(_) => {
+                error!(
+                    "current_logical_size is out of range: {}",
+                    current_logical_size
+                );
+                0
+            }
+        }
+    }
+
+    ///
+    /// Collect into 'reconstruct_state' the data needed to reconstruct the value
+    /// of 'key' at 'request_lsn'.
+    ///
+    /// The search starts on this timeline and walks backwards in LSN order
+    /// through the open layer, the frozen layers and the historic layers. It
+    /// continues on the ancestor timeline once the search reaches
+    /// 'ancestor_lsn'.
+    ///
+    /// The layer map of whichever timeline is currently being searched is
+    /// read-locked inside the loop, once per iteration.
+    ///
+    /// Returns an error, including the path of layers traversed, if the data
+    /// cannot be found or an iteration makes no progress.
+    fn get_reconstruct_data(
+        &self,
+        key: Key,
+        request_lsn: Lsn,
+        reconstruct_state: &mut ValueReconstructState,
+    ) -> anyhow::Result<()> {
+        // Start from the current timeline.
+        // 'timeline_owned' keeps the ancestor handle alive while 'timeline' borrows from it.
+        let mut timeline_owned;
+        let mut timeline = self;
+
+        // For debugging purposes, collect the path of layers that we traversed
+        // through. It's included in the error message if we fail to find the key.
+        let mut traversal_path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new();
+
+        // LSN of the cached page image passed in by the caller, or 0 if there is none.
+        let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img {
+            *cached_lsn
+        } else {
+            Lsn(0)
+        };
+
+        // 'prev_lsn' tracks the last LSN that we were at in our search. It's used
+        // to check that each iteration make some progress, to break infinite
+        // looping if something goes wrong.
+        let mut prev_lsn = Lsn(u64::MAX);
+
+        let mut result = ValueReconstructResult::Continue;
+        // 'cont_lsn' is the exclusive upper bound of the LSN range still to be
+        // searched (it is used as the end of 'lsn_floor..cont_lsn' below), hence
+        // the + 1.
+        let mut cont_lsn = Lsn(request_lsn.0 + 1);
+
+        'outer: loop {
+            // The function should have updated 'state'
+            //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
+            match result {
+                ValueReconstructResult::Complete => return Ok(()),
+                ValueReconstructResult::Continue => {
+                    // If we reached an earlier cached page image, we're done.
+                    if cont_lsn == cached_lsn + 1 {
+                        self.materialized_page_cache_hit_counter.inc_by(1);
+                        return Ok(());
+                    }
+                    if prev_lsn <= cont_lsn {
+                        // Didn't make any progress in last iteration. Error out to avoid
+                        // getting stuck in the loop.
+                        return layer_traversal_error(format!(
+                            "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
+                            key,
+                            Lsn(cont_lsn.0 - 1),
+                            request_lsn,
+                            timeline.ancestor_lsn
+                        ), traversal_path);
+                    }
+                    prev_lsn = cont_lsn;
+                }
+                ValueReconstructResult::Missing => {
+                    return layer_traversal_error(
+                        format!(
+                            "could not find data for key {} at LSN {}, for request at LSN {}",
+                            key, cont_lsn, request_lsn
+                        ),
+                        traversal_path,
+                    );
+                }
+            }
+
+            // Recurse into ancestor if needed
+            if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
+                trace!(
+                    "going into ancestor {}, cont_lsn is {}",
+                    timeline.ancestor_lsn,
+                    cont_lsn
+                );
+                let ancestor = timeline.get_ancestor_timeline()?;
+                timeline_owned = ancestor;
+                timeline = &*timeline_owned;
+                // Reset the progress check: switching timelines does not change 'cont_lsn'.
+                prev_lsn = Lsn(u64::MAX);
+                continue;
+            }
+
+            let layers = timeline.layers.read().unwrap();
+
+            // Check the open and frozen in-memory layers first, in order from newest
+            // to oldest.
+            if let Some(open_layer) = &layers.open_layer {
+                let start_lsn = open_layer.get_lsn_range().start;
+                if cont_lsn > start_lsn {
+                    //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
+                    // Get all the data needed to reconstruct the page version from this layer.
+                    // But if we have an older cached page image, no need to go past that.
+                    let lsn_floor = max(cached_lsn + 1, start_lsn);
+                    result = open_layer.get_value_reconstruct_data(
+                        key,
+                        lsn_floor..cont_lsn,
+                        reconstruct_state,
+                    )?;
+                    cont_lsn = lsn_floor;
+                    traversal_path.push((result, cont_lsn, open_layer.clone()));
+                    continue;
+                }
+            }
+            for frozen_layer in layers.frozen_layers.iter().rev() {
+                let start_lsn = frozen_layer.get_lsn_range().start;
+                if cont_lsn > start_lsn {
+                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
+                    let lsn_floor = max(cached_lsn + 1, start_lsn);
+                    result = frozen_layer.get_value_reconstruct_data(
+                        key,
+                        lsn_floor..cont_lsn,
+                        reconstruct_state,
+                    )?;
+                    cont_lsn = lsn_floor;
+                    traversal_path.push((result, cont_lsn, frozen_layer.clone()));
+                    continue 'outer;
+                }
+            }
+
+            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? {
+                //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display());
+
+                let lsn_floor = max(cached_lsn + 1, lsn_floor);
+                result = layer.get_value_reconstruct_data(
+                    key,
+                    lsn_floor..cont_lsn,
+                    reconstruct_state,
+                )?;
+                cont_lsn = lsn_floor;
+                traversal_path.push((result, cont_lsn, layer));
+            } else if timeline.ancestor_timeline.is_some() {
+                // Nothing on this timeline. Traverse to parent
+                result = ValueReconstructResult::Continue;
+                cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
+            } else {
+                // Nothing found
+                result = ValueReconstructResult::Missing;
+            }
+        }
+    }
+
+    /// Look up 'key' in the materialized page cache.
+    ///
+    /// Returns the LSN reported by the cache together with a copy of the cached
+    /// page bytes, or None if the cache lookup returns None.
+    fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> {
+        let cache = page_cache::get();
+
+        // FIXME: It's pointless to check the cache for things that are not 8kB pages.
+        // We should look at the key to determine if it's a cacheable object
+        let (lsn, read_guard) =
+            cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?;
+        // Copy the bytes out so that the result does not borrow from the read guard.
+        let img = Bytes::from(read_guard.to_vec());
+        Some((lsn, img))
+    }
+
+    /// Returns a new handle to the ancestor timeline.
+    ///
+    /// Fails if this timeline has no ancestor, or if ensure_loaded() fails on
+    /// the ancestor entry.
+    ///
+    /// NOTE(review): the return type `Result>` has lost its generic arguments in
+    /// this copy of the patch -- confirm against the repository.
+    fn get_ancestor_timeline(&self) -> Result> {
+        let ancestor = self
+            .ancestor_timeline
+            .as_ref()
+            .with_context(|| {
+                format!(
+                    "Ancestor is missing. Timeline id: {} Ancestor id {:?}",
+                    self.timeline_id,
+                    self.get_ancestor_timeline_id(),
+                )
+            })?
+            .ensure_loaded()
+            .with_context(|| {
+                format!(
+                    "Ancestor timeline is not loaded. Timeline id: {} Ancestor id {:?}",
+                    self.timeline_id,
+                    self.get_ancestor_timeline_id(),
+                )
+            })?;
+        Ok(Arc::clone(ancestor))
+    }
+
+    ///
+    /// Get a handle to the latest layer for appending.
+    ///
+    /// Returns the currently open in-memory layer, or creates a new one starting
+    /// at 'next_open_layer_at' if none is open. Fails if 'lsn' is not aligned,
+    /// if it is not greater than the last record LSN, or if the open layer
+    /// starts after 'lsn'. Panics if no layer is open and 'next_open_layer_at'
+    /// is None.
+    ///
+    fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> {
+        let mut layers = self.layers.write().unwrap();
+
+        ensure!(lsn.is_aligned());
+
+        let last_record_lsn = self.get_last_record_lsn();
+        ensure!(
+            lsn > last_record_lsn,
+            "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
+            lsn,
+            last_record_lsn,
+        );
+
+        // Do we have a layer open for writing already?
+        let layer;
+        if let Some(open_layer) = &layers.open_layer {
+            if open_layer.get_lsn_range().start > lsn {
+                bail!("unexpected open layer in the future");
+            }
+
+            layer = Arc::clone(open_layer);
+        } else {
+            // No writeable layer yet. Create one.
+            let start_lsn = layers.next_open_layer_at.unwrap();
+
+            trace!(
+                "creating layer for write at {}/{} for record at {}",
+                self.timeline_id,
+                start_lsn,
+                lsn
+            );
+            let new_layer =
+                InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?;
+            let layer_rc = Arc::new(new_layer);
+
+            layers.open_layer = Some(Arc::clone(&layer_rc));
+            layers.next_open_layer_at = None;
+
+            layer = layer_rc;
+        }
+        Ok(layer)
+    }
+
+    /// Write 'val' for 'key' at 'lsn' into the layer returned by get_layer_for_write().
+    fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
+        //info!("PUT: key {} at {}", key, lsn);
+        let layer = self.get_layer_for_write(lsn)?;
+        layer.put_value(key, lsn, val)?;
+        Ok(())
+    }
+
+    /// Record a tombstone for 'key_range' at 'lsn' in the layer returned by
+    /// get_layer_for_write().
+    fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> Result<()> {
+        let layer = self.get_layer_for_write(lsn)?;
+        layer.put_tombstone(key_range, lsn)?;
+
+        Ok(())
+    }
+
+    /// Publish 'new_lsn' as the last record LSN: updates the 'last_record_gauge'
+    /// metric and advances 'last_record_lsn'. Panics if 'new_lsn' is not aligned.
+    fn finish_write(&self, new_lsn: Lsn) {
+        assert!(new_lsn.is_aligned());
+
+        self.last_record_gauge.set(new_lsn.0 as i64);
+        self.last_record_lsn.advance(new_lsn);
+    }
+
+    /// Freeze the open in-memory layer, if there is one, and move it to the back
+    /// of 'frozen_layers'.
+    ///
+    /// 'write_lock' is acquired here only when 'write_lock_held' is false. When
+    /// it is true the lock is not taken; presumably the caller already holds
+    /// it -- verify at the call sites.
+    fn freeze_inmem_layer(&self, write_lock_held: bool) {
+        // Freeze the current open in-memory layer. It will be written to disk on next
+        // iteration.
+        let _write_guard = if write_lock_held {
+            None
+        } else {
+            Some(self.write_lock.lock().unwrap())
+        };
+        let mut layers = self.layers.write().unwrap();
+        if let Some(open_layer) = &layers.open_layer {
+            let open_layer_rc = Arc::clone(open_layer);
+            // Does this layer need freezing?
+            let end_lsn = Lsn(self.get_last_record_lsn().0 + 1);
+            open_layer.freeze(end_lsn);
+
+            // The layer is no longer open, update the layer map to reflect this.
+            // We will replace it with on-disk historics below.
+            layers.frozen_layers.push_back(open_layer_rc);
+            layers.open_layer = None;
+            layers.next_open_layer_at = Some(end_lsn);
+            self.last_freeze_at.store(end_lsn);
+        }
+        drop(layers);
+    }
+
+    ///
+    /// Check if more than 'checkpoint_distance' of WAL has been accumulated
+    /// in the in-memory layer, and initiate flushing it if so.
+    ///
+    /// Does nothing if there is no open in-memory layer. Otherwise the open layer
+    /// is frozen when the LSN distance since 'last_freeze_at' is at least the
+    /// checkpoint distance, or when the layer's size exceeds it. After freezing,
+    /// a layer flush thread is spawned, but only if 'layer_flush_lock' can be
+    /// taken with try_lock() at that moment.
+    ///
+    /// Errors from querying the open layer's size and from spawning the thread
+    /// are propagated.
+    ///
+    /// The receiver is an `Arc` because the spawned thread needs its own handle
+    /// to the timeline.
+    ///
+    /// NOTE(review): freeze_inmem_layer(true) is called below, so 'write_lock' is
+    /// not taken here; presumably the caller already holds it -- verify at the
+    /// call sites.
+    ///
+    pub fn check_checkpoint_distance(self: &Arc<Self>) -> Result<()> {
+        let last_lsn = self.get_last_record_lsn();
+        let layers = self.layers.read().unwrap();
+        if let Some(open_layer) = &layers.open_layer {
+            let open_layer_size = open_layer.size()?;
+            // Release the layer map before freezing: freeze_inmem_layer() takes the
+            // write lock on it.
+            drop(layers);
+            let distance = last_lsn.widening_sub(self.last_freeze_at.load());
+            // Checkpointing the open layer can be triggered by layer size or LSN range.
+            // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
+            // we want to stay below that with a big margin. The LSN distance determines how
+            // much WAL the safekeepers need to store.
+            if distance >= self.get_checkpoint_distance().into()
+                || open_layer_size > self.get_checkpoint_distance()
+            {
+                info!(
+                    "check_checkpoint_distance {}, layer size {}",
+                    distance, open_layer_size
+                );
+
+                self.freeze_inmem_layer(true);
+                // NOTE(review): freeze_inmem_layer() has just stored
+                // last_record_lsn + 1 in 'last_freeze_at'; this overwrites it with
+                // the LSN sampled at the top of this function.
+                self.last_freeze_at.store(last_lsn);
+
+                // Launch a thread to flush the frozen layer to disk, unless
+                // a thread was already running. (If the thread was running
+                // at the time that we froze the layer, it must've seen the
+                // the layer we just froze before it exited; see comments
+                // in flush_frozen_layers())
+                if let Ok(guard) = self.layer_flush_lock.try_lock() {
+                    // The lock was only probed; the flush thread takes it itself.
+                    drop(guard);
+                    let self_clone = Arc::clone(self);
+                    thread_mgr::spawn(
+                        thread_mgr::ThreadKind::LayerFlushThread,
+                        Some(self.tenant_id),
+                        Some(self.timeline_id),
+                        "layer flush thread",
+                        false,
+                        move || self_clone.flush_frozen_layers(false),
+                    )?;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Flush all frozen layers to disk.
+    ///
+    /// Only one thread at a time can be doing layer-flushing for a
+    /// given timeline. If 'wait' is true, and another thread is
+    /// currently doing the flushing, this function will wait for it
+    /// to finish. If 'wait' is false, this function will return
+    /// immediately instead.
+ fn flush_frozen_layers(&self, wait: bool) -> Result<()> { + let flush_lock_guard = if wait { + self.layer_flush_lock.lock().unwrap() + } else { + match self.layer_flush_lock.try_lock() { + Ok(guard) => guard, + Err(TryLockError::WouldBlock) => return Ok(()), + Err(TryLockError::Poisoned(err)) => panic!("{:?}", err), + } + }; + + let timer = self.flush_time_histo.start_timer(); + + loop { + let layers = self.layers.read().unwrap(); + if let Some(frozen_layer) = layers.frozen_layers.front() { + let frozen_layer = Arc::clone(frozen_layer); + drop(layers); // to allow concurrent reads and writes + self.flush_frozen_layer(frozen_layer)?; + } else { + // Drop the 'layer_flush_lock' *before* 'layers'. That + // way, if you freeze a layer, and then call + // flush_frozen_layers(false), it is guaranteed that + // if another thread was busy flushing layers and the + // call therefore returns immediately, the other + // thread will have seen the newly-frozen layer and + // will flush that too (assuming no errors). + drop(flush_lock_guard); + drop(layers); + break; + } + } + + timer.stop_and_record(); + + Ok(()) + } + + /// Flush one frozen in-memory layer to disk, as a new delta layer. + fn flush_frozen_layer(&self, frozen_layer: Arc) -> Result<()> { + // As a special case, when we have just imported an image into the repository, + // instead of writing out a L0 delta layer, we directly write out image layer + // files instead. This is possible as long as *all* the data imported into the + // repository have the same LSN. + let lsn_range = frozen_layer.get_lsn_range(); + let layer_paths_to_upload = + if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { + let (partitioning, _lsn) = + self.repartition(self.initdb_lsn, self.get_compaction_target_size())?; + self.create_image_layers(&partitioning, self.initdb_lsn, true)? + } else { + // normal case, write out a L0 delta layer file. 
+ let delta_path = self.create_delta_layer(&frozen_layer)?; + HashSet::from([delta_path]) + }; + + fail_point!("flush-frozen-before-sync"); + + // The new on-disk layers are now in the layer map. We can remove the + // in-memory layer from the map now. + { + let mut layers = self.layers.write().unwrap(); + let l = layers.frozen_layers.pop_front(); + + // Only one thread may call this function at a time (for this + // timeline). If two threads tried to flush the same frozen + // layer to disk at the same time, that would not work. + assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); + + // release lock on 'layers' + } + + fail_point!("checkpoint-after-sync"); + + // Update the metadata file, with new 'disk_consistent_lsn' + // + // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing + // *all* the layers, to avoid fsyncing the file multiple times. + let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); + self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?; + + Ok(()) + } + + /// Update metadata file + fn update_disk_consistent_lsn( + &self, + disk_consistent_lsn: Lsn, + layer_paths_to_upload: HashSet, + ) -> Result<()> { + // If we were able to advance 'disk_consistent_lsn', save it the metadata file. + // After crash, we will restart WAL streaming and processing from that point. + let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); + if disk_consistent_lsn != old_disk_consistent_lsn { + assert!(disk_consistent_lsn > old_disk_consistent_lsn); + + // We can only save a valid 'prev_record_lsn' value on disk if we + // flushed *all* in-memory changes to disk. We only track + // 'prev_record_lsn' in memory for the latest processed record, so we + // don't remember what the correct value that corresponds to some old + // LSN is. But if we flush everything, then the value corresponding + // current 'last_record_lsn' is correct and we can store it on disk. 
+ let RecordLsn { + last: last_record_lsn, + prev: prev_record_lsn, + } = self.last_record_lsn.load(); + let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { + Some(prev_record_lsn) + } else { + None + }; + + let ancestor_timelineid = self + .ancestor_timeline + .as_ref() + .map(LayeredTimelineEntry::timeline_id); + + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + ondisk_prev_record_lsn, + ancestor_timelineid, + self.ancestor_lsn, + *self.latest_gc_cutoff_lsn.read().unwrap(), + self.initdb_lsn, + ); + + fail_point!("checkpoint-before-saving-metadata", |x| bail!( + "{}", + x.unwrap() + )); + + save_metadata( + self.conf, + self.timeline_id, + self.tenant_id, + &metadata, + false, + )?; + + if self.upload_layers.load(atomic::Ordering::Relaxed) { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + layer_paths_to_upload, + Some(metadata), + ); + } + + // Also update the in-memory copy + self.disk_consistent_lsn.store(disk_consistent_lsn); + } + + Ok(()) + } + + // Write out the given frozen in-memory layer as a new L0 delta file + fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result { + // Write it out + let new_delta = frozen_layer.write_to_disk()?; + let new_delta_path = new_delta.path(); + + // Sync it to disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // TODO: If we're running inside 'flush_frozen_layers' and there are multiple + // files to flush, it might be better to first write them all, and then fsync + // them all in parallel. 
+ par_fsync::par_fsync(&[ + new_delta_path.clone(), + self.conf.timeline_path(&self.timeline_id, &self.tenant_id), + ])?; + + // Add it to the layer map + { + let mut layers = self.layers.write().unwrap(); + layers.insert_historic(Arc::new(new_delta)); + } + + // update the timeline's physical size + let sz = new_delta_path.metadata()?.len(); + self.current_physical_size_gauge.add(sz); + // update metrics + NUM_PERSISTENT_FILES_CREATED.inc_by(1); + PERSISTENT_BYTES_WRITTEN.inc_by(sz); + + Ok(new_delta_path) + } + + pub fn compact(&self) -> Result<()> { + // + // High level strategy for compaction / image creation: + // + // 1. First, calculate the desired "partitioning" of the + // currently in-use key space. The goal is to partition the + // key space into roughly fixed-size chunks, but also take into + // account any existing image layers, and try to align the + // chunk boundaries with the existing image layers to avoid + // too much churn. Also try to align chunk boundaries with + // relation boundaries. In principle, we don't know about + // relation boundaries here, we just deal with key-value + // pairs, and the code in pgdatadir_mapping.rs knows how to + // map relations into key-value pairs. But in practice we know + // that 'field6' is the block number, and the fields 1-5 + // identify a relation. This is just an optimization, + // though. + // + // 2. Once we know the partitioning, for each partition, + // decide if it's time to create a new image layer. The + // criteria is: there has been too much "churn" since the last + // image layer? The "churn" is fuzzy concept, it's a + // combination of too many delta files, or too much WAL in + // total in the delta file. Or perhaps: if creating an image + // file would allow to delete some older files. + // + // 3. After that, we compact all level0 delta files if there + // are too many of them. 
While compacting, we also garbage + // collect any page versions that are no longer needed because + // of the new image layers we created in step 2. + // + // TODO: This high level strategy hasn't been implemented yet. + // Below are functions compact_level0() and create_image_layers() + // but they are a bit ad hoc and don't quite work like it's explained + // above. Rewrite it. + let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); + + let target_file_size = self.get_checkpoint_distance(); + + // Define partitioning schema if needed + + match self.repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + ) { + Ok((partitioning, lsn)) => { + // 2. Create new image layers for partitions that have been modified + // "enough". + let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + if !layer_paths_to_upload.is_empty() + && self.upload_layers.load(atomic::Ordering::Relaxed) + { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + HashSet::from_iter(layer_paths_to_upload), + None, + ); + } + + // 3. Compact + let timer = self.compact_time_histo.start_timer(); + self.compact_level0(target_file_size)?; + timer.stop_and_record(); + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. 
+ error!("could not compact, repartitioning keyspace failed: {err:?}"); + } + }; + + Ok(()) + } + + fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> { + let mut partitioning_guard = self.partitioning.lock().unwrap(); + if partitioning_guard.1 == Lsn(0) + || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold + { + let keyspace = self.collect_keyspace(lsn)?; + let partitioning = keyspace.partition(partition_size); + *partitioning_guard = (partitioning, lsn); + return Ok((partitioning_guard.0.clone(), lsn)); + } + Ok((partitioning_guard.0.clone(), partitioning_guard.1)) + } + + // Is it time to create a new image layer for the given partition? + fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result { + let layers = self.layers.read().unwrap(); + + for part_range in &partition.ranges { + let image_coverage = layers.image_coverage(part_range, lsn)?; + for (img_range, last_img) in image_coverage { + let img_lsn = if let Some(last_img) = last_img { + last_img.get_lsn_range().end + } else { + Lsn(0) + }; + // Let's consider an example: + // + // delta layer with LSN range 71-81 + // delta layer with LSN range 81-91 + // delta layer with LSN range 91-101 + // image layer at LSN 100 + // + // If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer, + // there's no need to create a new one. We check this case explicitly, to avoid passing + // a bogus range to count_deltas below, with start > end. It's even possible that there + // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed + // after we read last_record_lsn, which is passed here in the 'lsn' argument. 
+ if img_lsn < lsn { + let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; + + debug!( + "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", + img_range.start, img_range.end, num_deltas, img_lsn, lsn + ); + if num_deltas >= self.get_image_creation_threshold() { + return Ok(true); + } + } + } + } + + Ok(false) + } + + fn create_image_layers( + &self, + partitioning: &KeyPartitioning, + lsn: Lsn, + force: bool, + ) -> Result> { + let timer = self.create_images_time_histo.start_timer(); + let mut image_layers: Vec = Vec::new(); + let mut layer_paths_to_upload = HashSet::new(); + for partition in partitioning.parts.iter() { + if force || self.time_for_new_image_layer(partition, lsn)? { + let img_range = + partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_id, + &img_range, + lsn, + )?; + + for range in &partition.ranges { + let mut key = range.start; + while key < range.end { + let img = self.get(key, lsn)?; + image_layer_writer.put_image(key, &img)?; + key = key.next(); + } + } + let image_layer = image_layer_writer.finish()?; + layer_paths_to_upload.insert(image_layer.path()); + image_layers.push(image_layer); + } + } + + // Sync the new layer to disk before adding it to the layer map, to make sure + // we don't garbage collect something based on the new layer, before it has + // reached the disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // Compaction creates multiple image layers. It would be better to create them all + // and fsync them all in parallel. 
+ let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone()); + all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); + par_fsync::par_fsync(&all_paths)?; + + let mut layers = self.layers.write().unwrap(); + for l in image_layers { + self.current_physical_size_gauge + .add(l.path().metadata()?.len()); + layers.insert_historic(Arc::new(l)); + } + drop(layers); + timer.stop_and_record(); + + Ok(layer_paths_to_upload) + } + + /// + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as + /// Level 1 files. + /// + fn compact_level0(&self, target_file_size: u64) -> Result<()> { + let layers = self.layers.read().unwrap(); + let mut level0_deltas = layers.get_level0_deltas()?; + drop(layers); + + // Only compact if enough layers have accumulated. + if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() { + return Ok(()); + } + + // Gather the files to compact in this iteration. + // + // Start with the oldest Level 0 delta file, and collect any other + // level 0 files that form a contiguous sequence, such that the end + // LSN of previous file matches the start LSN of the next file. + // + // Note that if the files don't form such a sequence, we might + // "compact" just a single file. That's a bit pointless, but it allows + // us to get rid of the level 0 file, and compact the other files on + // the next iteration. This could probably be made smarter, but such + // "gaps" in the sequence of level 0 files should only happen in case + // of a crash, partial download from cloud storage, or something like + // that, so it's not a big deal in practice. 
+ level0_deltas.sort_by_key(|l| l.get_lsn_range().start); + let mut level0_deltas_iter = level0_deltas.iter(); + + let first_level0_delta = level0_deltas_iter.next().unwrap(); + let mut prev_lsn_end = first_level0_delta.get_lsn_range().end; + let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)]; + for l in level0_deltas_iter { + let lsn_range = l.get_lsn_range(); + + if lsn_range.start != prev_lsn_end { + break; + } + deltas_to_compact.push(Arc::clone(l)); + prev_lsn_end = lsn_range.end; + } + let lsn_range = Range { + start: deltas_to_compact.first().unwrap().get_lsn_range().start, + end: deltas_to_compact.last().unwrap().get_lsn_range().end, + }; + + info!( + "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", + lsn_range.start, + lsn_range.end, + deltas_to_compact.len(), + level0_deltas.len() + ); + for l in deltas_to_compact.iter() { + info!("compact includes {}", l.filename().display()); + } + // We don't need the original list of layers anymore. Drop it so that + // we don't accidentally use it later in the function. + drop(level0_deltas); + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. 
+ let all_values_iter = deltas_to_compact + .iter() + .map(|l| l.iter()) + .kmerge_by(|a, b| { + if let Ok((a_key, a_lsn, _)) = a { + if let Ok((b_key, b_lsn, _)) = b { + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + } else { + false + } + } else { + true + } + }); + + // This iterator walks through all keys and is needed to calculate size used by each key + let mut all_keys_iter = deltas_to_compact + .iter() + .map(|l| l.key_iter()) + .kmerge_by(|a, b| { + let (a_key, a_lsn, _) = a; + let (b_key, b_lsn, _) = b; + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + }); + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. + // It's possible that there is a single key with so many page versions that storing all of them in a single layer file + // would be too large. In that case, we also split on the LSN dimension. 
+ // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. + let mut new_layers = Vec::new(); + let mut prev_key: Option = None; + let mut writer: Option = None; + let mut key_values_total_size = 0u64; + let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key + let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key + for x in all_values_iter { + let (key, lsn, value) = x?; + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + // We need to check key boundaries once we reach next key or end of layer with the same key + if !same_key || lsn == dup_end_lsn { + let mut next_key_size = 0u64; + let is_dup_layer = dup_end_lsn.is_valid(); + dup_start_lsn = Lsn::INVALID; + if !same_key { + dup_end_lsn = Lsn::INVALID; + } + // Determine size occupied by this key. 
We stop at next key, or when size becomes larger than target_file_size + for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { + next_key_size = next_size; + if key != next_key { + if dup_end_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + break; + } + key_values_total_size += next_size; + if key_values_total_size > target_file_size { + // split key between multiple layers: such a layer can contain only a single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn + } else { + lsn + }; + dup_end_lsn = next_lsn; + break; + } + } + // handle case when loop reaches last key + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + // check whether this key would cause the layer to overflow + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + { + new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?); + writer = None; + } + } + key_values_total_size = next_key_size; + } + if writer.is_none() { + writer = Some(DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_id, + key, + if dup_end_lsn.is_valid() { + // this is a layer containing a slice of the values of a single key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, + )?); + } + writer.as_mut().unwrap().put_value(key, lsn, value)?; + prev_key = Some(key); + } + if let Some(writer) = writer { + new_layers.push(writer.finish(prev_key.unwrap().next())?); + } + + // Sync layers + if !new_layers.is_empty() { + let mut layer_paths: Vec = new_layers.iter().map(|l| l.path()).collect(); + + // also sync the directory + layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); + + // Fsync all 
the layer files and directory using multiple threads to + // minimize latency. + par_fsync::par_fsync(&layer_paths)?; + + layer_paths.pop().unwrap(); + } + + let mut layers = self.layers.write().unwrap(); + let mut new_layer_paths = HashSet::with_capacity(new_layers.len()); + for l in new_layers { + let new_delta_path = l.path(); + + // update the timeline's physical size + self.current_physical_size_gauge + .add(new_delta_path.metadata()?.len()); + + new_layer_paths.insert(new_delta_path); + layers.insert_historic(Arc::new(l)); + } + + // Now that we have reshuffled the data to set of new delta layers, we can + // delete the old ones + let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len()); + drop(all_keys_iter); + for l in deltas_to_compact { + if let Some(path) = l.local_path() { + self.current_physical_size_gauge.sub(path.metadata()?.len()); + layer_paths_do_delete.insert(path); + } + l.delete()?; + layers.remove_historic(l); + } + drop(layers); + + if self.upload_layers.load(atomic::Ordering::Relaxed) { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + new_layer_paths, + None, + ); + storage_sync::schedule_layer_delete( + self.tenant_id, + self.timeline_id, + layer_paths_do_delete, + ); + } + + Ok(()) + } + + /// Update information about which layer files need to be retained on + /// garbage collection. This is separate from actually performing the GC, + /// and is updated more frequently, so that compaction can remove obsolete + /// page versions more aggressively. + /// + /// TODO: that's wishful thinking, compaction doesn't actually do that + /// currently. 
+ /// + /// The caller specifies how much history is needed with the 3 arguments: + /// + /// retain_lsns: keep a version of each page at these LSNs + /// cutoff_horizon: also keep everything newer than this LSN + /// pitr: the time duration required to keep data for PITR + /// + /// The 'retain_lsns' list is currently used to prevent removing files that + /// are needed by child timelines. In the future, the user might be able to + /// name additional points in time to retain. The caller is responsible for + /// collecting that information. + /// + /// The 'cutoff_horizon' point is used to retain recent versions that might still be + /// needed by read-only nodes. (As of this writing, the caller just passes + /// the latest LSN subtracted by a constant, and doesn't do anything smart + /// to figure out what read-only nodes might actually need.) + /// + /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine + /// whether a record is needed for PITR. + pub fn update_gc_info( + &self, + retain_lsns: Vec, + cutoff_horizon: Lsn, + pitr: Duration, + ) -> Result<()> { + let mut gc_info = self.gc_info.write().unwrap(); + + gc_info.horizon_cutoff = cutoff_horizon; + gc_info.retain_lsns = retain_lsns; + + // Calculate pitr cutoff point. + // If we cannot determine a cutoff LSN, be conservative and don't GC anything. + let mut pitr_cutoff_lsn: Lsn; + + if pitr != Duration::ZERO { + // conservative, safe default is to remove nothing, when we have no + // commit timestamp data available + pitr_cutoff_lsn = *self.get_latest_gc_cutoff_lsn(); + + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. + // If we don't have enough data to convert to LSN, + // play safe and don't remove any layers. + let now = SystemTime::now(); + if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { + let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); + + match self.find_lsn_for_timestamp(pitr_timestamp)? 
{ + LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, + LsnForTimestamp::Future(lsn) => { + debug!("future({})", lsn); + pitr_cutoff_lsn = gc_info.horizon_cutoff; + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + } + } + debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) + } + } else { + // No time-based retention. (Some unit tests depend on garbage-collection + // working even when CLOG data is missing, so that find_lsn_for_timestamp() + // above doesn't work.) + pitr_cutoff_lsn = gc_info.horizon_cutoff; + } + gc_info.pitr_cutoff = pitr_cutoff_lsn; + + Ok(()) + } + + /// + /// Garbage collect layer files on a timeline that are no longer needed. + /// + /// Currently, we don't make any attempt at removing unneeded page versions + /// within a layer file. We can only remove the whole file if it's fully + /// obsolete. + /// + pub fn gc(&self) -> Result { + let mut result: GcResult = Default::default(); + let now = SystemTime::now(); + + fail_point!("before-timeline-gc"); + + let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); + + let gc_info = self.gc_info.read().unwrap(); + + let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); + let pitr_cutoff = gc_info.pitr_cutoff; + let retain_lsns = &gc_info.retain_lsns; + + let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + + // Nothing to GC. Return early. + let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); + if latest_gc_cutoff >= new_gc_cutoff { + info!( + "Nothing to GC for timeline {}: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", + self.timeline_id + ); + return Ok(result); + } + + let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered(); + + // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. + // See branch_timeline() for details. 
+ *self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff; + + info!("GC starting"); + + debug!("retain_lsns: {:?}", retain_lsns); + + let mut layers_to_remove = Vec::new(); + + // Scan all on-disk layers in the timeline. + // + // Garbage collect the layer if all conditions are satisfied: + // 1. it is older than cutoff LSN; + // 2. it is older than PITR interval; + // 3. it doesn't need to be retained for 'retain_lsns'; + // 4. newer on-disk image layers cover the layer's whole key range + // + let mut layers = self.layers.write().unwrap(); + 'outer: for l in layers.iter_historic_layers() { + // This layer is in the process of being flushed to disk. + // It will be swapped out of the layer map, replaced with + // on-disk layers containing the same data. + // We can't GC it, as it's not on disk. We can't remove it + // from the layer map yet, as it would make its data + // inaccessible. + if l.is_in_memory() { + continue; + } + + result.layers_total += 1; + + // 1. Is it newer than GC horizon cutoff point? + if l.get_lsn_range().end > horizon_cutoff { + debug!( + "keeping {} because it's newer than horizon_cutoff {}", + l.filename().display(), + horizon_cutoff + ); + result.layers_needed_by_cutoff += 1; + continue 'outer; + } + + // 2. Is it newer than PITR cutoff point? + if l.get_lsn_range().end > pitr_cutoff { + debug!( + "keeping {} because it's newer than pitr_cutoff {}", + l.filename().display(), + pitr_cutoff + ); + result.layers_needed_by_pitr += 1; + continue 'outer; + } + + // 3. Is it needed by a child branch? + // NOTE With that we would keep data that + // might be referenced by child branches forever. + // We can track this in child timeline GC and delete parent layers when + // they are no longer needed. This might be complicated with long inheritance chains. 
+ for retain_lsn in retain_lsns { + // start_lsn is inclusive + if &l.get_lsn_range().start <= retain_lsn { + debug!( + "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", + l.filename().display(), + retain_lsn, + l.is_incremental(), + ); + result.layers_needed_by_branches += 1; + continue 'outer; + } + } + + // 4. Is there a later on-disk layer for this relation? + // + // The end-LSN is exclusive, while disk_consistent_lsn is + // inclusive. For example, if disk_consistent_lsn is 100, it is + // OK for a delta layer to have end LSN 101, but if the end LSN + // is 102, then it might not have been fully flushed to disk + // before crash. + // + // For example, imagine that the following layers exist: + // + // 1000 - image (A) + // 1000-2000 - delta (B) + // 2000 - image (C) + // 2000-3000 - delta (D) + // 3000 - image (E) + // + // If GC horizon is at 2500, we can remove layers A and B, but + // we cannot remove C, even though it's older than 2500, because + // the delta layer 2000-3000 depends on it. + if !layers + .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? + { + debug!( + "keeping {} because it is the latest layer", + l.filename().display() + ); + result.layers_not_updated += 1; + continue 'outer; + } + + // We didn't find any reason to keep this file, so remove it. + debug!( + "garbage collecting {} is_dropped: xx is_incremental: {}", + l.filename().display(), + l.is_incremental(), + ); + layers_to_remove.push(Arc::clone(l)); + } + + // Actually delete the layers from disk and remove them from the map. + // (couldn't do this in the loop above, because you cannot modify a collection + // while iterating it. 
BTreeMap::retain() would be another option) + let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); + for doomed_layer in layers_to_remove { + if let Some(path) = doomed_layer.local_path() { + self.current_physical_size_gauge.sub(path.metadata()?.len()); + layer_paths_to_delete.insert(path); + } + doomed_layer.delete()?; + layers.remove_historic(doomed_layer); + result.layers_removed += 1; + } + + if self.upload_layers.load(atomic::Ordering::Relaxed) { + storage_sync::schedule_layer_delete( + self.tenant_id, + self.timeline_id, + layer_paths_to_delete, + ); + } + + result.elapsed = now.elapsed()?; + Ok(result) + } + + /// + /// Reconstruct a value, using the given base image and WAL records in 'data'. + /// + fn reconstruct_value( + &self, + key: Key, + request_lsn: Lsn, + mut data: ValueReconstructState, + ) -> Result { + // Perform WAL redo if needed + data.records.reverse(); + + // If we have a page image, and no WAL, we're all set + if data.records.is_empty() { + if let Some((img_lsn, img)) = &data.img { + trace!( + "found page image for key {} at {}, no WAL redo required", + key, + img_lsn + ); + Ok(img.clone()) + } else { + bail!("base image for {} at {} not found", key, request_lsn); + } + } else { + // We need to do WAL redo. 
+ // + // If we don't have a base image, then the oldest WAL record better initialize + // the page + if data.img.is_none() && !data.records.first().unwrap().1.will_init() { + bail!( + "Base image for {} at {} not found, but got {} WAL records", + key, + request_lsn, + data.records.len() + ); + } else { + let base_img = if let Some((_lsn, img)) = data.img { + trace!( + "found {} WAL records and a base image for {} at {}, performing WAL redo", + data.records.len(), + key, + request_lsn + ); + Some(img) + } else { + trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); + None + }; + + let last_rec_lsn = data.records.last().unwrap().0; + + let img = + self.walredo_mgr + .request_redo(key, request_lsn, base_img, data.records)?; + + if img.len() == page_cache::PAGE_SZ { + let cache = page_cache::get(); + cache.memorize_materialized_page( + self.tenant_id, + self.timeline_id, + key, + last_rec_lsn, + &img, + ); + } + + Ok(img) + } + } + } +} + +/// Helper function for get_reconstruct_data() to add the path of layers traversed +/// to an error, as anyhow context information. +fn layer_traversal_error( + msg: String, + path: Vec<(ValueReconstructResult, Lsn, Arc)>, +) -> anyhow::Result<()> { + // We want the original 'msg' to be the outermost context. The outermost context + // is the most high-level information, which also gets propagated to the client. + let mut msg_iter = path + .iter() + .map(|(r, c, l)| { + format!( + "layer traversal: result {:?}, cont_lsn {}, layer: {}", + r, + c, + l.filename().display() + ) + }) + .chain(std::iter::once(msg)); + // Construct initial message from the first traversed layer + let err = anyhow!(msg_iter.next().unwrap()); + + // Append all subsequent traversals, and the error message 'msg', as contexts. 
+ Err(msg_iter.fold(err, |err, msg| err.context(msg))) +} + +struct LayeredTimelineWriter<'a> { + tl: &'a LayeredTimeline, + _write_guard: MutexGuard<'a, ()>, +} + +impl Deref for LayeredTimelineWriter<'_> { + type Target = dyn Timeline; + + fn deref(&self) -> &Self::Target { + self.tl + } +} + +impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { + fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> { + self.tl.put_value(key, lsn, value) + } + + fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()> { + self.tl.put_tombstone(key_range, lsn) + } + + /// + /// Remember the (end of) last valid WAL record remembered in the timeline. + /// + fn finish_write(&self, new_lsn: Lsn) { + self.tl.finish_write(new_lsn); + } + + fn update_current_logical_size(&self, delta: isize) { + self.tl + .current_logical_size + .fetch_add(delta, AtomicOrdering::SeqCst); + } +} + +/// Add a suffix to a layer file's name: .{num}.old +/// Uses the first available num (starts at 0) +fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { + let filename = path + .file_name() + .ok_or_else(|| anyhow!("Path {} don't have a file name", path.display()))? + .to_string_lossy(); + let mut new_path = path.clone(); + + for i in 0u32.. 
{ + new_path.set_file_name(format!("{}.{}.old", filename, i)); + if !new_path.exists() { + std::fs::rename(&path, &new_path)?; + return Ok(()); + } + } + + bail!("couldn't find an unused backup number for {:?}", path) +} + +/// Save timeline metadata to file +pub fn save_metadata( + conf: &'static PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, + data: &TimelineMetadata, + first_save: bool, +) -> Result<()> { + let _enter = info_span!("saving metadata").entered(); + let path = metadata_path(conf, timelineid, tenantid); + // use OpenOptions to ensure file presence is consistent with first_save + let mut file = VirtualFile::open_with_options( + &path, + OpenOptions::new().write(true).create_new(first_save), + )?; + + let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; + + if file.write(&metadata_bytes)? != metadata_bytes.len() { + bail!("Could not write all the metadata bytes in a single call"); + } + file.sync_all()?; + + // fsync the parent directory to ensure the directory entry is durable + if first_save { + let timeline_dir = File::open( + &path + .parent() + .expect("Metadata should always have a parent dir"), + )?; + timeline_dir.sync_all()?; + } + + Ok(()) +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c9c00d75e2..4ecb181553 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -63,8 +63,7 @@ pub enum CheckpointConfig { } pub type RepositoryImpl = LayeredRepository; - -pub type DatadirTimelineImpl = DatadirTimeline; +pub type TimelineImpl = ::Timeline; pub fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint thread. 
This prevents new connections from diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 078edc5c9f..c8aa4b35e8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -30,7 +30,6 @@ use utils::{ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; -use crate::layered_repository::LayeredRepository; use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp}; use crate::profiling::profpoint_start; use crate::reltag::RelTag; @@ -555,9 +554,6 @@ impl PageServerHandler { info!("creating new timeline"); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?; - let repartition_distance = repo.get_checkpoint_distance(); - let mut datadir_timeline = - DatadirTimeline::::new(timeline, repartition_distance); // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -573,7 +569,7 @@ impl PageServerHandler { info!("importing basebackup"); pgb.write_message(&BeMessage::CopyInResponse)?; let reader = CopyInReader::new(pgb); - import_basebackup_from_tar(&mut datadir_timeline, reader, base_lsn)?; + import_basebackup_from_tar(&*timeline, reader, base_lsn)?; // TODO check checksum // Meanwhile you can verify client-side by taking fullbackup @@ -583,7 +579,7 @@ impl PageServerHandler { // Flush data to disk, then upload to s3 info!("flushing layers"); - datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?; + timeline.checkpoint(CheckpointConfig::Flush)?; info!("done"); Ok(()) @@ -605,10 +601,6 @@ impl PageServerHandler { let timeline = repo.get_timeline_load(timeline_id)?; ensure!(timeline.get_last_record_lsn() == start_lsn); - let repartition_distance = repo.get_checkpoint_distance(); - let mut datadir_timeline = - DatadirTimeline::::new(timeline, repartition_distance); - // TODO leave clean state on error. For now you can use detach to clean // up broken state from a failed import. @@ -616,16 +608,16 @@ impl PageServerHandler { info!("importing wal"); pgb.write_message(&BeMessage::CopyInResponse)?; let reader = CopyInReader::new(pgb); - import_wal_from_tar(&mut datadir_timeline, reader, start_lsn, end_lsn)?; + import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)?; // TODO Does it make sense to overshoot? - ensure!(datadir_timeline.tline.get_last_record_lsn() >= end_lsn); + ensure!(timeline.get_last_record_lsn() >= end_lsn); // Flush data to disk, then upload to s3. No need for a forced checkpoint. // We only want to persist the data, and it doesn't matter if it's in the // shape of deltas or images. 
info!("flushing layers"); - datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?; + timeline.checkpoint(CheckpointConfig::Flush)?; info!("done"); Ok(()) @@ -643,8 +635,8 @@ impl PageServerHandler { /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. - fn wait_or_get_last_lsn( - timeline: &DatadirTimeline, + fn wait_or_get_last_lsn( + timeline: &T, mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RwLockReadGuard, @@ -671,7 +663,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.tline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn)?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. (Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -681,7 +673,7 @@ impl PageServerHandler { if lsn == Lsn(0) { bail!("invalid LSN(0) in request"); } - timeline.tline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn)?; } ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -691,14 +683,14 @@ impl PageServerHandler { Ok(lsn) } - fn handle_get_rel_exists_request( + fn handle_get_rel_exists_request( &self, - timeline: &DatadirTimeline, + timeline: &T, req: &PagestreamExistsRequest, ) -> Result { let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; let exists = timeline.get_rel_exists(req.rel, lsn)?; @@ -708,13 +700,13 @@ impl PageServerHandler { })) } - fn handle_get_nblocks_request( + fn handle_get_nblocks_request( &self, - timeline: &DatadirTimeline, + timeline: &T, req: &PagestreamNblocksRequest, ) -> Result { let _enter = info_span!("get_nblocks", rel = %req.rel, 
req_lsn = %req.lsn).entered(); - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; let n_blocks = timeline.get_rel_size(req.rel, lsn)?; @@ -724,13 +716,13 @@ impl PageServerHandler { })) } - fn handle_db_size_request( + fn handle_db_size_request( &self, - timeline: &DatadirTimeline, + timeline: &T, req: &PagestreamDbSizeRequest, ) -> Result { let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered(); - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; let total_blocks = @@ -743,14 +735,14 @@ impl PageServerHandler { })) } - fn handle_get_page_at_lsn_request( + fn handle_get_page_at_lsn_request( &self, - timeline: &DatadirTimeline, + timeline: &T, req: &PagestreamGetPageRequest, ) -> Result { let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) .entered(); - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; /* // Add a 1s delay to some requests. 
The delayed causes the requests to @@ -783,7 +775,7 @@ impl PageServerHandler { // check that the timeline exists let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Cannot load local timeline")?; - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) @@ -921,7 +913,7 @@ impl postgres_backend::Handler for PageServerHandler { let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Cannot load local timeline")?; - let end_of_timeline = timeline.tline.get_last_record_rlsn(); + let end_of_timeline = timeline.get_last_record_rlsn(); pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::text_col(b"prev_lsn"), @@ -1139,7 +1131,7 @@ impl postgres_backend::Handler for PageServerHandler { let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Couldn't load timeline")?; - timeline.tline.compact()?; + timeline.compact()?; pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; @@ -1159,13 +1151,8 @@ impl postgres_backend::Handler for PageServerHandler { let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Cannot load local timeline")?; - timeline.tline.checkpoint(CheckpointConfig::Forced)?; - - // Also compact it. - // - // FIXME: This probably shouldn't be part of a "checkpoint" command, but a - // separate operation. Update the tests if you change this. - timeline.tline.compact()?; + // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). + timeline.checkpoint(CheckpointConfig::Forced)?; pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? 
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f696c1f411..61aca8d4ba 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,10 +6,10 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! -use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceAccum}; +use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::reltag::{RelTag, SlruKind}; +use crate::repository::Timeline; use crate::repository::*; -use crate::repository::{Repository, Timeline}; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; @@ -18,34 +18,12 @@ use postgres_ffi::{pg_constants, Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::ops::Range; -use std::sync::atomic::{AtomicIsize, Ordering}; -use std::sync::{Arc, Mutex, RwLockReadGuard}; -use tracing::{debug, error, trace, warn}; +use tracing::{debug, trace, warn}; use utils::{bin_ser::BeSer, lsn::Lsn}; /// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type. pub type BlockNumber = u32; -pub struct DatadirTimeline -where - R: Repository, -{ - /// The underlying key-value store. Callers should not read or modify the - /// data in the underlying store directly. However, it is exposed to have - /// access to information like last-LSN, ancestor, and operations like - /// compaction. - pub tline: Arc, - - /// When did we last calculate the partitioning? - partitioning: Mutex<(KeyPartitioning, Lsn)>, - - /// Configuration: how often should the partitioning be recalculated. - repartition_threshold: u64, - - /// Current logical size of the "datadir", at the last LSN. 
- current_logical_size: AtomicIsize, -} - #[derive(Debug)] pub enum LsnForTimestamp { Present(Lsn), @@ -54,49 +32,50 @@ pub enum LsnForTimestamp { NoData(Lsn), } -impl DatadirTimeline { - pub fn new(tline: Arc, repartition_threshold: u64) -> Self { - DatadirTimeline { - tline, - partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), - current_logical_size: AtomicIsize::new(0), - repartition_threshold, - } - } - - /// (Re-)calculate the logical size of the database at the latest LSN. - /// - /// This can be a slow operation. - pub fn init_logical_size(&self) -> Result<()> { - let last_lsn = self.tline.get_last_record_lsn(); - self.current_logical_size.store( - self.get_current_logical_size_non_incremental(last_lsn)? as isize, - Ordering::SeqCst, - ); - Ok(()) - } - +/// +/// This trait provides all the functionality to store PostgreSQL relations, SLRUs, +/// and other special kinds of files, in a versioned key-value store. The +/// Timeline trait provides the key-value store. +/// +/// This is a trait, so that we can easily include all these functions in a Timeline +/// implementation. You're not expected to have different implementations of this trait, +/// rather, this provides an interface and implementation, over Timeline. +/// +/// If you wanted to store other kinds of data in the Neon repository, e.g. +/// flat files or MySQL, you would create a new trait like this, with all the +/// functions that make sense for the kind of data you're storing. For flat files, +/// for example, you might have a function like "fn read(path, offset, size)". +/// We might also have that situation in the future, to support multiple PostgreSQL +/// versions, if there are big changes in how the data is organized in the data +/// directory, or if new special files are introduced. +/// +pub trait DatadirTimeline: Timeline { /// Start ingesting a WAL record, or other atomic modification of /// the timeline. 
/// /// This provides a transaction-like interface to perform a bunch - /// of modifications atomically, all stamped with one LSN. + /// of modifications atomically. /// - /// To ingest a WAL record, call begin_modification(lsn) to get a + /// To ingest a WAL record, call begin_modification() to get a /// DatadirModification object. Use the functions in the object to /// modify the repository state, updating all the pages and metadata - /// that the WAL record affects. When you're done, call commit() to - /// commit the changes. + /// that the WAL record affects. When you're done, call commit(lsn) to + /// commit the changes. All the changes will be stamped with the specified LSN. + /// + /// Calling commit(lsn) will flush all the changes and reset the state, + /// so the `DatadirModification` struct can be reused to perform the next modification. /// /// Note that any pending modifications you make through the /// modification object won't be visible to calls to the 'get' and list /// functions of the timeline until you finish! And if you update the /// same page twice, the last update wins. /// - pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification { + fn begin_modification(&self) -> DatadirModification + where + Self: Sized, + { DatadirModification { tline: self, - lsn, pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, @@ -108,7 +87,7 @@ impl DatadirTimeline { //------------------------------------------------------------------------------ /// Look up given page version. 
- pub fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { + fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); let nblocks = self.get_rel_size(tag, lsn)?; @@ -121,11 +100,11 @@ impl DatadirTimeline { } let key = rel_block_to_key(tag, blknum); - self.tline.get(key, lsn) + self.get(key, lsn) } // Get size of a database in blocks - pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { let mut total_blocks = 0; let rels = self.list_rels(spcnode, dbnode, lsn)?; @@ -138,7 +117,7 @@ impl DatadirTimeline { } /// Get size of a relation file - pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { + fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); if (tag.forknum == pg_constants::FSM_FORKNUM @@ -153,17 +132,17 @@ impl DatadirTimeline { } let key = rel_size_to_key(tag); - let mut buf = self.tline.get(key, lsn)?; + let mut buf = self.get(key, lsn)?; Ok(buf.get_u32_le()) } /// Does relation exist? - pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { + fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; let dir = RelDirectory::des(&buf)?; let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); @@ -172,10 +151,10 @@ impl DatadirTimeline { } /// Get a list of all existing relations in given tablespace and database. 
- pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { + fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; let dir = RelDirectory::des(&buf)?; let rels: HashSet = @@ -190,7 +169,7 @@ impl DatadirTimeline { } /// Look up given SLRU page version. - pub fn get_slru_page_at_lsn( + fn get_slru_page_at_lsn( &self, kind: SlruKind, segno: u32, @@ -198,26 +177,21 @@ impl DatadirTimeline { lsn: Lsn, ) -> Result { let key = slru_block_to_key(kind, segno, blknum); - self.tline.get(key, lsn) + self.get(key, lsn) } /// Get size of an SLRU segment - pub fn get_slru_segment_size( - &self, - kind: SlruKind, - segno: u32, - lsn: Lsn, - ) -> Result { + fn get_slru_segment_size(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.tline.get(key, lsn)?; + let mut buf = self.get(key, lsn)?; Ok(buf.get_u32_le()) } /// Get size of an SLRU segment - pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { + fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; let dir = SlruSegmentDirectory::des(&buf)?; let exists = dir.segments.get(&segno).is_some(); @@ -231,10 +205,10 @@ impl DatadirTimeline { /// so it's not well defined which LSN you get if there were multiple commits /// "in flight" at that point in time. 
/// - pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { - let gc_cutoff_lsn_guard = self.tline.get_latest_gc_cutoff_lsn(); + fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { + let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; - let max_lsn = self.tline.get_last_record_lsn(); + let max_lsn = self.get_last_record_lsn(); // LSNs are always 8-byte aligned. low/mid/high represent the // LSN divided by 8. @@ -325,88 +299,51 @@ impl DatadirTimeline { } /// Get a list of SLRU segments - pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { + fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; let dir = SlruSegmentDirectory::des(&buf)?; Ok(dir.segments) } - pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { let key = relmap_file_key(spcnode, dbnode); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; Ok(buf) } - pub fn list_dbdirs(&self, lsn: Lsn) -> Result> { + fn list_dbdirs(&self, lsn: Lsn) -> Result> { // fetch directory entry - let buf = self.tline.get(DBDIR_KEY, lsn)?; + let buf = self.get(DBDIR_KEY, lsn)?; let dir = DbDirectory::des(&buf)?; Ok(dir.dbdirs) } - pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { + fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { let key = twophase_file_key(xid); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; Ok(buf) } - pub fn list_twophase_files(&self, lsn: Lsn) -> Result> { + fn list_twophase_files(&self, lsn: Lsn) -> Result> { // fetch directory entry - let buf = self.tline.get(TWOPHASEDIR_KEY, lsn)?; + let buf = self.get(TWOPHASEDIR_KEY, lsn)?; let dir = 
TwoPhaseDirectory::des(&buf)?; Ok(dir.xids) } - pub fn get_control_file(&self, lsn: Lsn) -> Result { - self.tline.get(CONTROLFILE_KEY, lsn) + fn get_control_file(&self, lsn: Lsn) -> Result { + self.get(CONTROLFILE_KEY, lsn) } - pub fn get_checkpoint(&self, lsn: Lsn) -> Result { - self.tline.get(CHECKPOINT_KEY, lsn) - } - - /// Get the LSN of the last ingested WAL record. - /// - /// This is just a convenience wrapper that calls through to the underlying - /// repository. - pub fn get_last_record_lsn(&self) -> Lsn { - self.tline.get_last_record_lsn() - } - - /// Check that it is valid to request operations with that lsn. - /// - /// This is just a convenience wrapper that calls through to the underlying - /// repository. - pub fn check_lsn_is_in_scope( - &self, - lsn: Lsn, - latest_gc_cutoff_lsn: &RwLockReadGuard, - ) -> Result<()> { - self.tline.check_lsn_is_in_scope(lsn, latest_gc_cutoff_lsn) - } - - /// Retrieve current logical size of the timeline - /// - /// NOTE: counted incrementally, includes ancestors, - pub fn get_current_logical_size(&self) -> usize { - let current_logical_size = self.current_logical_size.load(Ordering::Acquire); - match usize::try_from(current_logical_size) { - Ok(sz) => sz, - Err(_) => { - error!( - "current_logical_size is out of range: {}", - current_logical_size - ); - 0 - } - } + fn get_checkpoint(&self, lsn: Lsn) -> Result { + self.get(CHECKPOINT_KEY, lsn) } /// Does the same as get_current_logical_size but counted on demand. @@ -414,16 +351,16 @@ impl DatadirTimeline { /// /// Only relation blocks are counted currently. That excludes metadata, /// SLRUs, twophase files etc. 
- pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { + fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { // Fetch list of database dirs and iterate them - let buf = self.tline.get(DBDIR_KEY, lsn)?; + let buf = self.get(DBDIR_KEY, lsn)?; let dbdir = DbDirectory::des(&buf)?; let mut total_size: usize = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self.list_rels(*spcnode, *dbnode, lsn)? { let relsize_key = rel_size_to_key(rel); - let mut buf = self.tline.get(relsize_key, lsn)?; + let mut buf = self.get(relsize_key, lsn)?; let relsize = buf.get_u32_le(); total_size += relsize as usize; @@ -444,7 +381,7 @@ impl DatadirTimeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.tline.get(DBDIR_KEY, lsn)?; + let buf = self.get(DBDIR_KEY, lsn)?; let dbdir = DbDirectory::des(&buf)?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); @@ -461,7 +398,7 @@ impl DatadirTimeline { rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); - let mut buf = self.tline.get(relsize_key, lsn)?; + let mut buf = self.get(relsize_key, lsn)?; let relsize = buf.get_u32_le(); result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); @@ -477,13 +414,13 @@ impl DatadirTimeline { ] { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); - let buf = self.tline.get(slrudir_key, lsn)?; + let buf = self.get(slrudir_key, lsn)?; let dir = SlruSegmentDirectory::des(&buf)?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.tline.get(segsize_key, lsn)?; + let mut buf = self.get(segsize_key, lsn)?; let segsize = buf.get_u32_le(); result.add_range( @@ -495,7 +432,7 @@ impl DatadirTimeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.tline.get(TWOPHASEDIR_KEY, 
lsn)?; + let buf = self.get(TWOPHASEDIR_KEY, lsn)?; let twophase_dir = TwoPhaseDirectory::des(&buf)?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); @@ -508,32 +445,17 @@ impl DatadirTimeline { Ok(result.to_keyspace()) } - - pub fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> { - let mut partitioning_guard = self.partitioning.lock().unwrap(); - if partitioning_guard.1 == Lsn(0) - || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold - { - let keyspace = self.collect_keyspace(lsn)?; - let partitioning = keyspace.partition(partition_size); - *partitioning_guard = (partitioning, lsn); - return Ok((partitioning_guard.0.clone(), lsn)); - } - Ok((partitioning_guard.0.clone(), partitioning_guard.1)) - } } /// DatadirModification represents an operation to ingest an atomic set of /// updates to the repository. It is created by the 'begin_record' /// function. It is called for each WAL record, so that all the modifications /// by a one WAL record appear atomic. -pub struct DatadirModification<'a, R: Repository> { +pub struct DatadirModification<'a, T: DatadirTimeline> { /// The timeline this modification applies to. You can access this to /// read the state, but note that any pending updates are *not* reflected /// in the state in 'tline' yet. - pub tline: &'a DatadirTimeline, - - lsn: Lsn, + pub tline: &'a T, // The modifications are not applied directly to the underlying key-value store. // The put-functions add the modifications here, and they are flushed to the @@ -543,7 +465,7 @@ pub struct DatadirModification<'a, R: Repository> { pending_nblocks: isize, } -impl<'a, R: Repository> DatadirModification<'a, R> { +impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { /// Initialize a completely new repository. 
/// /// This inserts the directory metadata entries that are assumed to @@ -920,7 +842,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub fn flush(&mut self) -> Result<()> { + pub fn flush(&mut self, lsn: Lsn) -> Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -928,13 +850,13 @@ impl<'a, R: Repository> DatadirModification<'a, R> { return Ok(()); } - let writer = self.tline.tline.writer(); + let writer = self.tline.writer(); // Flush relation and SLRU data blocks, keep metadata. let mut result: Result<()> = Ok(()); self.pending_updates.retain(|&key, value| { if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) { - result = writer.put(key, self.lsn, value); + result = writer.put(key, lsn, value); false } else { true @@ -943,10 +865,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { result?; if pending_nblocks != 0 { - self.tline.current_logical_size.fetch_add( - pending_nblocks * pg_constants::BLCKSZ as isize, - Ordering::SeqCst, - ); + writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize); self.pending_nblocks = 0; } @@ -956,26 +875,25 @@ impl<'a, R: Repository> DatadirModification<'a, R> { /// /// Finish this atomic update, writing all the updated keys to the /// underlying timeline. + /// All the modifications in this atomic update are stamped by the specified LSN. 
/// - pub fn commit(self) -> Result<()> { - let writer = self.tline.tline.writer(); + pub fn commit(&mut self, lsn: Lsn) -> Result<()> { + let writer = self.tline.writer(); let pending_nblocks = self.pending_nblocks; + self.pending_nblocks = 0; - for (key, value) in self.pending_updates { - writer.put(key, self.lsn, &value)?; + for (key, value) in self.pending_updates.drain() { + writer.put(key, lsn, &value)?; } - for key_range in self.pending_deletions { - writer.delete(key_range.clone(), self.lsn)?; + for key_range in self.pending_deletions.drain(..) { + writer.delete(key_range, lsn)?; } - writer.finish_write(self.lsn); + writer.finish_write(lsn); if pending_nblocks != 0 { - self.tline.current_logical_size.fetch_add( - pending_nblocks * pg_constants::BLCKSZ as isize, - Ordering::SeqCst, - ); + writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize); } Ok(()) @@ -1002,7 +920,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { } } else { let last_lsn = self.tline.get_last_record_lsn(); - self.tline.tline.get(key, last_lsn) + self.tline.get(key, last_lsn) } } @@ -1404,13 +1322,12 @@ fn is_slru_block_key(key: Key) -> bool { pub fn create_test_timeline( repo: R, timeline_id: utils::zid::ZTimelineId, -) -> Result>> { +) -> Result> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; - let tline = DatadirTimeline::new(tline, 256 * 1024); - let mut m = tline.begin_modification(Lsn(8)); + let mut m = tline.begin_modification(); m.init_empty()?; - m.commit()?; - Ok(Arc::new(tline)) + m.commit(Lsn(8))?; + Ok(tline) } #[allow(clippy::bool_assert_comparison)] @@ -1483,7 +1400,7 @@ mod tests { .contains(&TESTREL_A)); // Run checkpoint and garbage collection and check that it's still not visible - newtline.tline.checkpoint(CheckpointConfig::Forced)?; + newtline.checkpoint(CheckpointConfig::Forced)?; repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?; assert!(!newtline diff --git a/pageserver/src/repository.rs 
b/pageserver/src/repository.rs index 359c704e81..0ca8c6150c 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -185,7 +185,7 @@ impl Value { /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. pub trait Repository: Send + Sync { - type Timeline: Timeline; + type Timeline: crate::DatadirTimeline; /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. /// See [`crate::remote_storage`] for more details about the synchronization. @@ -382,6 +382,11 @@ pub trait Timeline: Send + Sync { lsn: Lsn, latest_gc_cutoff_lsn: &RwLockReadGuard, ) -> Result<()>; + + /// Get the physical size of the timeline at the latest LSN + fn get_physical_size(&self) -> u64; + /// Get the physical size of the timeline at the latest LSN non incrementally + fn get_physical_size_non_incremental(&self) -> Result; } /// Various functions to mutate the timeline. @@ -405,6 +410,8 @@ pub trait TimelineWriter<'a> { /// the 'lsn' or anything older. The previous last record LSN is stored alongside /// the latest and can be read. 
fn finish_write(&self, lsn: Lsn); + + fn update_current_logical_size(&self, delta: isize); } #[cfg(test)] diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index ac5fb0bc8c..fe1ba4b5bb 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -176,7 +176,6 @@ use crate::{ layered_repository::{ ephemeral_file::is_ephemeral_file, metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, - LayeredRepository, }, storage_sync::{self, index::RemoteIndex}, tenant_mgr::attach_downloaded_tenants, @@ -1257,7 +1256,13 @@ async fn update_local_metadata( timeline_id, } = sync_id; tokio::task::spawn_blocking(move || { - LayeredRepository::save_metadata(conf, timeline_id, tenant_id, &cloned_metadata, true) + crate::layered_repository::save_metadata( + conf, + timeline_id, + tenant_id, + &cloned_metadata, + true, + ) }) .await .with_context(|| { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 1759d3bbb8..640dfa623a 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,7 +3,6 @@ use crate::config::PageServerConf; use crate::layered_repository::{load_metadata, LayeredRepository}; -use crate::pgdatadir_mapping::DatadirTimeline; use crate::repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; @@ -12,7 +11,7 @@ use crate::thread_mgr::ThreadKind; use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; use crate::{thread_mgr, timelines, walreceiver}; -use crate::{DatadirTimelineImpl, RepositoryImpl}; +use crate::{RepositoryImpl, TimelineImpl}; use anyhow::Context; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; @@ -101,7 +100,7 @@ struct Tenant { /// /// Local timelines have more metadata that's loaded into memory, /// that is located in the `repo.timelines` field, 
[`crate::layered_repository::LayeredTimelineEntry`]. - local_timelines: HashMap>, + local_timelines: HashMap::Timeline>>, } #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -178,7 +177,7 @@ pub enum LocalTimelineUpdate { }, Attach { id: ZTenantTimelineId, - datadir: Arc, + datadir: Arc<::Timeline>, }, } @@ -382,7 +381,7 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result> { +) -> anyhow::Result> { let mut m = tenants_state::write_tenants(); let tenant = m .get_mut(&tenant_id) @@ -489,23 +488,18 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any fn load_local_timeline( repo: &RepositoryImpl, timeline_id: ZTimelineId, -) -> anyhow::Result>> { +) -> anyhow::Result> { let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| { format!("Inmem timeline {timeline_id} not found in tenant's repository") })?; - let repartition_distance = repo.get_checkpoint_distance() / 10; - let page_tline = Arc::new(DatadirTimelineImpl::new( - inmem_timeline, - repartition_distance, - )); - page_tline.init_logical_size()?; + inmem_timeline.init_logical_size()?; tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach { id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id), - datadir: Arc::clone(&page_tline), + datadir: Arc::clone(&inmem_timeline), }); - Ok(page_tline) + Ok(inmem_timeline) } #[serde_as] diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index b0bb4953ca..e51744d3cc 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -120,6 +120,10 @@ pub fn init_tenant_task_pool() -> anyhow::Result<()> { let runtime = tokio::runtime::Builder::new_multi_thread() .thread_name("tenant-task-worker") .enable_all() + .on_thread_start(|| { + thread_mgr::register(ThreadKind::TenantTaskWorker, "tenant-task-worker") + }) + .on_thread_stop(thread_mgr::deregister) .build()?; let (gc_send, mut gc_recv) = 
mpsc::channel::(100); diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index ab0d894c70..6dd2e4b00b 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -97,6 +97,9 @@ pub enum ThreadKind { // Thread that schedules new compaction and gc jobs TenantTaskManager, + // Worker thread for tenant tasks thread pool + TenantTaskWorker, + // Thread that flushes frozen in-memory layers to disk LayerFlushThread, @@ -105,18 +108,20 @@ pub enum ThreadKind { StorageSync, } +#[derive(Default)] struct MutableThreadState { /// Tenant and timeline that this thread is associated with. tenant_id: Option, timeline_id: Option, /// Handle for waiting for the thread to exit. It can be None, if the - /// the thread has already exited. + /// the thread has already exited. OR if this thread is managed externally + /// and was not spawned through thread_mgr.rs::spawn function. join_handle: Option>, } struct PageServerThread { - _thread_id: u64, + thread_id: u64, kind: ThreadKind, @@ -147,7 +152,7 @@ where let (shutdown_tx, shutdown_rx) = watch::channel(()); let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); let thread = Arc::new(PageServerThread { - _thread_id: thread_id, + thread_id, kind, name: name.to_string(), shutdown_requested: AtomicBool::new(false), @@ -315,8 +320,10 @@ pub fn shutdown_threads( drop(thread_mut); let _ = join_handle.join(); } else { - // The thread had not even fully started yet. Or it was shut down - // concurrently and already exited + // Possibly one of: + // * The thread had not even fully started yet. + // * It was shut down concurrently and already exited + // * Is managed through `register`/`deregister` fns without providing a join handle } } } @@ -348,3 +355,56 @@ pub fn is_shutdown_requested() -> bool { } }) } + +/// Needed to register threads that were not spawned through spawn function. +/// For example tokio blocking threads. This function is expected to be used +/// in tandem with `deregister`. 
+/// NOTE: threads registered through this function cannot be joined +pub fn register(kind: ThreadKind, name: &str) { + CURRENT_THREAD.with(|ct| { + let mut borrowed = ct.borrow_mut(); + if borrowed.is_some() { + panic!("thread already registered") + }; + let (shutdown_tx, shutdown_rx) = watch::channel(()); + let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); + + let thread = Arc::new(PageServerThread { + thread_id, + kind, + name: name.to_owned(), + shutdown_requested: AtomicBool::new(false), + shutdown_tx, + mutable: Mutex::new(MutableThreadState { + tenant_id: None, + timeline_id: None, + join_handle: None, + }), + }); + + *borrowed = Some(Arc::clone(&thread)); + + SHUTDOWN_RX.with(|rx| { + *rx.borrow_mut() = Some(shutdown_rx); + }); + + THREADS.lock().unwrap().insert(thread_id, thread); + }); +} + +// Expected to be used in tandem with `register`. See the doc for `register` for more details +pub fn deregister() { + CURRENT_THREAD.with(|ct| { + let mut borrowed = ct.borrow_mut(); + let thread = match borrowed.take() { + Some(thread) => thread, + None => panic!("calling deregister on unregistered thread"), + }; + + SHUTDOWN_RX.with(|rx| { + *rx.borrow_mut() = None; + }); + + THREADS.lock().unwrap().remove(&thread.thread_id) + }); +} diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index a40e705cb9..1088e516aa 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -26,7 +26,7 @@ use crate::{ repository::{LocalTimelineState, Repository}, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, - DatadirTimeline, RepositoryImpl, + DatadirTimeline, RepositoryImpl, TimelineImpl, }; use crate::{import_datadir, LOG_FILE_NAME}; use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager}; @@ -49,32 +49,41 @@ pub struct LocalTimelineInfo { #[serde_as(as = "DisplayFromStr")] pub disk_consistent_lsn: Lsn, pub current_logical_size: Option, // is None when timeline is Unloaded + pub 
current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, + pub current_physical_size_non_incremental: Option, pub timeline_state: LocalTimelineState, } impl LocalTimelineInfo { - pub fn from_loaded_timeline( - datadir_tline: &DatadirTimeline, + pub fn from_loaded_timeline( + timeline: &TimelineImpl, include_non_incremental_logical_size: bool, + include_non_incremental_physical_size: bool, ) -> anyhow::Result { - let last_record_lsn = datadir_tline.tline.get_last_record_lsn(); + let last_record_lsn = timeline.get_last_record_lsn(); let info = LocalTimelineInfo { - ancestor_timeline_id: datadir_tline.tline.get_ancestor_timeline_id(), + ancestor_timeline_id: timeline.get_ancestor_timeline_id(), ancestor_lsn: { - match datadir_tline.tline.get_ancestor_lsn() { + match timeline.get_ancestor_lsn() { Lsn(0) => None, lsn @ Lsn(_) => Some(lsn), } }, - disk_consistent_lsn: datadir_tline.tline.get_disk_consistent_lsn(), + disk_consistent_lsn: timeline.get_disk_consistent_lsn(), last_record_lsn, - prev_record_lsn: Some(datadir_tline.tline.get_prev_record_lsn()), - latest_gc_cutoff_lsn: *datadir_tline.tline.get_latest_gc_cutoff_lsn(), + prev_record_lsn: Some(timeline.get_prev_record_lsn()), + latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Loaded, - current_logical_size: Some(datadir_tline.get_current_logical_size()), + current_physical_size: Some(timeline.get_physical_size()), + current_logical_size: Some(timeline.get_current_logical_size()), current_logical_size_non_incremental: if include_non_incremental_logical_size { - Some(datadir_tline.get_current_logical_size_non_incremental(last_record_lsn)?) + Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?) + } else { + None + }, + current_physical_size_non_incremental: if include_non_incremental_physical_size { + Some(timeline.get_physical_size_non_incremental()?) 
} else { None }, @@ -97,7 +106,9 @@ impl LocalTimelineInfo { latest_gc_cutoff_lsn: metadata.latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Unloaded, current_logical_size: None, + current_physical_size: None, current_logical_size_non_incremental: None, + current_physical_size_non_incremental: None, } } @@ -106,12 +117,16 @@ impl LocalTimelineInfo { timeline_id: ZTimelineId, repo_timeline: &RepositoryTimeline, include_non_incremental_logical_size: bool, + include_non_incremental_physical_size: bool, ) -> anyhow::Result { match repo_timeline { RepositoryTimeline::Loaded(_) => { - let datadir_tline = - tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)?; - Self::from_loaded_timeline(&datadir_tline, include_non_incremental_logical_size) + let timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)?; + Self::from_loaded_timeline( + &*timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) } RepositoryTimeline::Unloaded { metadata } => Ok(Self::from_unloaded_timeline(metadata)), } @@ -298,19 +313,18 @@ fn bootstrap_timeline( // Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. 
let timeline = repo.create_empty_timeline(tli, lsn)?; - let mut page_tline: DatadirTimeline = DatadirTimeline::new(timeline, u64::MAX); - import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?; + import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; fail::fail_point!("before-checkpoint-new-timeline", |_| { bail!("failpoint before-checkpoint-new-timeline"); }); - page_tline.tline.checkpoint(CheckpointConfig::Forced)?; + timeline.checkpoint(CheckpointConfig::Forced)?; info!( "created root timeline {} timeline.lsn {}", tli, - page_tline.tline.get_last_record_lsn() + timeline.get_last_record_lsn() ); // Remove temp dir. We don't need it anymore @@ -322,6 +336,7 @@ fn bootstrap_timeline( pub(crate) fn get_local_timelines( tenant_id: ZTenantId, include_non_incremental_logical_size: bool, + include_non_incremental_physical_size: bool, ) -> Result> { let repo = tenant_mgr::get_repository_for_tenant(tenant_id) .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?; @@ -336,6 +351,7 @@ pub(crate) fn get_local_timelines( timeline_id, &repository_timeline, include_non_incremental_logical_size, + include_non_incremental_physical_size, )?, )) } @@ -389,7 +405,7 @@ pub(crate) fn create_timeline( // load the timeline into memory let loaded_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - LocalTimelineInfo::from_loaded_timeline(&loaded_timeline, false) + LocalTimelineInfo::from_loaded_timeline(&*loaded_timeline, false, false) .context("cannot fill timeline info")? } None => { @@ -397,7 +413,7 @@ pub(crate) fn create_timeline( // load the timeline into memory let new_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - LocalTimelineInfo::from_loaded_timeline(&new_timeline, false) + LocalTimelineInfo::from_loaded_timeline(&*new_timeline, false, false) .context("cannot fill timeline info")? 
} }; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 2f39007e9f..8dd14ec177 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -34,7 +34,6 @@ use std::collections::HashMap; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; -use crate::repository::Repository; use crate::walrecord::*; use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::xlog_utils::*; @@ -44,8 +43,8 @@ use utils::lsn::Lsn; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); -pub struct WalIngest<'a, R: Repository> { - timeline: &'a DatadirTimeline, +pub struct WalIngest<'a, T: DatadirTimeline> { + timeline: &'a T, checkpoint: CheckPoint, checkpoint_modified: bool, @@ -53,8 +52,8 @@ pub struct WalIngest<'a, R: Repository> { relsize_cache: HashMap, } -impl<'a, R: Repository> WalIngest<'a, R> { - pub fn new(timeline: &DatadirTimeline, startpoint: Lsn) -> Result> { +impl<'a, T: DatadirTimeline> WalIngest<'a, T> { + pub fn new(timeline: &T, startpoint: Lsn) -> Result> { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
let checkpoint_bytes = timeline.get_checkpoint(startpoint)?; @@ -78,13 +77,13 @@ impl<'a, R: Repository> WalIngest<'a, R> { /// pub fn ingest_record( &mut self, - timeline: &DatadirTimeline, recdata: Bytes, lsn: Lsn, + modification: &mut DatadirModification, + decoded: &mut DecodedWALRecord, ) -> Result<()> { - let mut modification = timeline.begin_modification(lsn); + decode_wal_record(recdata, decoded).context("failed decoding wal record")?; - let mut decoded = decode_wal_record(recdata).context("failed decoding wal record")?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -98,7 +97,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, &mut modification, &mut decoded)?; + self.ingest_heapam_record(&mut buf, modification, decoded)?; } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID @@ -106,19 +105,19 @@ impl<'a, R: Repository> WalIngest<'a, R> { == pg_constants::XLOG_SMGR_CREATE { let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(&mut modification, &create)?; + self.ingest_xlog_smgr_create(modification, &create)?; } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(&mut modification, &truncate)?; + self.ingest_xlog_smgr_truncate(modification, &truncate)?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE { let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(&mut modification, &createdb)?; + self.ingest_xlog_dbase_create(modification, &createdb)?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_DROP { 
@@ -137,7 +136,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( - &mut modification, + modification, SlruKind::Clog, segno, rpageno, @@ -146,7 +145,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(&mut modification, &xlrec)?; + self.ingest_clog_truncate_record(modification, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; @@ -154,7 +153,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); self.ingest_xact_record( - &mut modification, + modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, )?; @@ -164,7 +163,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); self.ingest_xact_record( - &mut modification, + modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, )?; @@ -187,7 +186,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( - &mut modification, + modification, SlruKind::MultiXactOffsets, segno, rpageno, @@ -198,7 +197,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( - &mut modification, + modification, SlruKind::MultiXactMembers, segno, rpageno, @@ -206,14 +205,14 @@ impl<'a, R: Repository> WalIngest<'a, R> { )?; } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut buf); - 
self.ingest_multixact_create_record(&mut modification, &xlrec)?; + self.ingest_multixact_create_record(modification, &xlrec)?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(&mut modification, &xlrec)?; + self.ingest_multixact_truncate_record(modification, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(&mut modification, &xlrec, &decoded)?; + self.ingest_relmap_page(modification, &xlrec, decoded)?; } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_NEXTOID { @@ -248,7 +247,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. for blk in decoded.blocks.iter() { - self.ingest_decoded_block(&mut modification, lsn, &decoded, blk)?; + self.ingest_decoded_block(modification, lsn, decoded, blk)?; } // If checkpoint data was updated, store the new version in the repository @@ -261,14 +260,14 @@ impl<'a, R: Repository> WalIngest<'a, R> { // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - modification.commit()?; + modification.commit(lsn)?; Ok(()) } fn ingest_decoded_block( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, @@ -328,7 +327,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_heapam_record( &mut self, buf: &mut Bytes, - modification: &mut DatadirModification, + modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, ) -> Result<()> { // Handle VM bit updates that are implicitly part of heap records. 
@@ -472,7 +471,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. fn ingest_xlog_dbase_create( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlCreateDatabase, ) -> Result<()> { let db_id = rec.db_id; @@ -539,7 +538,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_xlog_smgr_create( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlSmgrCreate, ) -> Result<()> { let rel = RelTag { @@ -557,7 +556,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { /// This is the same logic as in PostgreSQL's smgr_redo() function. fn ingest_xlog_smgr_truncate( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlSmgrTruncate, ) -> Result<()> { let spcnode = rec.rnode.spcnode; @@ -622,7 +621,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { /// fn ingest_xact_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, parsed: &XlXactParsedRecord, is_commit: bool, ) -> Result<()> { @@ -691,7 +690,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_clog_truncate_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlClogTruncate, ) -> Result<()> { info!( @@ -749,7 +748,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_multixact_create_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlMultiXactCreate, ) -> Result<()> { // Create WAL record for updating the multixact-offsets page @@ -828,7 +827,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_multixact_truncate_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlMultiXactTruncate, ) -> Result<()> { self.checkpoint.oldestMulti = xlrec.end_trunc_off; @@ -862,7 +861,7 @@ 
impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_relmap_page( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlRelmapUpdate, decoded: &DecodedWALRecord, ) -> Result<()> { @@ -878,7 +877,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_creation( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, ) -> Result<()> { self.relsize_cache.insert(rel, 0); @@ -888,7 +887,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, img: Bytes, @@ -900,7 +899,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_wal_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, rec: ZenithWalRecord, @@ -912,7 +911,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_truncation( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, nblocks: BlockNumber, ) -> Result<()> { @@ -923,7 +922,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_drop( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, ) -> Result<()> { modification.put_rel_drop(rel)?; @@ -948,7 +947,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn handle_rel_extend( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, ) -> Result<()> { @@ -986,7 +985,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_slru_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, kind: SlruKind, segno: u32, blknum: BlockNumber, @@ -999,7 +998,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn handle_slru_extend( &mut self, - modification: &mut 
DatadirModification, + modification: &mut DatadirModification, kind: SlruKind, segno: u32, blknum: BlockNumber, @@ -1052,6 +1051,7 @@ mod tests { use super::*; use crate::pgdatadir_mapping::create_test_timeline; use crate::repository::repo_harness::*; + use crate::repository::Timeline; use postgres_ffi::pg_constants; /// Arbitrary relation tag, for testing. @@ -1062,17 +1062,17 @@ mod tests { forknum: 0, }; - fn assert_current_logical_size(_timeline: &DatadirTimeline, _lsn: Lsn) { + fn assert_current_logical_size(_timeline: &T, _lsn: Lsn) { // TODO } static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - fn init_walingest_test(tline: &DatadirTimeline) -> Result> { - let mut m = tline.begin_modification(Lsn(0x10)); + fn init_walingest_test(tline: &T) -> Result> { + let mut m = tline.begin_modification(); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file - m.commit()?; + m.commit(Lsn(0x10))?; let walingest = WalIngest::new(tline, Lsn(0x10))?; Ok(walingest) @@ -1082,23 +1082,23 @@ mod tests { fn test_relsize() -> Result<()> { let repo = RepoHarness::create("test_relsize")?.load(); let tline = create_test_timeline(repo, TIMELINE_ID)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&*tline)?; - let mut m = tline.begin_modification(Lsn(0x20)); + let mut m = tline.begin_modification(); walingest.put_rel_creation(&mut m, TESTREL_A)?; walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; - m.commit()?; - let mut m = tline.begin_modification(Lsn(0x30)); + m.commit(Lsn(0x20))?; + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?; - m.commit()?; - let mut m = tline.begin_modification(Lsn(0x40)); + m.commit(Lsn(0x30))?; + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?; - 
m.commit()?; - let mut m = tline.begin_modification(Lsn(0x50)); + m.commit(Lsn(0x40))?; + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?; - m.commit()?; + m.commit(Lsn(0x50))?; - assert_current_logical_size(&tline, Lsn(0x50)); + assert_current_logical_size(&*tline, Lsn(0x50)); // The relation was created at LSN 2, not visible at LSN 1 yet. assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); @@ -1142,10 +1142,10 @@ mod tests { ); // Truncate last block - let mut m = tline.begin_modification(Lsn(0x60)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?; - m.commit()?; - assert_current_logical_size(&tline, Lsn(0x60)); + m.commit(Lsn(0x60))?; + assert_current_logical_size(&*tline, Lsn(0x60)); // Check reported size and contents after truncation assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 2); @@ -1166,15 +1166,15 @@ mod tests { ); // Truncate to zero length - let mut m = tline.begin_modification(Lsn(0x68)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?; - m.commit()?; + m.commit(Lsn(0x68))?; assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68))?, 0); // Extend from 0 to 2 blocks, leaving a gap - let mut m = tline.begin_modification(Lsn(0x70)); + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; - m.commit()?; + m.commit(Lsn(0x70))?; assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70))?, 2); assert_eq!( tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, @@ -1186,9 +1186,9 @@ mod tests { ); // Extend a lot more, leaving a big gap that spans across segments - let mut m = tline.begin_modification(Lsn(0x80)); + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; - m.commit()?; + m.commit(Lsn(0x80))?; assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, 
1501); for blk in 2..1500 { assert_eq!( @@ -1210,20 +1210,20 @@ mod tests { fn test_drop_extend() -> Result<()> { let repo = RepoHarness::create("test_drop_extend")?.load(); let tline = create_test_timeline(repo, TIMELINE_ID)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&*tline)?; - let mut m = tline.begin_modification(Lsn(0x20)); + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; - m.commit()?; + m.commit(Lsn(0x20))?; // Check that rel exists and size is correct assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1); // Drop rel - let mut m = tline.begin_modification(Lsn(0x30)); + let mut m = tline.begin_modification(); walingest.put_rel_drop(&mut m, TESTREL_A)?; - m.commit()?; + m.commit(Lsn(0x30))?; // Check that rel is not visible anymore assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false); @@ -1232,9 +1232,9 @@ mod tests { //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30))?.is_none()); // Re-create it - let mut m = tline.begin_modification(Lsn(0x40)); + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?; - m.commit()?; + m.commit(Lsn(0x40))?; // Check that rel exists and size is correct assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true); @@ -1250,16 +1250,16 @@ mod tests { fn test_truncate_extend() -> Result<()> { let repo = RepoHarness::create("test_truncate_extend")?.load(); let tline = create_test_timeline(repo, TIMELINE_ID)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&*tline)?; // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; - let mut m = tline.begin_modification(Lsn(0x20)); + let mut m = tline.begin_modification(); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, 
Lsn(0x20)); walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; } - m.commit()?; + m.commit(Lsn(0x20))?; // The relation was created at LSN 20, not visible at LSN 1 yet. assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); @@ -1280,9 +1280,9 @@ mod tests { // Truncate relation so that second segment was dropped // - only leave one page - let mut m = tline.begin_modification(Lsn(0x60)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, 1)?; - m.commit()?; + m.commit(Lsn(0x60))?; // Check reported size and contents after truncation assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 1); @@ -1310,12 +1310,12 @@ mod tests { // Extend relation again. // Add enough blocks to create second segment let lsn = Lsn(0x80); - let mut m = tline.begin_modification(lsn); + let mut m = tline.begin_modification(); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; } - m.commit()?; + m.commit(lsn)?; assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true); assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, relsize); @@ -1338,18 +1338,18 @@ mod tests { fn test_large_rel() -> Result<()> { let repo = RepoHarness::create("test_large_rel")?.load(); let tline = create_test_timeline(repo, TIMELINE_ID)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; for blknum in 0..pg_constants::RELSEG_SIZE + 1 { lsn += 0x10; - let mut m = tline.begin_modification(Lsn(lsn)); + let mut m = tline.begin_modification(); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?; - m.commit()?; + m.commit(Lsn(lsn))?; } - assert_current_logical_size(&tline, Lsn(lsn)); + assert_current_logical_size(&*tline, Lsn(lsn)); assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, 
@@ -1358,34 +1358,34 @@ mod tests { // Truncate one block lsn += 0x10; - let mut m = tline.begin_modification(Lsn(lsn)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE)?; - m.commit()?; + m.commit(Lsn(lsn))?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, pg_constants::RELSEG_SIZE ); - assert_current_logical_size(&tline, Lsn(lsn)); + assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate another block lsn += 0x10; - let mut m = tline.begin_modification(Lsn(lsn)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?; - m.commit()?; + m.commit(Lsn(lsn))?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, pg_constants::RELSEG_SIZE - 1 ); - assert_current_logical_size(&tline, Lsn(lsn)); + assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate to 1500, and then truncate all the way down to 0, one block at a time // This tests the behavior at segment boundaries let mut size: i32 = 3000; while size >= 0 { lsn += 0x10; - let mut m = tline.begin_modification(Lsn(lsn)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?; - m.commit()?; + m.commit(Lsn(lsn))?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, size as BlockNumber @@ -1393,7 +1393,7 @@ mod tests { size -= 1; } - assert_current_logical_size(&tline, Lsn(lsn)); + assert_current_logical_size(&*tline, Lsn(lsn)); Ok(()) } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 614bca50ad..f2b1671eb4 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -25,7 +25,8 @@ use etcd_broker::{ use tokio::select; use tracing::*; -use crate::DatadirTimelineImpl; +use crate::repository::{Repository, Timeline}; +use crate::{RepositoryImpl, TimelineImpl}; use utils::{ lsn::Lsn, 
pq_proto::ReplicationFeedback, @@ -39,7 +40,7 @@ pub(super) fn spawn_connection_manager_task( id: ZTenantTimelineId, broker_loop_prefix: String, mut client: Client, - local_timeline: Arc, + local_timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, @@ -229,8 +230,8 @@ async fn subscribe_for_timeline_updates( } } -const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 2.0; -const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 60.0; +const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; +const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) { if n == 0 { @@ -245,7 +246,7 @@ async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) { struct WalreceiverState { id: ZTenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. - local_timeline: Arc, + local_timeline: Arc, /// The timeout on the connection to safekeeper for WAL streaming. wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. 
@@ -283,7 +284,7 @@ struct EtcdSkTimeline { impl WalreceiverState { fn new( id: ZTenantTimelineId, - local_timeline: Arc, + local_timeline: Arc<::Timeline>, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, @@ -1203,13 +1204,10 @@ mod tests { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, - local_timeline: Arc::new(DatadirTimelineImpl::new( - harness - .load() - .create_empty_timeline(TIMELINE_ID, Lsn(0)) - .expect("Failed to create an empty timeline for dummy wal connection manager"), - 10_000, - )), + local_timeline: harness + .load() + .create_empty_timeline(TIMELINE_ID, Lsn(0)) + .expect("Failed to create an empty timeline for dummy wal connection manager"), wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1).unwrap(), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 98b36dfe48..ca29c00771 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -9,20 +9,22 @@ use std::{ use anyhow::{bail, ensure, Context}; use bytes::BytesMut; use fail::fail_point; +use futures::StreamExt; use postgres::{SimpleQueryMessage, SimpleQueryRow}; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; -use tokio_stream::StreamExt; use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::{ http::models::WalReceiverEntry, + pgdatadir_mapping::DatadirTimeline, repository::{Repository, Timeline}, tenant_mgr, walingest::WalIngest, + walrecord::DecodedWALRecord, }; use postgres_ffi::waldecoder::WalStreamDecoder; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; @@ -150,19 +152,25 @@ pub async fn 
handle_walreceiver_connection( waldecoder.feed_bytes(data); - while let Some((lsn, recdata)) = waldecoder.poll_decode()? { - let _enter = info_span!("processing record", lsn = %lsn).entered(); + { + let mut decoded = DecodedWALRecord::default(); + let mut modification = timeline.begin_modification(); + while let Some((lsn, recdata)) = waldecoder.poll_decode()? { + // let _enter = info_span!("processing record", lsn = %lsn).entered(); - // It is important to deal with the aligned records as lsn in getPage@LSN is - // aligned and can be several bytes bigger. Without this alignment we are - // at risk of hitting a deadlock. - ensure!(lsn.is_aligned()); + // It is important to deal with the aligned records as lsn in getPage@LSN is + // aligned and can be several bytes bigger. Without this alignment we are + // at risk of hitting a deadlock. + ensure!(lsn.is_aligned()); - walingest.ingest_record(&timeline, recdata, lsn)?; + walingest + .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .context("could not ingest record at {lsn}")?; - fail_point!("walreceiver-after-ingest"); + fail_point!("walreceiver-after-ingest"); - last_rec_lsn = lsn; + last_rec_lsn = lsn; + } } if !caught_up && endlsn >= end_of_wal { @@ -170,7 +178,7 @@ pub async fn handle_walreceiver_connection( caught_up = true; } - let timeline_to_check = Arc::clone(&timeline.tline); + let timeline_to_check = Arc::clone(&timeline); tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) .await .with_context(|| { @@ -218,7 +226,7 @@ pub async fn handle_walreceiver_connection( // The last LSN we processed. It is not guaranteed to survive pageserver crash. 
let write_lsn = u64::from(last_lsn); // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn()); + let flush_lsn = u64::from(timeline.get_disk_consistent_lsn()); // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. let apply_lsn = u64::from(timeline_remote_consistent_lsn); diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 5a384360e2..6b01d52005 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -96,6 +96,7 @@ impl DecodedBkpBlock { } } +#[derive(Default)] pub struct DecodedWALRecord { pub xl_xid: TransactionId, pub xl_info: u8, @@ -505,7 +506,17 @@ impl XlMultiXactTruncate { // block data // ... // main data -pub fn decode_wal_record(record: Bytes) -> Result { +// +// +// For performance reasons, the caller provides the DecodedWALRecord struct and the function just fills it in. +// It would be more natural for this function to return a DecodedWALRecord as return value, +// but reusing the caller-supplied struct avoids an allocation. +// This code is in the hot path for digesting incoming WAL, and is very performance sensitive. +// +pub fn decode_wal_record( + record: Bytes, + decoded: &mut DecodedWALRecord, +) -> Result<(), DeserializeError> { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; @@ -534,7 +545,7 @@ pub fn decode_wal_record(record: Bytes) -> Result = Vec::new(); + decoded.blocks.clear(); // 2. Decode the headers. 
// XLogRecordBlockHeaders if any, @@ -713,7 +724,7 @@ pub fn decode_wal_record(record: Bytes) -> Result { @@ -724,7 +735,7 @@ pub fn decode_wal_record(record: Bytes) -> Result Result e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), Sasl(e) => e.to_string_client(), - MalformedPassword => self.to_string(), + MalformedPassword(_) => self.to_string(), _ => "Internal error".to_string(), } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 1d41f7f932..5e87059c86 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,16 +1,14 @@ -mod legacy_console; mod link; mod postgres; pub mod console; +mod legacy_console; pub use legacy_console::{AuthError, AuthErrorImpl}; -use super::ClientCredentials; use crate::{ - compute, - config::{AuthBackendType, ProxyConfig}, - mgmt, + auth::{self, AuthFlow, ClientCredentials}, + compute, config, mgmt, stream::PqStream, waiters::{self, Waiter, Waiters}, }; @@ -78,32 +76,158 @@ impl From for tokio_postgres::Config { } } -pub(super) async fn handle_user( - config: &ProxyConfig, - client: &mut PqStream, - creds: ClientCredentials, -) -> super::Result { - use AuthBackendType::*; - match config.auth_backend { - LegacyConsole => { - legacy_console::handle_user( - &config.auth_endpoint, - &config.auth_link_uri, - client, - &creds, - ) - .await +/// This type serves two purposes: +/// +/// * When `T` is `()`, it's just a regular auth backend selector +/// which we use in [`crate::config::ProxyConfig`]. +/// +/// * However, when we substitute `T` with [`ClientCredentials`], +/// this helps us provide the credentials only to those auth +/// backends which require them for the authentication process. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum BackendType { + /// Legacy Cloud API (V1) + link auth. + LegacyConsole(T), + /// Current Cloud API (V2). + Console(T), + /// Local mock of Cloud API (V2). + Postgres(T), + /// Authentication via a web browser. 
+ Link, +} + +impl BackendType { + /// Very similar to [`std::option::Option::map`]. + /// Maps [`BackendType`] to [`BackendType`] by applying + /// a function to a contained value. + pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType { + use BackendType::*; + match self { + LegacyConsole(x) => LegacyConsole(f(x)), + Console(x) => Console(f(x)), + Postgres(x) => Postgres(f(x)), + Link => Link, + } + } +} + +impl BackendType> { + /// Very similar to [`std::option::Option::transpose`]. + /// This is most useful for error handling. + pub fn transpose(self) -> Result, E> { + use BackendType::*; + match self { + LegacyConsole(x) => x.map(LegacyConsole), + Console(x) => x.map(Console), + Postgres(x) => x.map(Postgres), + Link => Ok(Link), + } + } +} + +impl BackendType { + /// Authenticate the client via the requested backend, possibly using credentials. + pub async fn authenticate( + mut self, + urls: &config::AuthUrls, + client: &mut PqStream, + ) -> super::Result { + use BackendType::*; + + if let Console(creds) | Postgres(creds) = &mut self { + // If there's no project so far, that entails that client doesn't + // support SNI or other means of passing the project name. + // We now expect to see a very specific payload in the place of password. + if creds.project().is_none() { + let payload = AuthFlow::new(client) + .begin(auth::PasswordHack) + .await? + .authenticate() + .await?; + + // Finally we may finish the initialization of `creds`. + // TODO: add missing type safety to ClientCredentials. + creds.project = Some(payload.project); + + let mut config = match &self { + Console(creds) => { + console::Api::new(&urls.auth_endpoint, creds) + .wake_compute() + .await? + } + Postgres(creds) => { + postgres::Api::new(&urls.auth_endpoint, creds) + .wake_compute() + .await? + } + _ => unreachable!("see the patterns above"), + }; + + // We should use a password from payload as well. 
+ config.password(payload.password); + + return Ok(compute::NodeInfo { + reported_auth_ok: false, + config, + }); + } + } + + match self { + LegacyConsole(creds) => { + legacy_console::handle_user( + &urls.auth_endpoint, + &urls.auth_link_uri, + &creds, + client, + ) + .await + } + Console(creds) => { + console::Api::new(&urls.auth_endpoint, &creds) + .handle_user(client) + .await + } + Postgres(creds) => { + postgres::Api::new(&urls.auth_endpoint, &creds) + .handle_user(client) + .await + } + // NOTE: this auth backend doesn't use client credentials. + Link => link::handle_user(&urls.auth_link_uri, client).await, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_backend_type_map() { + let values = [ + BackendType::LegacyConsole(0), + BackendType::Console(0), + BackendType::Postgres(0), + BackendType::Link, + ]; + + for value in values { + assert_eq!(value.map(|x| x), value); + } + } + + #[test] + fn test_backend_type_transpose() { + let values = [ + BackendType::LegacyConsole(Ok::<_, ()>(0)), + BackendType::Console(Ok(0)), + BackendType::Postgres(Ok(0)), + BackendType::Link, + ]; + + for value in values { + assert_eq!(value.map(Result::unwrap), value.transpose().unwrap()); } - Console => { - console::Api::new(&config.auth_endpoint, &creds)? - .handle_user(client) - .await - } - Postgres => { - postgres::Api::new(&config.auth_endpoint, &creds)? - .handle_user(client) - .await - } - Link => link::handle_user(&config.auth_link_uri, client).await, } } diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index 3085f0b0e4..a8ff1a3522 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -1,18 +1,17 @@ //! Cloud API V2. 
use crate::{ - auth::{self, AuthFlow, ClientCredentials, DatabaseInfo}, - compute, - error::UserFacingError, + auth::{self, AuthFlow, ClientCredentials}, + compute::{self, ComputeConnCfg}, + error::{io_error, UserFacingError}, scram, stream::PqStream, url::ApiUrl, }; use serde::{Deserialize, Serialize}; -use std::{future::Future, io}; +use std::future::Future; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; pub type Result = std::result::Result; @@ -84,8 +83,8 @@ pub(super) struct Api<'a> { impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. - pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result { - Ok(Self { endpoint, creds }) + pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self { + Self { endpoint, creds } } /// Authenticate the existing user or throw an error. @@ -100,7 +99,7 @@ impl<'a> Api<'a> { let mut url = self.endpoint.clone(); url.path_segments_mut().push("proxy_get_role_secret"); url.query_pairs_mut() - .append_pair("project", self.creds.project_name.as_ref()?) + .append_pair("project", self.creds.project().expect("impossible")) .append_pair("role", &self.creds.user); // TODO: use a proper logger @@ -120,11 +119,11 @@ impl<'a> Api<'a> { } /// Wake up the compute node and return the corresponding connection info. 
- async fn wake_compute(&self) -> Result { + pub(super) async fn wake_compute(&self) -> Result { let mut url = self.endpoint.clone(); url.path_segments_mut().push("proxy_wake_compute"); - let project_name = self.creds.project_name.as_ref()?; - url.query_pairs_mut().append_pair("project", project_name); + url.query_pairs_mut() + .append_pair("project", self.creds.project().expect("impossible")); // TODO: use a proper logger println!("cplane request: {url}"); @@ -137,16 +136,20 @@ impl<'a> Api<'a> { let response: GetWakeComputeResponse = serde_json::from_str(&resp.text().await.map_err(io_error)?)?; - let (host, port) = parse_host_port(&response.address) - .ok_or(ConsoleAuthError::BadComputeAddress(response.address))?; + // Unfortunately, ownership won't let us use `Option::ok_or` here. + let (host, port) = match parse_host_port(&response.address) { + None => return Err(ConsoleAuthError::BadComputeAddress(response.address)), + Some(x) => x, + }; - Ok(DatabaseInfo { - host, - port, - dbname: self.creds.dbname.to_owned(), - user: self.creds.user.to_owned(), - password: None, - }) + let mut config = ComputeConnCfg::new(); + config + .host(host) + .port(port) + .dbname(&self.creds.dbname) + .user(&self.creds.user); + + Ok(config) } } @@ -160,7 +163,7 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>( ) -> auth::Result where GetAuthInfo: Future>, - WakeCompute: Future>, + WakeCompute: Future>, { let auth_info = get_auth_info(endpoint).await?; @@ -179,48 +182,18 @@ where } }; - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; + let mut config = wake_compute(endpoint).await?; + if let Some(keys) = scram_keys { + config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys)); + } Ok(compute::NodeInfo { - db_info: wake_compute(endpoint).await?, - scram_keys, + reported_auth_ok: false, + config, }) } -/// Upcast (almost) any error into an opaque [`io::Error`]. 
-pub(super) fn io_error(e: impl Into>) -> io::Error { - io::Error::new(io::ErrorKind::Other, e) -} - -fn parse_host_port(input: &str) -> Option<(String, u16)> { +fn parse_host_port(input: &str) -> Option<(&str, u16)> { let (host, port) = input.split_once(':')?; - Some((host.to_owned(), port.parse().ok()?)) -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn parse_db_info() -> anyhow::Result<()> { - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "password": "password", - }))?; - - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - }))?; - - Ok(()) - } + Some((host, port.parse().ok()?)) } diff --git a/proxy/src/auth/backend/legacy_console.rs b/proxy/src/auth/backend/legacy_console.rs index 467da63a98..7a5e9b6f62 100644 --- a/proxy/src/auth/backend/legacy_console.rs +++ b/proxy/src/auth/backend/legacy_console.rs @@ -11,7 +11,7 @@ use crate::{ use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; +use utils::pq_proto::BeMessage as Be; #[derive(Debug, Error)] pub enum AuthErrorImpl { @@ -76,6 +76,12 @@ enum ProxyAuthResponse { NotReady { ready: bool }, // TODO: get rid of `ready` } +impl ClientCredentials { + fn is_existing_user(&self) -> bool { + self.user.ends_with("@zenith") + } +} + async fn authenticate_proxy_client( auth_endpoint: &reqwest::Url, creds: &ClientCredentials, @@ -100,7 +106,7 @@ async fn authenticate_proxy_client( } let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?; - println!("got auth info: #{:?}", auth_info); + println!("got auth info: {:?}", auth_info); use ProxyAuthResponse::*; let db_info = match auth_info { @@ -128,7 +134,9 @@ async fn handle_existing_user( // Read client's password 
hash let msg = client.read_password_message().await?; - let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword)?; + let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword( + "the password should be a valid null-terminated utf-8 string", + ))?; let db_info = authenticate_proxy_client( auth_endpoint, @@ -139,21 +147,17 @@ async fn handle_existing_user( ) .await?; - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; - Ok(compute::NodeInfo { - db_info, - scram_keys: None, + reported_auth_ok: false, + config: db_info.into(), }) } pub async fn handle_user( auth_endpoint: &reqwest::Url, auth_link_uri: &reqwest::Url, - client: &mut PqStream, creds: &ClientCredentials, + client: &mut PqStream, ) -> auth::Result { if creds.is_existing_user() { handle_existing_user(auth_endpoint, client, creds).await @@ -201,4 +205,24 @@ mod tests { .unwrap(); assert!(matches!(auth, ProxyAuthResponse::NotReady { .. 
})); } + + #[test] + fn parse_db_info() -> anyhow::Result<()> { + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + }))?; + + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + }))?; + + Ok(()) + } } diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 669c9e00e9..d658a34825 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -41,7 +41,7 @@ pub async fn handle_user( client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; Ok(compute::NodeInfo { - db_info, - scram_keys: None, + reported_auth_ok: true, + config: db_info.into(), }) } diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs index 721b9db095..1d7ab8f249 100644 --- a/proxy/src/auth/backend/postgres.rs +++ b/proxy/src/auth/backend/postgres.rs @@ -3,10 +3,12 @@ use crate::{ auth::{ self, - backend::console::{self, io_error, AuthInfo, Result}, - ClientCredentials, DatabaseInfo, + backend::console::{self, AuthInfo, Result}, + ClientCredentials, }, - compute, scram, + compute::{self, ComputeConnCfg}, + error::io_error, + scram, stream::PqStream, url::ApiUrl, }; @@ -20,8 +22,8 @@ pub(super) struct Api<'a> { impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. - pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result { - Ok(Self { endpoint, creds }) + pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self { + Self { endpoint, creds } } /// Authenticate the existing user or throw an error. @@ -56,7 +58,10 @@ impl<'a> Api<'a> { // We shouldn't get more than one row anyway. [row, ..] 
=> { - let entry = row.try_get(0).map_err(io_error)?; + let entry = row + .try_get("rolpassword") + .map_err(|e| io_error(format!("failed to read user's password: {e}")))?; + scram::ServerSecret::parse(entry) .map(AuthInfo::Scram) .or_else(|| { @@ -75,14 +80,14 @@ impl<'a> Api<'a> { } /// We don't need to wake anything locally, so we just return the connection info. - async fn wake_compute(&self) -> Result { - Ok(DatabaseInfo { - // TODO: handle that near CLI params parsing - host: self.endpoint.host_str().unwrap_or("localhost").to_owned(), - port: self.endpoint.port().unwrap_or(5432), - dbname: self.creds.dbname.to_owned(), - user: self.creds.user.to_owned(), - password: None, - }) + pub(super) async fn wake_compute(&self) -> Result { + let mut config = ComputeConnCfg::new(); + config + .host(self.endpoint.host_str().unwrap_or("localhost")) + .port(self.endpoint.port().unwrap_or(5432)) + .dbname(&self.creds.dbname) + .user(&self.creds.user); + + Ok(config) } } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index b5312fbe1f..4c72da1c48 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,39 +1,25 @@ //! User credentials used in authentication. -use crate::compute; -use crate::config::ProxyConfig; use crate::error::UserFacingError; -use crate::stream::PqStream; -use std::collections::HashMap; use thiserror::Error; -use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::StartupMessageParams; #[derive(Debug, Error, PartialEq, Eq, Clone)] pub enum ClientCredsParseError { - #[error("Parameter `{0}` is missing in startup packet.")] + #[error("Parameter '{0}' is missing in startup packet.")] MissingKey(&'static str), - #[error( - "Project name is not specified. \ - EITHER please upgrade the postgres client library (libpq) for SNI support \ - OR pass the project name as a parameter: '&options=project%3D'." 
- )] - MissingSNIAndProjectName, - #[error("Inconsistent project name inferred from SNI ('{0}') and project option ('{1}').")] - InconsistentProjectNameAndSNI(String, String), - - #[error("Common name is not set.")] - CommonNameNotSet, + InconsistentProjectNames(String, String), #[error( "SNI ('{1}') inconsistently formatted with respect to common name ('{0}'). \ - SNI should be formatted as '.'." + SNI should be formatted as '.{0}'." )] - InconsistentCommonNameAndSNI(String, String), + InconsistentSni(String, String), - #[error("Project name ('{0}') must contain only alphanumeric characters and hyphens ('-').")] - ProjectNameContainsIllegalChars(String), + #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] + MalformedProjectName(String), } impl UserFacingError for ClientCredsParseError {} @@ -44,286 +30,171 @@ impl UserFacingError for ClientCredsParseError {} pub struct ClientCredentials { pub user: String, pub dbname: String, - pub project_name: Result, + pub project: Option, } impl ClientCredentials { - pub fn is_existing_user(&self) -> bool { - // This logic will likely change in the future. - self.user.ends_with("@zenith") + pub fn project(&self) -> Option<&str> { + self.project.as_deref() } +} +impl ClientCredentials { pub fn parse( - mut options: HashMap, - sni_data: Option<&str>, + mut options: StartupMessageParams, + sni: Option<&str>, common_name: Option<&str>, ) -> Result { - let mut get_param = |key| { - options - .remove(key) - .ok_or(ClientCredsParseError::MissingKey(key)) - }; + use ClientCredsParseError::*; + // Some parameters are absolutely necessary, others not so much. + let mut get_param = |key| options.remove(key).ok_or(MissingKey(key)); + + // Some parameters are stored in the startup message. 
let user = get_param("user")?; let dbname = get_param("database")?; - let project_name = get_param("project").ok(); - let project_name = get_project_name(sni_data, common_name, project_name.as_deref()); + let project_a = get_param("project").ok(); + + // Alternative project name is in fact a subdomain from SNI. + // NOTE: we do not consider SNI if `common_name` is missing. + let project_b = sni + .zip(common_name) + .map(|(sni, cn)| { + // TODO: what if SNI is present but just a common name? + subdomain_from_sni(sni, cn) + .ok_or_else(|| InconsistentSni(sni.to_owned(), cn.to_owned())) + }) + .transpose()?; + + let project = match (project_a, project_b) { + // Invariant: if we have both project name variants, they should match. + (Some(a), Some(b)) if a != b => Some(Err(InconsistentProjectNames(a, b))), + (a, b) => a.or(b).map(|name| { + // Invariant: project name may not contain certain characters. + check_project_name(name).map_err(MalformedProjectName) + }), + } + .transpose()?; Ok(Self { user, dbname, - project_name, + project, }) } +} - /// Use credentials to authenticate the user. - pub async fn authenticate( - self, - config: &ProxyConfig, - client: &mut PqStream, - ) -> super::Result { - // This method is just a convenient facade for `handle_user` - super::backend::handle_user(config, client, self).await +fn check_project_name(name: String) -> Result { + if name.chars().all(|c| c.is_alphanumeric() || c == '-') { + Ok(name) + } else { + Err(name) } } -/// Inferring project name from sni_data. 
-fn project_name_from_sni_data( - sni_data: &str, - common_name: &str, -) -> Result { - let common_name_with_dot = format!(".{common_name}"); - // check that ".{common_name_with_dot}" is the actual suffix in sni_data - if !sni_data.ends_with(&common_name_with_dot) { - return Err(ClientCredsParseError::InconsistentCommonNameAndSNI( - common_name.to_string(), - sni_data.to_string(), +fn subdomain_from_sni(sni: &str, common_name: &str) -> Option { + sni.strip_suffix(common_name)? + .strip_suffix('.') + .map(str::to_owned) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_options<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> StartupMessageParams { + StartupMessageParams::from(pairs.map(|(k, v)| (k.to_owned(), v.to_owned()))) + } + + #[test] + #[ignore = "TODO: fix how database is handled"] + fn parse_bare_minimum() -> anyhow::Result<()> { + // According to postgresql, only `user` should be required. + let options = make_options([("user", "john_doe")]); + + // TODO: check that `creds.dbname` is None. 
+ let creds = ClientCredentials::parse(options, None, None)?; + assert_eq!(creds.user, "john_doe"); + + Ok(()) + } + + #[test] + fn parse_missing_project() -> anyhow::Result<()> { + let options = make_options([("user", "john_doe"), ("database", "world")]); + + let creds = ClientCredentials::parse(options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project, None); + + Ok(()) + } + + #[test] + fn parse_project_from_sni() -> anyhow::Result<()> { + let options = make_options([("user", "john_doe"), ("database", "world")]); + + let sni = Some("foo.localhost"); + let common_name = Some("localhost"); + + let creds = ClientCredentials::parse(options, sni, common_name)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project.as_deref(), Some("foo")); + + Ok(()) + } + + #[test] + fn parse_project_from_options() -> anyhow::Result<()> { + let options = make_options([ + ("user", "john_doe"), + ("database", "world"), + ("project", "bar"), + ]); + + let creds = ClientCredentials::parse(options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project.as_deref(), Some("bar")); + + Ok(()) + } + + #[test] + fn parse_projects_identical() -> anyhow::Result<()> { + let options = make_options([ + ("user", "john_doe"), + ("database", "world"), + ("project", "baz"), + ]); + + let sni = Some("baz.localhost"); + let common_name = Some("localhost"); + + let creds = ClientCredentials::parse(options, sni, common_name)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project.as_deref(), Some("baz")); + + Ok(()) + } + + #[test] + fn parse_projects_different() { + let options = make_options([ + ("user", "john_doe"), + ("database", "world"), + ("project", "first"), + ]); + + let sni = Some("second.localhost"); + let common_name = Some("localhost"); + + assert!(matches!( + 
ClientCredentials::parse(options, sni, common_name).expect_err("should fail"), + ClientCredsParseError::InconsistentProjectNames(_, _) )); } - // return sni_data without the common name suffix. - Ok(sni_data - .strip_suffix(&common_name_with_dot) - .unwrap() - .to_string()) -} - -#[cfg(test)] -mod tests_for_project_name_from_sni_data { - use super::*; - - #[test] - fn passing() { - let target_project_name = "my-project-123"; - let common_name = "localtest.me"; - let sni_data = format!("{target_project_name}.{common_name}"); - assert_eq!( - project_name_from_sni_data(&sni_data, common_name), - Ok(target_project_name.to_string()) - ); - } - - #[test] - fn throws_inconsistent_common_name_and_sni_data() { - let target_project_name = "my-project-123"; - let common_name = "localtest.me"; - let wrong_suffix = "wrongtest.me"; - assert_eq!(common_name.len(), wrong_suffix.len()); - let wrong_common_name = format!("wrong{wrong_suffix}"); - let sni_data = format!("{target_project_name}.{wrong_common_name}"); - assert_eq!( - project_name_from_sni_data(&sni_data, common_name), - Err(ClientCredsParseError::InconsistentCommonNameAndSNI( - common_name.to_string(), - sni_data - )) - ); - } -} - -/// Determine project name from SNI or from project_name parameter from options argument. -fn get_project_name( - sni_data: Option<&str>, - common_name: Option<&str>, - project_name: Option<&str>, -) -> Result { - // determine the project name from sni_data if it exists, otherwise from project_name. 
- let ret = match sni_data { - Some(sni_data) => { - let common_name = common_name.ok_or(ClientCredsParseError::CommonNameNotSet)?; - let project_name_from_sni = project_name_from_sni_data(sni_data, common_name)?; - // check invariant: project name from options and from sni should match - if let Some(project_name) = &project_name { - if !project_name_from_sni.eq(project_name) { - return Err(ClientCredsParseError::InconsistentProjectNameAndSNI( - project_name_from_sni, - project_name.to_string(), - )); - } - } - project_name_from_sni - } - None => project_name - .ok_or(ClientCredsParseError::MissingSNIAndProjectName)? - .to_string(), - }; - - // check formatting invariant: project name must contain only alphanumeric characters and hyphens. - if !ret.chars().all(|x: char| x.is_alphanumeric() || x == '-') { - return Err(ClientCredsParseError::ProjectNameContainsIllegalChars(ret)); - } - - Ok(ret) -} - -#[cfg(test)] -mod tests_for_project_name_only { - use super::*; - - #[test] - fn passing_from_sni_data_only() { - let target_project_name = "my-project-123"; - let common_name = "localtest.me"; - let sni_data = format!("{target_project_name}.{common_name}"); - assert_eq!( - get_project_name(Some(&sni_data), Some(common_name), None), - Ok(target_project_name.to_string()) - ); - } - - #[test] - fn throws_project_name_contains_illegal_chars_from_sni_data_only() { - let project_name_prefix = "my-project"; - let project_name_suffix = "123"; - let common_name = "localtest.me"; - - for illegal_char_id in 0..256 { - let illegal_char = char::from_u32(illegal_char_id).unwrap(); - if !(illegal_char.is_alphanumeric() || illegal_char == '-') - && illegal_char.to_string().len() == 1 - { - let target_project_name = - format!("{project_name_prefix}{illegal_char}{project_name_suffix}"); - let sni_data = format!("{target_project_name}.{common_name}"); - assert_eq!( - get_project_name(Some(&sni_data), Some(common_name), None), - Err(ClientCredsParseError::ProjectNameContainsIllegalChars( 
- target_project_name - )) - ); - } - } - } - - #[test] - fn passing_from_project_name_only() { - let target_project_name = "my-project-123"; - let common_names = [Some("localtest.me"), None]; - for common_name in common_names { - assert_eq!( - get_project_name(None, common_name, Some(target_project_name)), - Ok(target_project_name.to_string()) - ); - } - } - - #[test] - fn throws_project_name_contains_illegal_chars_from_project_name_only() { - let project_name_prefix = "my-project"; - let project_name_suffix = "123"; - let common_names = [Some("localtest.me"), None]; - - for common_name in common_names { - for illegal_char_id in 0..256 { - let illegal_char: char = char::from_u32(illegal_char_id).unwrap(); - if !(illegal_char.is_alphanumeric() || illegal_char == '-') - && illegal_char.to_string().len() == 1 - { - let target_project_name = - format!("{project_name_prefix}{illegal_char}{project_name_suffix}"); - assert_eq!( - get_project_name(None, common_name, Some(&target_project_name)), - Err(ClientCredsParseError::ProjectNameContainsIllegalChars( - target_project_name - )) - ); - } - } - } - } - - #[test] - fn passing_from_sni_data_and_project_name() { - let target_project_name = "my-project-123"; - let common_name = "localtest.me"; - let sni_data = format!("{target_project_name}.{common_name}"); - assert_eq!( - get_project_name( - Some(&sni_data), - Some(common_name), - Some(target_project_name) - ), - Ok(target_project_name.to_string()) - ); - } - - #[test] - fn throws_inconsistent_project_name_and_sni() { - let project_name_param = "my-project-123"; - let wrong_project_name = "not-my-project-123"; - let common_name = "localtest.me"; - let sni_data = format!("{wrong_project_name}.{common_name}"); - assert_eq!( - get_project_name(Some(&sni_data), Some(common_name), Some(project_name_param)), - Err(ClientCredsParseError::InconsistentProjectNameAndSNI( - wrong_project_name.to_string(), - project_name_param.to_string() - )) - ); - } - - #[test] - fn 
throws_common_name_not_set() { - let target_project_name = "my-project-123"; - let wrong_project_name = "not-my-project-123"; - let common_name = "localtest.me"; - let sni_datas = [ - Some(format!("{wrong_project_name}.{common_name}")), - Some(format!("{target_project_name}.{common_name}")), - ]; - let project_names = [None, Some(target_project_name)]; - for sni_data in sni_datas { - for project_name_param in project_names { - assert_eq!( - get_project_name(sni_data.as_deref(), None, project_name_param), - Err(ClientCredsParseError::CommonNameNotSet) - ); - } - } - } - - #[test] - fn throws_inconsistent_common_name_and_sni_data() { - let target_project_name = "my-project-123"; - let wrong_project_name = "not-my-project-123"; - let common_name = "localtest.me"; - let wrong_suffix = "wrongtest.me"; - assert_eq!(common_name.len(), wrong_suffix.len()); - let wrong_common_name = format!("wrong{wrong_suffix}"); - let sni_datas = [ - Some(format!("{wrong_project_name}.{wrong_common_name}")), - Some(format!("{target_project_name}.{wrong_common_name}")), - ]; - let project_names = [None, Some(target_project_name)]; - for project_name_param in project_names { - for sni_data in &sni_datas { - assert_eq!( - get_project_name(sni_data.as_deref(), Some(common_name), project_name_param), - Err(ClientCredsParseError::InconsistentCommonNameAndSNI( - common_name.to_string(), - sni_data.clone().unwrap().to_string() - )) - ); - } - } - } } diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 7efff13bfc..705f1e3807 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,8 +1,7 @@ //! Main authentication flow. 
-use super::AuthErrorImpl; -use crate::stream::PqStream; -use crate::{sasl, scram}; +use super::{AuthErrorImpl, PasswordHackPayload}; +use crate::{sasl, scram, stream::PqStream}; use std::io; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; @@ -27,6 +26,17 @@ impl AuthMethod for Scram<'_> { } } +/// Use an ad hoc auth flow (for clients which don't support SNI) proposed in +/// . +pub struct PasswordHack; + +impl AuthMethod for PasswordHack { + #[inline(always)] + fn first_message(&self) -> BeMessage<'_> { + Be::AuthenticationCleartextPassword + } +} + /// This wrapper for [`PqStream`] performs client authentication. #[must_use] pub struct AuthFlow<'a, Stream, State> { @@ -57,13 +67,34 @@ impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { } } +impl AuthFlow<'_, S, PasswordHack> { + /// Perform user authentication. Raise an error in case authentication failed. + pub async fn authenticate(self) -> super::Result { + let msg = self.stream.read_password_message().await?; + let password = msg + .strip_suffix(&[0]) + .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + + // The so-called "password" should contain a base64-encoded json. + // We will use it later to route the client to their project. + let bytes = base64::decode(password) + .map_err(|_| AuthErrorImpl::MalformedPassword("bad encoding"))?; + + let payload = serde_json::from_slice(&bytes) + .map_err(|_| AuthErrorImpl::MalformedPassword("invalid payload"))?; + + Ok(payload) + } +} + /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. pub async fn authenticate(self) -> super::Result { // Initial client message contains the chosen auth method's name. 
let msg = self.stream.read_password_message().await?; - let sasl = sasl::FirstMessage::parse(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; + let sasl = sasl::FirstMessage::parse(&msg) + .ok_or(AuthErrorImpl::MalformedPassword("bad sasl message"))?; // Currently, the only supported SASL method is SCRAM. if !scram::METHODS.contains(&sasl.method) { diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs new file mode 100644 index 0000000000..6a1258ab31 --- /dev/null +++ b/proxy/src/auth/password_hack.rs @@ -0,0 +1,102 @@ +//! Payload for ad hoc authentication method for clients that don't support SNI. +//! See the `impl` for [`super::backend::BackendType`]. +//! Read more: . + +use serde::{de, Deserialize, Deserializer}; +use std::fmt; + +#[derive(Deserialize)] +#[serde(untagged)] +pub enum Password { + /// A regular string for utf-8 encoded passwords. + Simple { password: String }, + + /// Password is base64-encoded because it may contain arbitrary byte sequences. + Encoded { + #[serde(rename = "password_", deserialize_with = "deserialize_base64")] + password: Vec, + }, +} + +impl AsRef<[u8]> for Password { + fn as_ref(&self) -> &[u8] { + match self { + Password::Simple { password } => password.as_ref(), + Password::Encoded { password } => password.as_ref(), + } + } +} + +#[derive(Deserialize)] +pub struct PasswordHackPayload { + pub project: String, + + #[serde(flatten)] + pub password: Password, +} + +fn deserialize_base64<'a, D: Deserializer<'a>>(des: D) -> Result, D::Error> { + // It's very tempting to replace this with + // + // ``` + // let base64: &str = Deserialize::deserialize(des)?; + // base64::decode(base64).map_err(serde::de::Error::custom) + // ``` + // + // Unfortunately, we can't always deserialize into `&str`, so we'd + // have to use an allocating `String` instead. Thus, visitor is better. 
+ struct Visitor; + + impl<'de> de::Visitor<'de> for Visitor { + type Value = Vec; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("a string") + } + + fn visit_str(self, v: &str) -> Result { + base64::decode(v).map_err(de::Error::custom) + } + } + + des.deserialize_str(Visitor) +} + +#[cfg(test)] +mod tests { + use super::*; + use rstest::rstest; + use serde_json::json; + + #[test] + fn parse_password() -> anyhow::Result<()> { + let password: Password = serde_json::from_value(json!({ + "password": "foo", + }))?; + assert_eq!(password.as_ref(), "foo".as_bytes()); + + let password: Password = serde_json::from_value(json!({ + "password_": base64::encode("foo"), + }))?; + assert_eq!(password.as_ref(), "foo".as_bytes()); + + Ok(()) + } + + #[rstest] + #[case("password", str::to_owned)] + #[case("password_", base64::encode)] + fn parse(#[case] key: &str, #[case] encode: fn(&'static str) -> String) -> anyhow::Result<()> { + let (password, project) = ("password", "pie-in-the-sky"); + let payload = json!({ + "project": project, + key: encode(password), + }); + + let payload: PasswordHackPayload = serde_json::from_value(payload)?; + assert_eq!(payload.password.as_ref(), password.as_bytes()); + assert_eq!(payload.project, project); + + Ok(()) + } +} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index cccd6e60d4..896ef3588d 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,8 +1,6 @@ -use crate::auth::DatabaseInfo; -use crate::cancellation::CancelClosure; -use crate::error::UserFacingError; -use std::io; -use std::net::SocketAddr; +use crate::{cancellation::CancelClosure, error::UserFacingError}; +use futures::TryFutureExt; +use std::{io, net::SocketAddr}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::NoTls; @@ -21,44 +19,96 @@ pub enum ConnectionError { FailedToFetchPgVersion, } -impl UserFacingError for ConnectionError {} - -/// PostgreSQL version as [`String`]. 
-pub type Version = String; +impl UserFacingError for ConnectionError { + fn to_string_client(&self) -> String { + use ConnectionError::*; + match self { + // This helps us drop irrelevant library-specific prefixes. + // TODO: propagate severity level and other parameters. + Postgres(err) => match err.as_db_error() { + Some(err) => err.message().to_string(), + None => err.to_string(), + }, + other => other.to_string(), + } + } +} /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; -/// Compute node connection params. +pub type ComputeConnCfg = tokio_postgres::Config; + +/// Various compute node info for establishing connection etc. pub struct NodeInfo { - pub db_info: DatabaseInfo, - pub scram_keys: Option, + /// Did we send [`utils::pq_proto::BeMessage::AuthenticationOk`]? + pub reported_auth_ok: bool, + /// Compute node connection params. + pub config: tokio_postgres::Config, } impl NodeInfo { async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { - let host_port = (self.db_info.host.as_str(), self.db_info.port); - let socket = TcpStream::connect(host_port).await?; - let socket_addr = socket.peer_addr()?; - socket2::SockRef::from(&socket).set_keepalive(true)?; + use tokio_postgres::config::Host; - Ok((socket_addr, socket)) + let connect_once = |host, port| { + TcpStream::connect((host, port)).and_then(|socket| async { + let socket_addr = socket.peer_addr()?; + // This prevents load balancer from severing the connection. + socket2::SockRef::from(&socket).set_keepalive(true)?; + Ok((socket_addr, socket)) + }) + }; + + // We can't reuse connection establishing logic from `tokio_postgres` here, + // because it has no means for extracting the underlying socket which we + // require for our business. 
+ let mut connection_error = None; + let ports = self.config.get_ports(); + for (i, host) in self.config.get_hosts().iter().enumerate() { + let port = ports.get(i).or_else(|| ports.get(0)).unwrap_or(&5432); + let host = match host { + Host::Tcp(host) => host.as_str(), + Host::Unix(_) => continue, // unix sockets are not welcome here + }; + + // TODO: maybe we should add a timeout. + match connect_once(host, *port).await { + Ok(socket) => return Ok(socket), + Err(err) => { + // We can't throw an error here, as there might be more hosts to try. + println!("failed to connect to compute `{host}:{port}`: {err}"); + connection_error = Some(err); + } + } + } + + Err(connection_error.unwrap_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + format!("couldn't connect: bad compute config: {:?}", self.config), + ) + })) } +} +pub struct PostgresConnection { + /// Socket connected to a compute node. + pub stream: TcpStream, + /// PostgreSQL version of this instance. + pub version: String, +} + +impl NodeInfo { /// Connect to a corresponding compute node. - pub async fn connect(self) -> Result<(TcpStream, Version, CancelClosure), ConnectionError> { - let (socket_addr, mut socket) = self + pub async fn connect(&self) -> Result<(PostgresConnection, CancelClosure), ConnectionError> { + let (socket_addr, mut stream) = self .connect_raw() .await .map_err(|_| ConnectionError::FailedToConnectToCompute)?; - let mut config = tokio_postgres::Config::from(self.db_info); - if let Some(scram_keys) = self.scram_keys { - config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(scram_keys)); - } - // TODO: establish a secure connection to the DB - let (client, conn) = config.connect_raw(&mut socket, NoTls).await?; + let (client, conn) = self.config.connect_raw(&mut stream, NoTls).await?; let version = conn .parameter("server_version") .ok_or(ConnectionError::FailedToFetchPgVersion)? 
@@ -66,6 +116,8 @@ impl NodeInfo { let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); - Ok((socket, version, cancel_closure)) + let db = PostgresConnection { stream, version }; + + Ok((db, cancel_closure)) } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index df3923de1a..1f01c25734 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,28 +1,16 @@ -use crate::url::ApiUrl; +use crate::{auth, url::ApiUrl}; use anyhow::{bail, ensure, Context}; use std::{str::FromStr, sync::Arc}; -#[derive(Debug)] -pub enum AuthBackendType { - /// Legacy Cloud API (V1). - LegacyConsole, - /// Authentication via a web browser. - Link, - /// Current Cloud API (V2). - Console, - /// Local mock of Cloud API (V2). - Postgres, -} - -impl FromStr for AuthBackendType { +impl FromStr for auth::BackendType<()> { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { - use AuthBackendType::*; + use auth::BackendType::*; Ok(match s { - "legacy" => LegacyConsole, - "console" => Console, - "postgres" => Postgres, + "legacy" => LegacyConsole(()), + "console" => Console(()), + "postgres" => Postgres(()), "link" => Link, _ => bail!("Invalid option `{s}` for auth method"), }) @@ -31,7 +19,11 @@ impl FromStr for AuthBackendType { pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: AuthBackendType, + pub auth_backend: auth::BackendType<()>, + pub auth_urls: AuthUrls, +} + +pub struct AuthUrls { pub auth_endpoint: ApiUrl, pub auth_link_uri: ApiUrl, } @@ -87,10 +79,8 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result>) -> io::Error { + io::Error::new(io::ErrorKind::Other, e) +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index b68b2440dd..2521f2af21 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -118,11 +118,15 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; let http_address: SocketAddr = 
arg_matches.value_of("http").unwrap().parse()?; + let auth_urls = config::AuthUrls { + auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, + auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, + }; + let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend: arg_matches.value_of("auth-backend").unwrap().parse()?, - auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, - auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, + auth_urls, })); println!("Version: {GIT_VERSION}"); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 7e364b5e9c..f202782109 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -82,11 +82,22 @@ async fn handle_client( } let tls = config.tls_config.as_ref(); - let (stream, creds) = match handshake(stream, tls, cancel_map).await? { + let (mut stream, params) = match handshake(stream, tls, cancel_map).await? { Some(x) => x, None => return Ok(()), // it's a cancellation request }; + let creds = { + let sni = stream.get_ref().sni_hostname(); + let common_name = tls.and_then(|tls| tls.common_name.as_deref()); + let result = config + .auth_backend + .map(|_| auth::ClientCredentials::parse(params, sni, common_name)) + .transpose(); + + async { result }.or_else(|e| stream.throw_error(e)).await? 
+ }; + let client = Client::new(stream, creds); cancel_map .with_session(|session| client.connect_to_db(config, session)) @@ -101,12 +112,10 @@ async fn handshake( stream: S, mut tls: Option<&TlsConfig>, cancel_map: &CancelMap, -) -> anyhow::Result>, auth::ClientCredentials)>> { +) -> anyhow::Result>, StartupMessageParams)>> { // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); - let common_name = tls.and_then(|cfg| cfg.common_name.as_deref()); - let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; @@ -147,18 +156,7 @@ async fn handshake( stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; } - // Get SNI info when available - let sni_data = match stream.get_ref() { - Stream::Tls { tls } => tls.get_ref().1.sni_hostname().map(|s| s.to_owned()), - _ => None, - }; - - // Construct credentials - let creds = - auth::ClientCredentials::parse(params, sni_data.as_deref(), common_name); - let creds = async { creds }.or_else(|e| stream.throw_error(e)).await?; - - break Ok(Some((stream, creds))); + break Ok(Some((stream, params))); } CancelRequest(cancel_key_data) => { cancel_map.cancel_session(cancel_key_data).await?; @@ -174,12 +172,12 @@ struct Client { /// The underlying libpq protocol stream. stream: PqStream, /// Client credentials that we care about. - creds: auth::ClientCredentials, + creds: auth::BackendType, } impl Client { /// Construct a new connection context. - fn new(stream: PqStream, creds: auth::ClientCredentials) -> Self { + fn new(stream: PqStream, creds: auth::BackendType) -> Self { Self { stream, creds } } } @@ -194,16 +192,22 @@ impl Client { let Self { mut stream, creds } = self; // Authenticate and connect to a compute node. 
- let auth = creds.authenticate(config, &mut stream).await; + let auth = creds.authenticate(&config.auth_urls, &mut stream).await; let node = async { auth }.or_else(|e| stream.throw_error(e)).await?; - let (db, version, cancel_closure) = - node.connect().or_else(|e| stream.throw_error(e)).await?; + let (db, cancel_closure) = node.connect().or_else(|e| stream.throw_error(e)).await?; let cancel_key_data = session.enable_cancellation(cancel_closure); + // Report authentication success if we haven't done this already. + if !node.reported_auth_ok { + stream + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())?; + } + stream .write_message_noflush(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion(&version), + BeParameterStatusMessage::ServerVersion(&db.version), ))? .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? .write_message(&BeMessage::ReadyForQuery) @@ -217,7 +221,7 @@ impl Client { } // Starting from here we only proxy the client's traffic. - let mut db = MetricsStream::new(db, inc_proxied); + let mut db = MetricsStream::new(db.stream, inc_proxied); let mut client = MetricsStream::new(stream.into_inner(), inc_proxied); let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; @@ -279,9 +283,13 @@ mod tests { let config = rustls::ServerConfig::builder() .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![cert], key)?; + .with_single_cert(vec![cert], key)? 
+ .into(); - config.into() + TlsConfig { + config, + common_name: Some(common_name.to_string()), + } }; let client_config = { @@ -297,11 +305,6 @@ mod tests { ClientConfig { config, hostname } }; - let tls_config = TlsConfig { - config: tls_config, - common_name: Some(common_name.to_string()), - }; - Ok((client_config, tls_config)) } @@ -357,7 +360,7 @@ mod tests { auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let cancel_map = CancelMap::default(); - let (mut stream, _creds) = handshake(client, tls.as_ref(), &cancel_map) + let (mut stream, _params) = handshake(client, tls.as_ref(), &cancel_map) .await? .context("handshake failed")?; @@ -436,32 +439,6 @@ mod tests { proxy.await? } - #[tokio::test] - async fn give_user_an_error_for_bad_creds() -> anyhow::Result<()> { - let (client, server) = tokio::io::duplex(1024); - - let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); - - let client_err = tokio_postgres::Config::new() - .ssl_mode(SslMode::Disable) - .connect_raw(server, NoTls) - .await - .err() // -> Option - .context("client shouldn't be able to connect")?; - - // TODO: this is ugly, but `format!` won't allow us to extract fmt string - assert!(client_err.to_string().contains("missing in startup packet")); - - let server_err = proxy - .await? - .err() // -> Option - .context("server shouldn't accept client")?; - - assert!(client_err.to_string().contains(&server_err.to_string())); - - Ok(()) - } - #[tokio::test] async fn keepalive_is_inherited() -> anyhow::Result<()> { use tokio::net::{TcpListener, TcpStream}; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 42b0185fde..54ff8bcc07 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -145,6 +145,14 @@ impl Stream { pub fn from_raw(raw: S) -> Self { Self::Raw { raw } } + + /// Return SNI hostname when it's available. + pub fn sni_hostname(&self) -> Option<&str> { + match self { + Stream::Raw { .. 
} => None, + Stream::Tls { tls } => tls.get_ref().1.sni_hostname(), + } + } } #[derive(Debug, Error)] diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 373108c61b..f6ae9e75d7 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -20,7 +20,6 @@ postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8 anyhow = "1.0" crc32c = "0.6.0" humantime = "2.1.0" -walkdir = "2" url = "2.2.2" signal-hook = "0.3.10" serde = { version = "1.0", features = ["derive"] } @@ -28,11 +27,9 @@ serde_with = "1.12.0" hex = "0.4.3" const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -tokio-util = { version = "0.7", features = ["io"] } git-version = "0.3.5" async-trait = "0.1" once_cell = "1.10.0" -futures = "0.3.13" toml_edit = { version = "0.13", features = ["easy"] } postgres_ffi = { path = "../libs/postgres_ffi" } diff --git a/safekeeper/src/http/models.rs b/safekeeper/src/http/models.rs index 77efc0cc21..4b3ae7798e 100644 --- a/safekeeper/src/http/models.rs +++ b/safekeeper/src/http/models.rs @@ -1,9 +1,8 @@ use serde::{Deserialize, Serialize}; -use utils::zid::{NodeId, ZTenantId, ZTimelineId}; +use utils::zid::{NodeId, ZTimelineId}; #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { - pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, pub peer_ids: Vec, } diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml new file mode 100644 index 0000000000..da225f244b --- /dev/null +++ b/safekeeper/src/http/openapi_spec.yaml @@ -0,0 +1,365 @@ +openapi: "3.0.2" +info: + title: Safekeeper control API + version: "1.0" + + +servers: + - url: "http://localhost:7676" + + +paths: + /v1/status: + get: + tags: + - "Info" + summary: Get safekeeper status + description: "" + operationId: v1GetSafekeeperStatus + responses: + "200": + description: Safekeeper status + content: + application/json: + schema: + 
$ref: "#/components/schemas/SafekeeperStatus" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/tenant/{tenant_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + + delete: + tags: + - "Tenant" + summary: Delete tenant and all its timelines + description: "Deletes tenant and returns a map of timelines that were deleted along with their statuses" + operationId: v1DeleteTenant + responses: + "200": + description: Tenant deleted + content: + application/json: + schema: + $ref: "#/components/schemas/TenantDeleteResult" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/tenant/{tenant_id}/timeline: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + + post: + tags: + - "Timeline" + summary: Register new timeline + description: "" + operationId: v1CreateTenantTimeline + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineCreateRequest" + responses: + "201": + description: Timeline created + # TODO: return timeline info? 
+ "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/tenant/{tenant_id}/timeline/{timeline_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + + get: + tags: + - "Timeline" + summary: Get timeline information and status + description: "" + operationId: v1GetTenantTimeline + responses: + "200": + description: Timeline status + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineStatus" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + delete: + tags: + - "Timeline" + summary: Delete timeline + description: "" + operationId: v1DeleteTenantTimeline + responses: + "200": + description: Timeline deleted + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineDeleteResult" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/record_safekeeper_info/{tenant_id}/{timeline_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + + post: + tags: + - "Tests" + summary: Used only in tests to hand craft required data + description: "" + operationId: v1RecordSafekeeperInfo + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/SkTimelineInfo" + responses: + "200": + description: Timeline info posted + # TODO: return timeline info? 
+ "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + +components: + securitySchemes: + JWT: + type: http + scheme: bearer + bearerFormat: JWT + + + schemas: + + # + # Requests + # + + TimelineCreateRequest: + type: object + required: + - timeline_id + - peer_ids + properties: + timeline_id: + type: string + format: hex + peer_ids: + type: array + items: + type: integer + minimum: 0 + + SkTimelineInfo: + type: object + required: + - last_log_term + - flush_lsn + - commit_lsn + - backup_lsn + - remote_consistent_lsn + - peer_horizon_lsn + - safekeeper_connstr + properties: + last_log_term: + type: integer + minimum: 0 + flush_lsn: + type: string + commit_lsn: + type: string + backup_lsn: + type: string + remote_consistent_lsn: + type: string + peer_horizon_lsn: + type: string + safekeeper_connstr: + type: string + + # + # Responses + # + + SafekeeperStatus: + type: object + required: + - id + properties: + id: + type: integer + minimum: 0 # kind of unsigned integer + + TimelineStatus: + type: object + required: + - timeline_id + - tenant_id + properties: + timeline_id: + type: string + format: hex + tenant_id: + type: string + format: hex + acceptor_state: + $ref: '#/components/schemas/AcceptorStateStatus' + flush_lsn: + type: string + timeline_start_lsn: + type: string + local_start_lsn: + type: string + commit_lsn: + type: string + backup_lsn: + type: string + peer_horizon_lsn: + type: string + remote_consistent_lsn: + type: string + + AcceptorStateStatus: + type: object + required: + - term + - epoch + properties: + term: + type: integer + minimum: 0 # kind of unsigned integer + epoch: + type: integer + minimum: 0 # kind of unsigned integer + term_history: + type: array + items: + $ref: '#/components/schemas/TermSwitchEntry' + + TermSwitchEntry: + type: object + required: + - term + - lsn + properties: + term: + type: integer + minimum: 0 # kind of unsigned integer + lsn: + type: string + + 
TimelineDeleteResult: + type: object + required: + - dir_existed + - was_active + properties: + dir_existed: + type: boolean + was_active: + type: boolean + + TenantDeleteResult: + type: object + additionalProperties: + $ref: "#/components/schemas/TimelineDeleteResult" + example: + 57fd1b39f23704a63423de0a8435d85c: + dir_existed: true + was_active: false + 67fd1b39f23704a63423gb8435d85c33: + dir_existed: false + was_active: false + + # + # Errors + # + + GenericErrorContent: + type: object + properties: + msg: + type: string + + responses: + + # + # Errors + # + + GenericError: + description: Generic error response + content: + application/json: + schema: + $ref: "#/components/schemas/GenericErrorContent" + + ForbiddenError: + description: Forbidden error response + content: + application/json: + schema: + type: object + required: + - msg + properties: + msg: + type: string + + +security: + - JWT: [] diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 33581c6c31..13356c5921 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -126,7 +126,7 @@ async fn timeline_create_handler(mut request: Request) -> Result str: + return base64.b64encode(s.encode('utf-8')).decode('utf-8') + + magic = encode(json.dumps({ + 'project': 'irrelevant', + 'password': password, + })) + + static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic) + + magic = encode(json.dumps({ + 'project': 'irrelevant', + 'password_': encode(password), + })) + + static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic) # Pass extra options to the server. 
@@ -11,8 +37,8 @@ def test_proxy_select_1(static_proxy): # See https://github.com/neondatabase/neon/issues/1287 @pytest.mark.xfail def test_proxy_options(static_proxy): - with static_proxy.connect(options="-cproxytest.option=value") as conn: + with static_proxy.connect(options='-cproxytest.option=value') as conn: with conn.cursor() as cur: - cur.execute("SHOW proxytest.option;") + cur.execute('SHOW proxytest.option') value = cur.fetchall()[0][0] assert value == 'value' diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 73f6f52e72..d59f28bcc5 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -26,7 +26,7 @@ from fixtures.neon_fixtures import ( wait_for_upload, wait_until, ) -from fixtures.utils import lsn_from_hex, subprocess_capture +from fixtures.utils import lsn_from_hex, lsn_to_hex, subprocess_capture def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @@ -268,6 +268,7 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, env.neon_cli.create_branch( new_branch_name="test_tenant_relocation_second", ancestor_branch_name="test_tenant_relocation_main", + ancestor_start_lsn=lsn_to_hex(current_lsn_main), tenant_id=tenant_id, ) pg_second = env.postgres.create_start(branch_name='test_tenant_relocation_second', diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 7b7b16bcbf..c3788a0e9b 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -1,10 +1,15 @@ from contextlib import closing +import pathlib +from uuid import UUID +import re import psycopg2.extras import psycopg2.errors from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local from fixtures.log_helper import log import time +from fixtures.utils import get_timeline_dir_size + def 
test_timeline_size(neon_simple_env: NeonEnv): env = neon_simple_env @@ -176,3 +181,129 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): cur.execute("SELECT * from pg_size_pretty(pg_cluster_size())") pg_cluster_size = cur.fetchone() log.info(f"pg_cluster_size = {pg_cluster_size}") + + +def test_timeline_physical_size_init(neon_simple_env: NeonEnv): + env = neon_simple_env + new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_init') + pg = env.postgres.create_start("test_timeline_physical_size_init") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 1000) g""", + ]) + + # restart the pageserer to force calculating timeline's initial physical size + env.pageserver.stop() + env.pageserver.start() + + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): + env = neon_simple_env + new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_post_checkpoint') + pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 1000) g""", + ]) + + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): + # Disable background compaction as we don't want it to happen after `get_physical_size` request + # and before checking the expected size on disk, which makes the assertion failed + neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='10m'}" + + env = neon_env_builder.init_start() + + new_timeline_id = 
env.neon_cli.create_branch('test_timeline_physical_size_post_compaction') + pg = env.postgres.create_start("test_timeline_physical_size_post_compaction") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ]) + + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + env.pageserver.safe_psql(f"compact {env.initial_tenant.hex} {new_timeline_id.hex}") + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): + # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request + # and before checking the expected size on disk, which makes the assertion failed + neon_env_builder.pageserver_config_override = \ + "tenant_config={checkpoint_distance=100000, compaction_period='10m', gc_period='10m', pitr_interval='1s'}" + + env = neon_env_builder.init_start() + + new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_post_gc') + pg = env.postgres.create_start("test_timeline_physical_size_post_gc") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ]) + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + pg.safe_psql(""" + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """) + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + + env.pageserver.safe_psql(f"do_gc {env.initial_tenant.hex} {new_timeline_id.hex} 0") + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_metric(neon_simple_env: NeonEnv): + env = neon_simple_env + + new_timeline_id = 
env.neon_cli.create_branch('test_timeline_physical_size_metric') + pg = env.postgres.create_start("test_timeline_physical_size_metric") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ]) + + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + + # get the metrics and parse the metric for the current timeline's physical size + metrics = env.pageserver.http_client().get_metrics() + matches = re.search( + f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$', + metrics, + re.MULTILINE) + assert matches + + # assert that the metric matches the actual physical size on disk + tl_physical_size_metric = int(matches.group(1)) + timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id) + assert tl_physical_size_metric == get_timeline_dir_size(timeline_path) + + +def assert_physical_size(env: NeonEnv, tenant_id: UUID, timeline_id: UUID): + """Check the current physical size returned from timeline API + matches the total physical size of the timeline on disk""" + client = env.pageserver.http_client() + res = assert_timeline_local(client, tenant_id, timeline_id) + timeline_path = env.timeline_dir(tenant_id, timeline_id) + assert res["local"]["current_physical_size"] == res["local"][ + "current_physical_size_non_incremental"] + assert res["local"]["current_physical_size"] == get_timeline_dir_size(timeline_path) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 9b876f780d..5014a7ad4e 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -203,61 +203,6 @@ def test_restarts(neon_env_builder: NeonEnvBuilder): assert cur.fetchone() == (500500, ) -start_delay_sec = 2 - - -def delayed_safekeeper_start(wa): - 
time.sleep(start_delay_sec) - wa.start() - - -# When majority of acceptors is offline, commits are expected to be frozen -def test_unavailability(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 2 - env = neon_env_builder.init_start() - - env.neon_cli.create_branch('test_safekeepers_unavailability') - pg = env.postgres.create_start('test_safekeepers_unavailability') - - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - pg_conn = pg.connect() - cur = pg_conn.cursor() - - # check basic work with table - cur.execute('CREATE TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t values (1, 'payload')") - - # shutdown one of two acceptors, that is, majority - env.safekeepers[0].stop() - - proc = Process(target=delayed_safekeeper_start, args=(env.safekeepers[0], )) - proc.start() - - start = time.time() - cur.execute("INSERT INTO t values (2, 'payload')") - # ensure that the query above was hanging while acceptor was down - assert (time.time() - start) >= start_delay_sec - proc.join() - - # for the world's balance, do the same with second acceptor - env.safekeepers[1].stop() - - proc = Process(target=delayed_safekeeper_start, args=(env.safekeepers[1], )) - proc.start() - - start = time.time() - cur.execute("INSERT INTO t values (3, 'payload')") - # ensure that the query above was hanging while acceptor was down - assert (time.time() - start) >= start_delay_sec - proc.join() - - cur.execute("INSERT INTO t values (4, 'payload')") - - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (10, ) - - # shut down random subset of acceptors, sleep, wake them up, rinse, repeat def xmas_garland(acceptors, stop): while not bool(stop.value): diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index d74ef8840a..bf7d8e3645 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ 
b/test_runner/batch_others/test_wal_acceptor_async.py @@ -146,9 +146,8 @@ async def run_restarts_under_load(env: NeonEnv, max_transfer=100, period_time=4, iterations=10): - # Set timeout for this test at 5 minutes. It should be enough for test to complete - # and less than CircleCI's no_output_timeout, taking into account that this timeout - # is checked only at the beginning of every iteration. + # Set timeout for this test at 5 minutes. It should be enough for test to complete, + # taking into account that this timeout is checked only at the beginning of every iteration. test_timeout_at = time.monotonic() + 5 * 60 pg_conn = await pg.connect_async() @@ -404,3 +403,55 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder): env.neon_cli.create_branch('test_concurrent_computes') asyncio.run(run_concurrent_computes(env)) + + +# Stop safekeeper and check that query cannot be executed while safekeeper is down. +# Query will insert a single row into a table. +async def check_unavailability(sk: Safekeeper, + conn: asyncpg.Connection, + key: int, + start_delay_sec: int = 2): + # shutdown one of two acceptors, that is, majority + sk.stop() + + bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')")) + + await asyncio.sleep(start_delay_sec) + # ensure that the query has not been executed yet + assert not bg_query.done() + + # start safekeeper and await the query + sk.start() + await bg_query + assert bg_query.done() + + +async def run_unavailability(env: NeonEnv, pg: Postgres): + conn = await pg.connect_async() + + # check basic work with table + await conn.execute('CREATE TABLE t(key int primary key, value text)') + await conn.execute("INSERT INTO t values (1, 'payload')") + + # stop safekeeper and check that query cannot be executed while safekeeper is down + await check_unavailability(env.safekeepers[0], conn, 2) + + # for the world's balance, do the same with second safekeeper + await check_unavailability(env.safekeepers[1], 
conn, 3) + + # check that we can execute queries after restart + await conn.execute("INSERT INTO t values (4, 'payload')") + + result_sum = await conn.fetchval('SELECT sum(key) FROM t') + assert result_sum == 10 + + +# When majority of acceptors is offline, commits are expected to be frozen +def test_unavailability(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 2 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_safekeepers_unavailability') + pg = env.postgres.create_start('test_safekeepers_unavailability') + + asyncio.run(run_unavailability(env, pg)) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index c6e6289a5c..51545d0217 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,5 +1,5 @@ pytest_plugins = ("fixtures.neon_fixtures", "fixtures.benchmark_fixture", + "fixtures.pg_stats", "fixtures.compare_fixtures", - "fixtures.slow", - "fixtures.pg_stats") + "fixtures.slow") diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3a6a233208..397b932ec9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -30,7 +30,7 @@ from dataclasses import dataclass # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import make_dsn, parse_dsn -from typing import Any, Callable, Dict, Iterator, List, Optional, Type, TypeVar, cast, Union, Tuple +from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple from typing_extensions import Literal import requests @@ -280,20 +280,18 @@ class PgProtocol: return str(make_dsn(**self.conn_options(**kwargs))) def conn_options(self, **kwargs): - conn_options = self.default_options.copy() + result = self.default_options.copy() if 'dsn' in kwargs: - conn_options.update(parse_dsn(kwargs['dsn'])) - conn_options.update(kwargs) + result.update(parse_dsn(kwargs['dsn'])) + result.update(kwargs) # 
Individual statement timeout in seconds. 2 minutes should be # enough for our tests, but if you need a longer, you can # change it by calling "SET statement_timeout" after # connecting. - if 'options' in conn_options: - conn_options['options'] = f"-cstatement_timeout=120s " + conn_options['options'] - else: - conn_options['options'] = "-cstatement_timeout=120s" - return conn_options + options = result.get('options', '') + result['options'] = f'-cstatement_timeout=120s {options}' + return result # autocommit=True here by default because that's what we need most of the time def connect(self, autocommit=True, **kwargs) -> PgConnection: @@ -693,6 +691,10 @@ class NeonEnv: """ Get list of safekeeper endpoints suitable for safekeepers GUC """ return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers]) + def timeline_dir(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Path: + """Get a timeline directory's path based on the repo directory of the test environment""" + return self.repo_dir / "tenants" / tenant_id.hex / "timelines" / timeline_id.hex + @cached_property def auth_keys(self) -> AuthKeys: pub = (Path(self.repo_dir) / 'auth_public_key.pem').read_bytes() @@ -865,8 +867,8 @@ class NeonPageserverHttpClient(requests.Session): def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1" - ) + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}" + + "?include-non-incremental-logical-size=1&include-non-incremental-physical-size=1") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) @@ -1514,29 +1516,25 @@ def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: class NeonProxy(PgProtocol): - def __init__(self, port: int, pg_port: int): - super().__init__(host="127.0.0.1", - user="proxy_user", - password="pytest2", - 
port=port, - dbname='postgres') - self.http_port = 7001 - self.host = "127.0.0.1" - self.port = port - self.pg_port = pg_port + def __init__(self, proxy_port: int, http_port: int, auth_endpoint: str): + super().__init__(dsn=auth_endpoint, port=proxy_port) + self.host = '127.0.0.1' + self.http_port = http_port + self.proxy_port = proxy_port + self.auth_endpoint = auth_endpoint self._popen: Optional[subprocess.Popen[bytes]] = None def start(self) -> None: assert self._popen is None # Start proxy - bin_proxy = os.path.join(str(neon_binpath), 'proxy') - args = [bin_proxy] - args.extend(["--http", f"{self.host}:{self.http_port}"]) - args.extend(["--proxy", f"{self.host}:{self.port}"]) - args.extend(["--auth-backend", "postgres"]) - args.extend( - ["--auth-endpoint", f"postgres://proxy_auth:pytest1@localhost:{self.pg_port}/postgres"]) + args = [ + os.path.join(str(neon_binpath), 'proxy'), + *["--http", f"{self.host}:{self.http_port}"], + *["--proxy", f"{self.host}:{self.proxy_port}"], + *["--auth-backend", "postgres"], + *["--auth-endpoint", self.auth_endpoint], + ] self._popen = subprocess.Popen(args) self._wait_until_ready() @@ -1557,13 +1555,21 @@ class NeonProxy(PgProtocol): @pytest.fixture(scope='function') def static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]: """Neon proxy that routes directly to vanilla postgres.""" - vanilla_pg.start() - vanilla_pg.safe_psql("create user proxy_auth with password 'pytest1' superuser") - vanilla_pg.safe_psql("create user proxy_user with password 'pytest2'") - port = port_distributor.get_port() - pg_port = vanilla_pg.default_options['port'] - with NeonProxy(port, pg_port) as proxy: + # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql` + vanilla_pg.start() + vanilla_pg.safe_psql("create user proxy with login superuser password 'password'") + + port = vanilla_pg.default_options['port'] + host = vanilla_pg.default_options['host'] + dbname = vanilla_pg.default_options['dbname'] + 
auth_endpoint = f'postgres://proxy:password@{host}:{port}/{dbname}' + + proxy_port = port_distributor.get_port() + http_port = port_distributor.get_port() + + with NeonProxy(proxy_port=proxy_port, http_port=http_port, + auth_endpoint=auth_endpoint) as proxy: proxy.start() yield proxy @@ -1923,7 +1929,7 @@ class SafekeeperHttpClient(requests.Session): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() def timeline_status(self, tenant_id: str, timeline_id: str) -> SafekeeperTimelineStatus: - res = self.get(f"http://localhost:{self.port}/v1/timeline/{tenant_id}/{timeline_id}") + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() resj = res.json() return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'], diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index c49fa08d77..bc50a43ada 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,9 +1,11 @@ +import contextlib import os +import pathlib import shutil import subprocess from pathlib import Path -from typing import Any, List +from typing import Any, List, Tuple from fixtures.log_helper import log @@ -89,3 +91,36 @@ def get_dir_size(path: str) -> int: pass # file could be concurrently removed return totalbytes + + +def get_timeline_dir_size(path: pathlib.Path) -> int: + """Get the timeline directory's total size, which only counts the layer files' size.""" + sz = 0 + for dir_entry in path.iterdir(): + with contextlib.suppress(Exception): + # file is an image layer + _ = parse_image_layer(dir_entry.name) + sz += dir_entry.stat().st_size + continue + + with contextlib.suppress(Exception): + # file is a delta layer + _ = parse_delta_layer(dir_entry.name) + sz += dir_entry.stat().st_size + continue + return sz + + +def parse_image_layer(f_name: str) -> Tuple[int, int, int]: + """Parse an image layer file name. 
Return key start, key end, and snapshot lsn""" + parts = f_name.split("__") + key_parts = parts[0].split("-") + return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16) + + +def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: + """Parse a delta layer file name. Return key start, key end, lsn start, and lsn end""" + parts = f_name.split("__") + key_parts = parts[0].split("-") + lsn_parts = parts[1].split("-") + return int(key_parts[0], 16), int(key_parts[1], 16), int(lsn_parts[0], 16), int(lsn_parts[1], 16) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py new file mode 100644 index 0000000000..1d39b0830d --- /dev/null +++ b/test_runner/performance/test_branch_creation.py @@ -0,0 +1,110 @@ +import random +import time +import statistics +import threading +import timeit +import pytest +from typing import List +from fixtures.benchmark_fixture import MetricReport +from fixtures.compare_fixtures import NeonCompare +from fixtures.log_helper import log + + +def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]): + neon_compare.zenbenchmark.record("branch_creation_duration_max", + max(durs), + 's', + MetricReport.LOWER_IS_BETTER) + neon_compare.zenbenchmark.record("branch_creation_duration_avg", + statistics.mean(durs), + 's', + MetricReport.LOWER_IS_BETTER) + neon_compare.zenbenchmark.record("branch_creation_duration_stdev", + statistics.stdev(durs), + 's', + MetricReport.LOWER_IS_BETTER) + + +@pytest.mark.parametrize("n_branches", [20]) +# Test measures the latency of branch creation during a heavy [1] workload. +# +# [1]: to simulate a heavy workload, the test tweaks the GC and compaction settings +# to increase the task's frequency. The test runs `pgbench` in each new branch. +# Each branch is created from a randomly picked source branch. 
+def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int): + env = neon_compare.env + pg_bin = neon_compare.pg_bin + + # Use aggressive GC and checkpoint settings, so GC and compaction happen more often during the test + tenant, _ = env.neon_cli.create_tenant( + conf={ + 'gc_period': '5 s', + 'gc_horizon': f'{4 * 1024 ** 2}', + 'checkpoint_distance': f'{2 * 1024 ** 2}', + 'compaction_target_size': f'{1024 ** 2}', + 'compaction_threshold': '2', + # set PITR interval to be small, so we can do GC + 'pitr_interval': '5 s' + }) + + def run_pgbench(branch: str): + log.info(f"Start a pgbench workload on branch {branch}") + + pg = env.postgres.create_start(branch, tenant_id=tenant) + connstr = pg.connstr() + + pg_bin.run_capture(['pgbench', '-i', connstr]) + pg_bin.run_capture(['pgbench', '-c10', '-T10', connstr]) + + pg.stop() + + env.neon_cli.create_branch('b0', tenant_id=tenant) + + threads: List[threading.Thread] = [] + threads.append(threading.Thread(target=run_pgbench, args=('b0', ), daemon=True)) + threads[-1].start() + + branch_creation_durations = [] + for i in range(n_branches): + time.sleep(1.0) + + # random a source branch + p = random.randint(0, i) + + timer = timeit.default_timer() + env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(p), tenant_id=tenant) + dur = timeit.default_timer() - timer + + log.info(f"Creating branch b{i+1} took {dur}s") + branch_creation_durations.append(dur) + + threads.append(threading.Thread(target=run_pgbench, args=(f'b{i+1}', ), daemon=True)) + threads[-1].start() + + for thread in threads: + thread.join() + + _record_branch_creation_durations(neon_compare, branch_creation_durations) + + +@pytest.mark.parametrize("n_branches", [1024]) +# Test measures the latency of branch creation when creating a lot of branches. 
+def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): + env = neon_compare.env + + env.neon_cli.create_branch('b0') + + pg = env.postgres.create_start('b0') + neon_compare.pg_bin.run_capture(['pgbench', '-i', '-s10', pg.connstr()]) + + branch_creation_durations = [] + + for i in range(n_branches): + # random a source branch + p = random.randint(0, i) + timer = timeit.default_timer() + env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(p)) + dur = timeit.default_timer() - timer + branch_creation_durations.append(dur) + + _record_branch_creation_durations(neon_compare, branch_creation_durations) diff --git a/test_runner/performance/test_compare_pg_stats.py b/test_runner/performance/test_compare_pg_stats.py index a8a9e3cd4d..b9bca90231 100644 --- a/test_runner/performance/test_compare_pg_stats.py +++ b/test_runner/performance/test_compare_pg_stats.py @@ -1,4 +1,6 @@ import os +import threading +import time from typing import List import pytest @@ -87,3 +89,34 @@ def test_compare_pg_stats_wal_with_pgbench_default(neon_with_baseline: PgCompare env.pg_bin.run_capture( ['pgbench', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) env.flush() + + +@pytest.mark.parametrize("n_tables", [1, 10]) +@pytest.mark.parametrize("duration", get_durations_matrix(10)) +def test_compare_pg_stats_wo_with_heavy_write(neon_with_baseline: PgCompare, + n_tables: int, + duration: int, + pg_stats_wo: List[PgStatTable]): + env = neon_with_baseline + with env.pg.connect().cursor() as cur: + for i in range(n_tables): + cur.execute( + f"CREATE TABLE t{i}(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" + ) + + def start_single_table_workload(table_id: int): + start = time.time() + with env.pg.connect().cursor() as cur: + while time.time() - start < duration: + cur.execute(f"INSERT INTO t{table_id} SELECT FROM generate_series(1,1000)") + + with env.record_pg_stats(pg_stats_wo): + threads = [ + 
threading.Thread(target=start_single_table_workload, args=(i, )) + for i in range(n_tables) + ] + + for thread in threads: + thread.start() + for thread in threads: + thread.join() diff --git a/test_runner/performance/test_dup_key.py b/test_runner/performance/test_dup_key.py new file mode 100644 index 0000000000..ee867a9845 --- /dev/null +++ b/test_runner/performance/test_dup_key.py @@ -0,0 +1,50 @@ +import pytest +from contextlib import closing +from fixtures.compare_fixtures import PgCompare +from pytest_lazyfixture import lazy_fixture # type: ignore + + +@pytest.mark.parametrize( + "env", + [ + # The test is too slow to run in CI, but fast enough to run with remote tests + pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), + pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), + pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), + ]) +def test_dup_key(env: PgCompare): + # Update the same page many times, then measure read performance + + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute('drop table if exists t, f;') + + cur.execute("SET synchronous_commit=off") + cur.execute("SET statement_timeout=0") + + # Write many updates to the same row + with env.record_duration('write'): + cur.execute("create table t (i integer, filler text);") + cur.execute('insert into t values (0);') + cur.execute(""" +do $$ +begin + for ivar in 1..5000000 loop + update t set i = ivar, filler = repeat('a', 50); + update t set i = ivar, filler = repeat('b', 50); + update t set i = ivar, filler = repeat('c', 50); + update t set i = ivar, filler = repeat('d', 50); + rollback; + end loop; +end; +$$; +""") + + # Write 3-4 MB to evict t from compute cache + cur.execute('create table f (i integer);') + cur.execute(f'insert into f values (generate_series(1,100000));') + + # Read + with env.record_duration('read'): + cur.execute('select * from t;') + 
cur.fetchall()