From df7f6448221c6b7786b8f1f63200f79b604c067e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 19 Jul 2022 11:27:06 +0300 Subject: [PATCH 01/29] Move things around in github yml file, for clarity. Also, this avoids building the list of test binaries in release mode. They are not included in the neon.tgz tarball in release mode. --- .github/workflows/build_and_test.yml | 34 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 01920643ec..0186232e3e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -145,24 +145,15 @@ jobs: cov_prefix=() fi + # FIXME: What's this for? + mkdir -p /tmp/neon/etc/ + + # Install target binaries + mkdir -p /tmp/neon/bin/ binaries=$( "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps | jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' ) - - test_exe_paths=$( - "${cov_prefix[@]}" cargo test --message-format=json --no-run | - jq -r '.executable | select(. != null)' - ) - - mkdir -p /tmp/neon/bin/ - mkdir -p /tmp/neon/test_bin/ - mkdir -p /tmp/neon/etc/ - - # Keep bloated coverage data files away from the rest of the artifact - mkdir -p /tmp/coverage/ - - # Install target binaries for bin in $binaries; do SRC=target/$BUILD_TYPE/$bin DST=/tmp/neon/bin/$bin @@ -171,9 +162,14 @@ jobs: # Install test executables and write list of all binaries (for code coverage) if [[ $BUILD_TYPE == "debug" ]]; then - for bin in $binaries; do - echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list - done + # Keep bloated coverage data files away from the rest of the artifact + mkdir -p /tmp/coverage/ + + mkdir -p /tmp/neon/test_bin/ + test_exe_paths=$( + "${cov_prefix[@]}" cargo test --message-format=json --no-run | + jq -r '.executable | select(. 
!= null)' + ) for bin in $test_exe_paths; do SRC=$bin DST=/tmp/neon/test_bin/$(basename $bin) @@ -183,6 +179,10 @@ jobs: strip "$SRC" -o "$DST" echo "$DST" >> /tmp/coverage/binaries.list done + + for bin in $binaries; do + echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list + done fi - name: Prepare neon artifact From 3dce39419794627f0502450dc6e5df7dd282031b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 19 Jul 2022 15:29:51 +0300 Subject: [PATCH 02/29] Use the same cargo options for every cargo call. The "cargo metadata" and "cargo test --no-run" are used in the workflow to just list names of the final binaries, but unless the same cargo options like --release or --debug are used in those calls, they will in fact recompile everything. --- .github/workflows/build_and_test.yml | 56 ++++++++++++++-------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0186232e3e..99d483ea4a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -84,6 +84,29 @@ jobs: submodules: true fetch-depth: 1 + # Set some environment variables used by all the steps. + # + # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. + # It also includes --features, if any + # + # CARGO_FEATURES is passed to "cargo metadata". 
It is separate from CARGO_FLAGS, + # because "cargo metadata" doesn't accept --release or --debug options + # + - name: Set env variables + run: | + if [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" + CARGO_FEATURES="" + CARGO_FLAGS="" + elif [[ $BUILD_TYPE == "release" ]]; then + cov_prefix="" + CARGO_FEATURES="--features profiling" + CARGO_FLAGS="--release $CARGO_FEATURES" + fi + echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV + echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV + echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV + - name: Get postgres artifact for restoration uses: actions/download-artifact@v3 with: @@ -115,43 +138,18 @@ jobs: - name: Run cargo build run: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) - CARGO_FLAGS= - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - CARGO_FLAGS="--release --features profiling" - fi - - "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests + ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests - name: Run cargo test run: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) - CARGO_FLAGS= - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - CARGO_FLAGS=--release - fi - - "${cov_prefix[@]}" cargo test $CARGO_FLAGS + ${cov_prefix} cargo test $CARGO_FLAGS - name: Install rust binaries run: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - fi - - # FIXME: What's this for? 
- mkdir -p /tmp/neon/etc/ - # Install target binaries mkdir -p /tmp/neon/bin/ binaries=$( - "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps | + ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' ) for bin in $binaries; do @@ -167,7 +165,7 @@ jobs: mkdir -p /tmp/neon/test_bin/ test_exe_paths=$( - "${cov_prefix[@]}" cargo test --message-format=json --no-run | + ${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run | jq -r '.executable | select(. != null)' ) for bin in $test_exe_paths; do From 5ff7a7dd8bbebde1fc14990a977fbe834f2ed8e0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 19 Jul 2022 16:33:33 +0100 Subject: [PATCH 03/29] github/workflows: run periodic benchmarks earlier (#2121) --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index d08c3c50bd..cfd54325eb 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,7 +11,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '36 7 * * *' # run once a day, timezone is utc + - cron: '36 4 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually From 4446791397e88156012e704381e329551b404c60 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 19 Jul 2022 17:40:58 +0100 Subject: [PATCH 04/29] github/workflows: pause stress env deployment (#2122) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 99d483ea4a..95da34dc62 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -555,7 
+555,7 @@ jobs: if [[ "$GITHUB_REF_NAME" == "main" ]]; then STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}' NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}' - echo "::set-output name=include::[$STAGING, $NEON_STRESS]" + echo "::set-output name=include::[$STAGING]" elif [[ "$GITHUB_REF_NAME" == "release" ]]; then PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}' echo "::set-output name=include::[$PRODUCTION]" From 71753dd947373f5536d2a2d4dbad3d8cf866fb6e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 19 Jul 2022 16:37:37 +0300 Subject: [PATCH 05/29] Remove github CI 'build_postgres' job, merging it with 'build_neon' Simplifies the workflow. Makes the overall build a little faster, as the build_postgres step doesn't need to upload the pg.tgz artifact, and the build_neon step doesn't need to download it again. This effectively reverts commit a490f64a68. That commit changed the workflow so that the Postgres binaries were not included in the neon.tgz artifact. With this commit, the pg.tgz artifact is gone, and the Postgres binaries are part of neon.tgz again. 
--- .../actions/run-python-test-set/action.yml | 15 +--- .github/workflows/build_and_test.yml | 70 +++++-------------- 2 files changed, 17 insertions(+), 68 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index f220be2b12..accb8896de 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -37,12 +37,6 @@ runs: name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact path: ./neon-artifact/ - - name: Get Postgres artifact for restoration - uses: actions/download-artifact@v3 - with: - name: postgres-${{ runner.os }}-${{ inputs.build_type }}-artifact - path: ./pg-artifact/ - - name: Extract Neon artifact shell: bash -ex {0} run: | @@ -50,13 +44,6 @@ runs: tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/ rm -rf ./neon-artifact/ - - name: Extract Postgres artifact - shell: bash -ex {0} - run: | - mkdir -p /tmp/neon/tmp_install - tar -xf ./pg-artifact/pg.tgz -C /tmp/neon/tmp_install - rm -rf ./pg-artifact/ - - name: Checkout if: inputs.needs_postgres_source == 'true' uses: actions/checkout@v3 @@ -78,7 +65,7 @@ runs: - name: Run pytest env: NEON_BIN: /tmp/neon/bin - POSTGRES_DISTRIB_DIR: /tmp/neon/tmp_install + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install TEST_OUTPUT: /tmp/test_output # this variable will be embedded in perf test report # and is needed to distinguish different environments diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 95da34dc62..e20dc08697 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -21,7 +21,7 @@ env: COPT: '-Werror' jobs: - build-postgres: + build-neon: runs-on: [ self-hosted, Linux, k8s-runner ] strategy: fail-fast: false @@ -31,6 +31,7 @@ jobs: env: BUILD_TYPE: ${{ matrix.build_type }} + steps: - name: Checkout uses: actions/checkout@v3 @@ -42,48 +43,6 @@ jobs: id: pg_ver run: echo 
::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres) - - name: Cache postgres build - id: cache_pg - uses: actions/cache@v3 - with: - path: tmp_install/ - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Build postgres - if: steps.cache_pg.outputs.cache-hit != 'true' - run: mold -run make postgres -j$(nproc) - - # actions/cache@v3 does not allow concurrently using the same cache across job steps, so use a separate cache - - name: Prepare postgres artifact - run: tar -C tmp_install/ -czf ./pg.tgz . - - name: Upload postgres artifact - uses: actions/upload-artifact@v3 - with: - retention-days: 7 - if-no-files-found: error - name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact - path: ./pg.tgz - - - build-neon: - runs-on: [ self-hosted, Linux, k8s-runner ] - needs: [ build-postgres ] - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - rust_toolchain: [ 1.58 ] - - env: - BUILD_TYPE: ${{ matrix.build_type }} - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 1 - # Set some environment variables used by all the steps. # # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. @@ -107,17 +66,6 @@ jobs: echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV - - name: Get postgres artifact for restoration - uses: actions/download-artifact@v3 - with: - name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact - path: ./postgres-artifact/ - - name: Extract postgres artifact - run: | - mkdir ./tmp_install/ - tar -xf ./postgres-artifact/pg.tgz -C ./tmp_install/ - rm -rf ./postgres-artifact/ - # Don't include the ~/.cargo/registry/src directory. 
It contains just # uncompressed versions of the crates in ~/.cargo/registry/cache # directory, and it's faster to let 'cargo' to rebuild it from the @@ -136,6 +84,17 @@ jobs: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + - name: Cache postgres build + id: cache_pg + uses: actions/cache@v3 + with: + path: tmp_install/ + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Build postgres + if: steps.cache_pg.outputs.cache-hit != 'true' + run: mold -run make postgres -j$(nproc) + - name: Run cargo build run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests @@ -183,6 +142,9 @@ jobs: done fi + - name: Install postgres binaries + run: cp -a tmp_install /tmp/neon/pg_install + - name: Prepare neon artifact run: tar -C /tmp/neon/ -czf ./neon.tgz . From 98dd2e4f52731f4d0ebdd75591de056da62e0129 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 19 Jul 2022 18:36:46 +0300 Subject: [PATCH 06/29] Use zstd and multiple threads to compress artifact tarball. For faster and better compression. 
--- .github/actions/run-python-test-set/action.yml | 2 +- .github/workflows/build_and_test.yml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index accb8896de..0d058d47c1 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -41,7 +41,7 @@ runs: shell: bash -ex {0} run: | mkdir -p /tmp/neon/ - tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/ + tar -xf ./neon-artifact/neon.tar.zst -C /tmp/neon/ rm -rf ./neon-artifact/ - name: Checkout diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e20dc08697..3fecb2bf67 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -146,7 +146,7 @@ jobs: run: cp -a tmp_install /tmp/neon/pg_install - name: Prepare neon artifact - run: tar -C /tmp/neon/ -czf ./neon.tgz . + run: ZSTD_NBTHREADS=0 tar -C /tmp/neon/ -cf ./neon.tar.zst --zstd . - name: Upload neon binaries uses: actions/upload-artifact@v3 @@ -154,7 +154,7 @@ jobs: retention-days: 7 if-no-files-found: error name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact - path: ./neon.tgz + path: ./neon.tar.zst # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data @@ -279,7 +279,7 @@ jobs: - name: Extract Neon artifact run: | mkdir -p /tmp/neon/ - tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/ + tar -xf ./neon-artifact/neon.tar.zst -C /tmp/neon/ rm -rf ./neon-artifact/ - name: Restore coverage data From 160e52ec7e70213cc0e87843886c1e2204bdf60d Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Tue, 19 Jul 2022 14:56:25 -0400 Subject: [PATCH 07/29] Optimize branch creation (#2101) Resolves #2054 **Context**: branch creation needs to wait for GC to acquire `gc_cs` lock, which prevents creating new timelines during GC. 
However, because individual timeline GC iteration also requires `compaction_cs` lock, branch creation may also need to wait for compactions of multiple timelines. This results in large latency when creating a new branch, which we advertised as *"instantly"*. This PR optimizes the latency of branch creation by separating GC into two phases: 1. Collect GC data (branching points, cutoff LSNs, etc) 2. Perform GC for each timeline The GC bottleneck comes from step 2, which must wait for compaction of multiple timelines. This PR modifies the branch creation and GC functions to allow GC to hold the GC lock only in step 1. As a result, branch creation doesn't need to wait for compaction to finish but only needs to wait for GC data collection step, which is fast. --- .github/workflows/build_and_test.yml | 6 +- .github/workflows/codestyle.yml | 2 +- libs/postgres_ffi/build.rs | 10 +- pageserver/src/layered_repository.rs | 291 +++++++++++------- .../batch_others/test_branch_and_gc.py | 66 ++++ .../performance/test_branch_creation.py | 110 +++++++ 6 files changed, 359 insertions(+), 126 deletions(-) create mode 100644 test_runner/performance/test_branch_creation.py diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3fecb2bf67..5874aa9b5c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -81,8 +81,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - name: Cache postgres build id: cache_pg @@ -268,7 +268,7 @@ jobs: !~/.cargo/registry/src 
~/.cargo/git/ target/ - key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact for restoration uses: actions/download-artifact@v3 diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 89bfffd4b9..8bcaa8f947 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -101,7 +101,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git target - key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} + key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} - name: Run cargo clippy run: ./run_clippy.sh diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index c6df4fc0b0..7db2c20e34 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -49,12 +49,12 @@ fn main() { // Finding the location of C headers for the Postgres server: // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/tmp_install` // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/tmp_install/include/postgresql/server` - let mut pg_install_dir: PathBuf; - if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { - pg_install_dir = postgres_install_dir.into(); + let mut pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") + { + postgres_install_dir.into() } else { - pg_install_dir = PathBuf::from("tmp_install") - } + PathBuf::from("tmp_install") + }; if pg_install_dir.is_relative() { let cwd = env::current_dir().unwrap(); diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 6459e802f4..93acce912c 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ 
-281,12 +281,22 @@ impl Repository for LayeredRepository { // concurrently removes data that is needed by the new timeline. let _gc_cs = self.gc_cs.lock().unwrap(); + // In order for the branch creation task to not wait for GC/compaction, + // we need to make sure that the starting LSN of the child branch is not out of scope midway by + // + // 1. holding the GC lock to prevent overwritting timeline's GC data + // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline + // + // Step 2 is to avoid initializing the new branch using data removed by past GC iterations + // or in-queue GC iterations. + let mut timelines = self.timelines.lock().unwrap(); let src_timeline = self .get_timeline_load_internal(src, &mut timelines) // message about timeline being remote is one .context up in the stack .context("failed to load timeline for branching")? .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {}", &src))?; + let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN @@ -296,9 +306,23 @@ impl Repository for LayeredRepository { lsn }); + // Check if the starting LSN is out of scope because it is less than + // 1. the latest GC cutoff LSN or + // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. src_timeline .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) - .context("invalid branch start lsn")?; + .context(format!( + "invalid branch start lsn: less than latest GC cutoff {latest_gc_cutoff_lsn}" + ))?; + { + let gc_info = src_timeline.gc_info.read().unwrap(); + let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + if start_lsn < cutoff { + bail!(format!( + "invalid branch start lsn: less than planned GC cutoff {cutoff}" + )); + } + } // Determine prev-LSN for the new timeline. We can only determine it if // the timeline was branched at the current end of the source timeline. 
@@ -440,13 +464,7 @@ impl Repository for LayeredRepository { Entry::Vacant(_) => bail!("timeline not found"), }; - // try to acquire gc and compaction locks to prevent errors from missing files - let _gc_guard = self - .gc_cs - .try_lock() - .map_err(|e| anyhow::anyhow!("cannot acquire gc lock {e}"))?; - - let compaction_guard = timeline_entry.get().compaction_guard()?; + let layer_removal_guard = timeline_entry.get().layer_removal_guard()?; let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { @@ -457,7 +475,7 @@ impl Repository for LayeredRepository { })?; info!("detach removed files"); - drop(compaction_guard); + drop(layer_removal_guard); timeline_entry.remove(); Ok(()) @@ -524,10 +542,10 @@ impl LayeredTimelineEntry { } } - fn compaction_guard(&self) -> Result>, anyhow::Error> { + fn layer_removal_guard(&self) -> Result>, anyhow::Error> { match self { LayeredTimelineEntry::Loaded(timeline) => timeline - .compaction_cs + .layer_removal_cs .try_lock() .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) .map(Some), @@ -883,50 +901,50 @@ impl LayeredRepository { let now = Instant::now(); // grab mutex to prevent new timelines from being created here. - let _gc_cs = self.gc_cs.lock().unwrap(); + let gc_cs = self.gc_cs.lock().unwrap(); + + let mut timelines = self.timelines.lock().unwrap(); // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. 
let mut all_branchpoints: BTreeSet<(ZTimelineId, Lsn)> = BTreeSet::new(); - let mut timeline_ids = Vec::new(); - let mut timelines = self.timelines.lock().unwrap(); + let timeline_ids = { + if let Some(target_timeline_id) = target_timeline_id.as_ref() { + if timelines.get(target_timeline_id).is_none() { + bail!("gc target timeline does not exist") + } + }; - if let Some(target_timeline_id) = target_timeline_id.as_ref() { - if timelines.get(target_timeline_id).is_none() { - bail!("gc target timeline does not exist") - } + timelines + .iter() + .map(|(timeline_id, timeline_entry)| { + // This is unresolved question for now, how to do gc in presence of remote timelines + // especially when this is combined with branching. + // Somewhat related: https://github.com/zenithdb/zenith/issues/999 + if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { + // If target_timeline is specified, we only need to know branchpoints of its children + if let Some(timelineid) = target_timeline_id { + if ancestor_timeline_id == &timelineid { + all_branchpoints + .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); + } + } + // Collect branchpoints for all timelines + else { + all_branchpoints + .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); + } + } + + *timeline_id + }) + .collect::>() }; - for (timeline_id, timeline_entry) in timelines.iter() { - timeline_ids.push(*timeline_id); - - // This is unresolved question for now, how to do gc in presence of remote timelines - // especially when this is combined with branching. 
- // Somewhat related: https://github.com/neondatabase/neon/issues/999 - if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { - // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timelineid) = target_timeline_id { - if ancestor_timeline_id == &timelineid { - all_branchpoints - .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); - } - } - // Collect branchpoints for all timelines - else { - all_branchpoints.insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); - } - } - } - // Ok, we now know all the branch points. - // Perform GC for each timeline. - for timeline_id in timeline_ids.into_iter() { - if thread_mgr::is_shutdown_requested() { - // We were requested to shut down. Stop and return with the progress we - // made. - break; - } - + // Update the GC information for each timeline. + let mut gc_timelines = Vec::with_capacity(timeline_ids.len()); + for timeline_id in timeline_ids { // Timeline is known to be local and loaded. let timeline = self .get_timeline_load_internal(timeline_id, &mut *timelines)? @@ -940,7 +958,6 @@ impl LayeredRepository { } if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { - drop(timelines); let branchpoints: Vec = all_branchpoints .range(( Included((timeline_id, Lsn(0))), @@ -948,21 +965,45 @@ impl LayeredRepository { )) .map(|&x| x.1) .collect(); + timeline.update_gc_info(branchpoints, cutoff, pitr)?; - // If requested, force flush all in-memory layers to disk first, - // so that they too can be garbage collected. That's - // used in tests, so we want as deterministic results as possible. 
- if checkpoint_before_gc { - timeline.checkpoint(CheckpointConfig::Forced)?; - info!("timeline {} checkpoint_before_gc done", timeline_id); - } - timeline.update_gc_info(branchpoints, cutoff, pitr); - let result = timeline.gc()?; - - totals += result; - timelines = self.timelines.lock().unwrap(); + gc_timelines.push(timeline); } } + drop(timelines); + drop(gc_cs); + + // Perform GC for each timeline. + // + // Note that we don't hold the GC lock here because we don't want + // to delay the branch creation task, which requires the GC lock. + // A timeline GC iteration can be slow because it may need to wait for + // compaction (both require `layer_removal_cs` lock), + // but the GC iteration can run concurrently with branch creation. + // + // See comments in [`LayeredRepository::branch_timeline`] for more information + // about why branch creation task can run concurrently with timeline's GC iteration. + for timeline in gc_timelines { + if thread_mgr::is_shutdown_requested() { + // We were requested to shut down. Stop and return with the progress we + // made. + break; + } + + // If requested, force flush all in-memory layers to disk first, + // so that they too can be garbage collected. That's + // used in tests, so we want as deterministic results as possible. + if checkpoint_before_gc { + timeline.checkpoint(CheckpointConfig::Forced)?; + info!( + "timeline {} checkpoint_before_gc done", + timeline.timeline_id + ); + } + + let result = timeline.gc()?; + totals += result; + } totals.elapsed = now.elapsed(); Ok(totals) @@ -1038,11 +1079,11 @@ pub struct LayeredTimeline { /// Used to ensure that there is only one thread layer_flush_lock: Mutex<()>, - // Prevent concurrent compactions. - // Compactions are normally performed by one thread. But compaction can also be manually - // requested by admin (that's used in tests). These forced compactions run in a different - // thread and could be triggered at the same time as a normal, timed compaction. 
- compaction_cs: Mutex<()>, + /// Layer removal lock. + /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. + /// This lock is acquired in [`LayeredTimeline::gc`], [`LayeredTimeline::compact`], + /// and [`LayeredRepository::delete_timeline`]. + layer_removal_cs: Mutex<()>, // Needed to ensure that we can't create a branch at a point that was already garbage collected latest_gc_cutoff_lsn: RwLock, @@ -1079,12 +1120,14 @@ struct GcInfo { /// last-record LSN /// /// FIXME: is this inclusive or exclusive? - cutoff: Lsn, + horizon_cutoff: Lsn, - /// In addition to 'retain_lsns', keep everything newer than 'SystemTime::now()' - /// minus 'pitr_interval' + /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this + /// point. /// - pitr: Duration, + /// This is calculated by finding a number such that a record is needed for PITR + /// if only if its LSN is larger than 'pitr_cutoff'. + pitr_cutoff: Lsn, } /// Public interface functions @@ -1324,12 +1367,12 @@ impl LayeredTimeline { write_lock: Mutex::new(()), layer_flush_lock: Mutex::new(()), - compaction_cs: Mutex::new(()), + layer_removal_cs: Mutex::new(()), gc_info: RwLock::new(GcInfo { retain_lsns: Vec::new(), - cutoff: Lsn(0), - pitr: Duration::ZERO, + horizon_cutoff: Lsn(0), + pitr_cutoff: Lsn(0), }), latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), @@ -1950,7 +1993,7 @@ impl LayeredTimeline { // Below are functions compact_level0() and create_image_layers() // but they are a bit ad hoc and don't quite work like it's explained // above. Rewrite it. - let _compaction_cs = self.compaction_cs.lock().unwrap(); + let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); let target_file_size = self.get_checkpoint_distance(); @@ -2267,46 +2310,34 @@ impl LayeredTimeline { /// TODO: that's wishful thinking, compaction doesn't actually do that /// currently. 
/// - /// The caller specifies how much history is needed with the two arguments: + /// The caller specifies how much history is needed with the 3 arguments: /// /// retain_lsns: keep a version of each page at these LSNs - /// cutoff: also keep everything newer than this LSN + /// cutoff_horizon: also keep everything newer than this LSN + /// pitr: the time duration required to keep data for PITR /// /// The 'retain_lsns' list is currently used to prevent removing files that /// are needed by child timelines. In the future, the user might be able to /// name additional points in time to retain. The caller is responsible for /// collecting that information. /// - /// The 'cutoff' point is used to retain recent versions that might still be + /// The 'cutoff_horizon' point is used to retain recent versions that might still be /// needed by read-only nodes. (As of this writing, the caller just passes /// the latest LSN subtracted by a constant, and doesn't do anything smart /// to figure out what read-only nodes might actually need.) /// - fn update_gc_info(&self, retain_lsns: Vec, cutoff: Lsn, pitr: Duration) { + /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine + /// whether a record is needed for PITR. + fn update_gc_info( + &self, + retain_lsns: Vec, + cutoff_horizon: Lsn, + pitr: Duration, + ) -> Result<()> { let mut gc_info = self.gc_info.write().unwrap(); + + gc_info.horizon_cutoff = cutoff_horizon; gc_info.retain_lsns = retain_lsns; - gc_info.cutoff = cutoff; - gc_info.pitr = pitr; - } - - /// - /// Garbage collect layer files on a timeline that are no longer needed. - /// - /// Currently, we don't make any attempt at removing unneeded page versions - /// within a layer file. We can only remove the whole file if it's fully - /// obsolete. 
- /// - fn gc(&self) -> Result { - let now = SystemTime::now(); - let mut result: GcResult = Default::default(); - let disk_consistent_lsn = self.get_disk_consistent_lsn(); - - let _compaction_cs = self.compaction_cs.lock().unwrap(); - - let gc_info = self.gc_info.read().unwrap(); - let retain_lsns = &gc_info.retain_lsns; - let cutoff = min(gc_info.cutoff, disk_consistent_lsn); - let pitr = gc_info.pitr; // Calculate pitr cutoff point. // If we cannot determine a cutoff LSN, be conservative and don't GC anything. @@ -2315,6 +2346,7 @@ impl LayeredTimeline { if let Ok(timeline) = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) { + let now = SystemTime::now(); // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // If we don't have enough data to convert to LSN, // play safe and don't remove any layers. @@ -2325,7 +2357,7 @@ impl LayeredTimeline { LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, LsnForTimestamp::Future(lsn) => { debug!("future({})", lsn); - pitr_cutoff_lsn = cutoff; + pitr_cutoff_lsn = gc_info.horizon_cutoff; } LsnForTimestamp::Past(lsn) => { debug!("past({})", lsn); @@ -2339,22 +2371,47 @@ impl LayeredTimeline { } else if cfg!(test) { // We don't have local timeline in mocked cargo tests. // So, just ignore pitr_interval setting in this case. - pitr_cutoff_lsn = cutoff; + pitr_cutoff_lsn = gc_info.horizon_cutoff; } + gc_info.pitr_cutoff = pitr_cutoff_lsn; - let new_gc_cutoff = Lsn::min(cutoff, pitr_cutoff_lsn); + Ok(()) + } + + /// + /// Garbage collect layer files on a timeline that are no longer needed. + /// + /// Currently, we don't make any attempt at removing unneeded page versions + /// within a layer file. We can only remove the whole file if it's fully + /// obsolete. 
+ /// + fn gc(&self) -> Result { + let mut result: GcResult = Default::default(); + let now = SystemTime::now(); + + fail_point!("before-timeline-gc"); + + let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); + + let gc_info = self.gc_info.read().unwrap(); + + let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); + let pitr_cutoff = gc_info.pitr_cutoff; + let retain_lsns = &gc_info.retain_lsns; + + let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); // Nothing to GC. Return early. - if *self.get_latest_gc_cutoff_lsn() >= new_gc_cutoff { + let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); + if latest_gc_cutoff >= new_gc_cutoff { info!( - "Nothing to GC for timeline {}. cutoff_lsn {}", - self.timeline_id, new_gc_cutoff + "Nothing to GC for timeline {}: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", + self.timeline_id ); - result.elapsed = now.elapsed()?; return Ok(result); } - let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %cutoff).entered(); + let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered(); // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. // See branch_timeline() for details. @@ -2388,23 +2445,23 @@ impl LayeredTimeline { result.layers_total += 1; - // 1. Is it newer than cutoff point? - if l.get_lsn_range().end > cutoff { + // 1. Is it newer than GC horizon cutoff point? + if l.get_lsn_range().end > horizon_cutoff { debug!( - "keeping {} because it's newer than cutoff {}", + "keeping {} because it's newer than horizon_cutoff {}", l.filename().display(), - cutoff + horizon_cutoff ); result.layers_needed_by_cutoff += 1; continue 'outer; } // 2. It is newer than PiTR cutoff point? 
- if l.get_lsn_range().end > pitr_cutoff_lsn { + if l.get_lsn_range().end > pitr_cutoff { debug!( - "keeping {} because it's newer than pitr_cutoff_lsn {}", + "keeping {} because it's newer than pitr_cutoff {}", l.filename().display(), - pitr_cutoff_lsn + pitr_cutoff ); result.layers_needed_by_pitr += 1; continue 'outer; @@ -2823,7 +2880,7 @@ pub mod tests { let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO); + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; tline.gc()?; @@ -2893,7 +2950,7 @@ pub mod tests { // Perform a cycle of checkpoint, compaction, and GC println!("checkpointing {}", lsn); let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO); + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; tline.gc()?; @@ -2970,7 +3027,7 @@ pub mod tests { // Perform a cycle of checkpoint, compaction, and GC println!("checkpointing {}", lsn); let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO); + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; tline.gc()?; diff --git a/test_runner/batch_others/test_branch_and_gc.py b/test_runner/batch_others/test_branch_and_gc.py index a6210b9176..7157386ce2 100644 --- a/test_runner/batch_others/test_branch_and_gc.py +++ b/test_runner/batch_others/test_branch_and_gc.py @@ -1,3 +1,5 @@ +import threading +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.utils import lsn_from_hex @@ -99,3 +101,67 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): branch_cur.execute('SELECT count(*) FROM foo') assert branch_cur.fetchone() == (200000, ) + + +# This test simulates a race condition happening when branch creation and GC are 
performed concurrently. +# +# Suppose we want to create a new timeline 't' from a source timeline 's' starting +# from a lsn 'lsn'. Upon creating 't', if we don't hold the GC lock and compare 'lsn' with +# the latest GC information carefully, it's possible for GC to accidentally remove data +# needed by the new timeline. +# +# In this test, GC is requested before the branch creation but is delayed to happen after branch creation. +# As a result, when doing GC for the source timeline, we don't have any information about +# the upcoming new branches, so it's possible to remove data that may be needed by the new branches. +# It's the branch creation task's job to make sure the starting 'lsn' is not out of scope +# and prevent creating branches with invalid starting LSNs. +# +# For more details, see discussion in https://github.com/neondatabase/neon/pull/2101#issuecomment-1185273447. +def test_branch_creation_before_gc(neon_simple_env: NeonEnv): + env = neon_simple_env + # Disable background GC but set the `pitr_interval` to be small, so GC can delete something + tenant, _ = env.neon_cli.create_tenant( + conf={ + # disable background GC + 'gc_period': '10 m', + 'gc_horizon': f'{10 * 1024 ** 3}', + + # small checkpoint distance to create more delta layer files + 'checkpoint_distance': f'{1024 ** 2}', + + # set the target size to be large to allow the image layer to cover the whole key space + 'compaction_target_size': f'{1024 ** 3}', + + # tweak the default settings to allow quickly create image layers and L1 layers + 'compaction_period': '1 s', + 'compaction_threshold': '2', + 'image_creation_threshold': '1', + + # set PITR interval to be small, so we can do GC + 'pitr_interval': '1 s' + }) + + b0 = env.neon_cli.create_branch('b0', tenant_id=tenant) + pg0 = env.postgres.create_start('b0', tenant_id=tenant) + res = pg0.safe_psql_many(queries=[ + "CREATE TABLE t(key serial primary key)", + "INSERT INTO t SELECT FROM generate_series(1, 100000)", + "SELECT 
pg_current_wal_insert_lsn()", + "INSERT INTO t SELECT FROM generate_series(1, 100000)", + ]) + lsn = res[2][0][0] + + # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the + # branch creation task but the individual timeline GC iteration happens *after* + # the branch creation task. + env.pageserver.safe_psql(f"failpoints before-timeline-gc=sleep(2000)") + + def do_gc(): + env.pageserver.safe_psql(f"do_gc {tenant.hex} {b0.hex} 0") + + thread = threading.Thread(target=do_gc, daemon=True) + thread.start() + + # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC. + with pytest.raises(Exception, match="invalid branch start lsn"): + env.neon_cli.create_branch('b1', 'b0', tenant_id=tenant, ancestor_start_lsn=lsn) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py new file mode 100644 index 0000000000..1d39b0830d --- /dev/null +++ b/test_runner/performance/test_branch_creation.py @@ -0,0 +1,110 @@ +import random +import time +import statistics +import threading +import timeit +import pytest +from typing import List +from fixtures.benchmark_fixture import MetricReport +from fixtures.compare_fixtures import NeonCompare +from fixtures.log_helper import log + + +def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]): + neon_compare.zenbenchmark.record("branch_creation_duration_max", + max(durs), + 's', + MetricReport.LOWER_IS_BETTER) + neon_compare.zenbenchmark.record("branch_creation_duration_avg", + statistics.mean(durs), + 's', + MetricReport.LOWER_IS_BETTER) + neon_compare.zenbenchmark.record("branch_creation_duration_stdev", + statistics.stdev(durs), + 's', + MetricReport.LOWER_IS_BETTER) + + +@pytest.mark.parametrize("n_branches", [20]) +# Test measures the latency of branch creation during a heavy [1] workload. 
+# +# [1]: to simulate a heavy workload, the test tweaks the GC and compaction settings +# to increase the task's frequency. The test runs `pgbench` in each new branch. +# Each branch is created from a randomly picked source branch. +def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int): + env = neon_compare.env + pg_bin = neon_compare.pg_bin + + # Use aggressive GC and checkpoint settings, so GC and compaction happen more often during the test + tenant, _ = env.neon_cli.create_tenant( + conf={ + 'gc_period': '5 s', + 'gc_horizon': f'{4 * 1024 ** 2}', + 'checkpoint_distance': f'{2 * 1024 ** 2}', + 'compaction_target_size': f'{1024 ** 2}', + 'compaction_threshold': '2', + # set PITR interval to be small, so we can do GC + 'pitr_interval': '5 s' + }) + + def run_pgbench(branch: str): + log.info(f"Start a pgbench workload on branch {branch}") + + pg = env.postgres.create_start(branch, tenant_id=tenant) + connstr = pg.connstr() + + pg_bin.run_capture(['pgbench', '-i', connstr]) + pg_bin.run_capture(['pgbench', '-c10', '-T10', connstr]) + + pg.stop() + + env.neon_cli.create_branch('b0', tenant_id=tenant) + + threads: List[threading.Thread] = [] + threads.append(threading.Thread(target=run_pgbench, args=('b0', ), daemon=True)) + threads[-1].start() + + branch_creation_durations = [] + for i in range(n_branches): + time.sleep(1.0) + + # random a source branch + p = random.randint(0, i) + + timer = timeit.default_timer() + env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(p), tenant_id=tenant) + dur = timeit.default_timer() - timer + + log.info(f"Creating branch b{i+1} took {dur}s") + branch_creation_durations.append(dur) + + threads.append(threading.Thread(target=run_pgbench, args=(f'b{i+1}', ), daemon=True)) + threads[-1].start() + + for thread in threads: + thread.join() + + _record_branch_creation_durations(neon_compare, branch_creation_durations) + + +@pytest.mark.parametrize("n_branches", [1024]) +# Test measures the latency of 
branch creation when creating a lot of branches. +def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): + env = neon_compare.env + + env.neon_cli.create_branch('b0') + + pg = env.postgres.create_start('b0') + neon_compare.pg_bin.run_capture(['pgbench', '-i', '-s10', pg.connstr()]) + + branch_creation_durations = [] + + for i in range(n_branches): + # random a source branch + p = random.randint(0, i) + timer = timeit.default_timer() + env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(p)) + dur = timeit.default_timer() - timer + branch_creation_durations.append(dur) + + _record_branch_creation_durations(neon_compare, branch_creation_durations) From abff15dd7c2a64ae15d06679080653aa056a3269 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 20 Jul 2022 15:04:24 +0300 Subject: [PATCH 08/29] Fix test to be more robust with slow pageserver. If the WAL arrives at the pageserver slowly, it's possible that the branch is created before all the data on the parent branch have arrived. That results in a failure: test_runner/batch_others/test_tenant_relocation.py:259: in test_tenant_relocation timeline_id_second, current_lsn_second = populate_branch(pg_second, create_table=False, expected_sum=1001000) test_runner/batch_others/test_tenant_relocation.py:133: in populate_branch assert cur.fetchone() == (expected_sum, ) E assert (500500,) == (1001000,) E At index 0 diff: 500500 != 1001000 E Full diff: E - (1001000,) E + (500500,) To fix, specify the LSN to branch at, so that the pageserver will wait for it to arrive.
See https://github.com/neondatabase/neon/issues/2063 --- test_runner/batch_others/test_tenant_relocation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 73f6f52e72..d59f28bcc5 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -26,7 +26,7 @@ from fixtures.neon_fixtures import ( wait_for_upload, wait_until, ) -from fixtures.utils import lsn_from_hex, subprocess_capture +from fixtures.utils import lsn_from_hex, lsn_to_hex, subprocess_capture def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @@ -268,6 +268,7 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, env.neon_cli.create_branch( new_branch_name="test_tenant_relocation_second", ancestor_branch_name="test_tenant_relocation_main", + ancestor_start_lsn=lsn_to_hex(current_lsn_main), tenant_id=tenant_id, ) pg_second = env.postgres.create_start(branch_name='test_tenant_relocation_second', From b4c74c0ecd9776d91b973fe00bb647de7f227727 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 20 Jul 2022 12:12:02 +0300 Subject: [PATCH 09/29] Clean up unnecessary dependencies. Just to be tidy. 
--- Cargo.lock | 8 -------- compute_tools/Cargo.toml | 1 - control_plane/Cargo.toml | 1 - libs/metrics/Cargo.toml | 1 - neon_local/Cargo.toml | 1 - pageserver/Cargo.toml | 1 - pageserver/src/walreceiver/walreceiver_connection.rs | 2 +- safekeeper/Cargo.toml | 3 --- 8 files changed, 1 insertion(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4f453678e6..5031ae02e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -467,7 +467,6 @@ dependencies = [ "clap 3.2.12", "env_logger", "hyper", - "libc", "log", "postgres", "regex", @@ -517,7 +516,6 @@ dependencies = [ "tar", "thiserror", "toml", - "url", "utils", "workspace_hack", ] @@ -1604,7 +1602,6 @@ version = "0.1.0" dependencies = [ "lazy_static", "libc", - "once_cell", "prometheus", "workspace_hack", ] @@ -1677,7 +1674,6 @@ dependencies = [ "git-version", "pageserver", "postgres", - "postgres_ffi", "safekeeper", "serde_json", "utils", @@ -1905,7 +1901,6 @@ dependencies = [ "thiserror", "tokio", "tokio-postgres", - "tokio-stream", "toml_edit", "tracing", "url", @@ -2764,7 +2759,6 @@ dependencies = [ "daemonize", "etcd_broker", "fs2", - "futures", "git-version", "hex", "humantime", @@ -2784,12 +2778,10 @@ dependencies = [ "tempfile", "tokio", "tokio-postgres", - "tokio-util", "toml_edit", "tracing", "url", "utils", - "walkdir", "workspace_hack", ] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 1022438c2e..78b85d0e79 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" edition = "2021" [dependencies] -libc = "0.2" anyhow = "1.0" chrono = "0.4" clap = "3.0" diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 21311eea9a..26bb577636 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -14,7 +14,6 @@ regex = "1" anyhow = "1.0" thiserror = "1" nix = "0.23" -url = "2.2.2" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } pageserver = { path = 
"../pageserver" } diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index 8ff5d1d421..2879dfed81 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -7,5 +7,4 @@ edition = "2021" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency libc = "0.2" lazy_static = "1.4" -once_cell = "1.8.0" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/neon_local/Cargo.toml b/neon_local/Cargo.toml index 8ebd7d5c17..2fc38cfe02 100644 --- a/neon_local/Cargo.toml +++ b/neon_local/Cargo.toml @@ -15,6 +15,5 @@ git-version = "0.3.5" pageserver = { path = "../pageserver" } control_plane = { path = "../control_plane" } safekeeper = { path = "../safekeeper" } -postgres_ffi = { path = "../libs/postgres_ffi" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index b7d97a67c0..215fa151a0 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -29,7 +29,6 @@ postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -tokio-stream = "0.1.8" anyhow = { version = "1.0", features = ["backtrace"] } crc32c = "0.6.0" thiserror = "1.0" diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 98b36dfe48..0c8c0ae2f6 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -9,12 +9,12 @@ use std::{ use anyhow::{bail, ensure, Context}; use 
bytes::BytesMut; use fail::fail_point; +use futures::StreamExt; use postgres::{SimpleQueryMessage, SimpleQueryRow}; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; -use tokio_stream::StreamExt; use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 373108c61b..f6ae9e75d7 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -20,7 +20,6 @@ postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8 anyhow = "1.0" crc32c = "0.6.0" humantime = "2.1.0" -walkdir = "2" url = "2.2.2" signal-hook = "0.3.10" serde = { version = "1.0", features = ["derive"] } @@ -28,11 +27,9 @@ serde_with = "1.12.0" hex = "0.4.3" const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -tokio-util = { version = "0.7", features = ["io"] } git-version = "0.3.5" async-trait = "0.1" once_cell = "1.10.0" -futures = "0.3.13" toml_edit = { version = "0.13", features = ["easy"] } postgres_ffi = { path = "../libs/postgres_ffi" } From f4233fde398172d8734dfae036b71367e274c0fd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 20 Jul 2022 15:19:46 +0300 Subject: [PATCH 10/29] Silence "Module already imported" warning in python tests We were getting a warning like this from the pg_regress tests: =================== warnings summary =================== /usr/lib/python3/dist-packages/_pytest/config/__init__.py:663 /usr/lib/python3/dist-packages/_pytest/config/__init__.py:663: PytestAssertRewriteWarning: Module already imported so cannot be rewritten: fixtures.pg_stats self.import_plugin(import_spec) -- Docs: https://docs.pytest.org/en/stable/warnings.html ------------------ Benchmark results ------------------- To fix, reorder the 
imports in conftest.py. I'm not sure what exactly the problem was or why the order matters, but the warning is gone and that's good enough for me. --- test_runner/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index c6e6289a5c..51545d0217 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,5 +1,5 @@ pytest_plugins = ("fixtures.neon_fixtures", "fixtures.benchmark_fixture", + "fixtures.pg_stats", "fixtures.compare_fixtures", - "fixtures.slow", - "fixtures.pg_stats") + "fixtures.slow") From cc680dd81c4d7be96916811fc4de1f859703a0b9 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 20 Jul 2022 15:06:38 +0300 Subject: [PATCH 11/29] Explicitly enable cachepot in Docker builds only --- Dockerfile | 4 ++++ Dockerfile.compute-tools | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/Dockerfile b/Dockerfile index ad85638af3..6f017ac5d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,6 +17,10 @@ RUN set -e \ FROM neondatabase/rust:1.58 AS build ARG GIT_VERSION=local +# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. +# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. +# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. +ARG RUSTC_WRAPPER=cachepot ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 71770ae9ed..87b73e139c 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -2,6 +2,10 @@ # NB: keep in sync with rust image version in .circle/config.yml FROM neondatabase/rust:1.58 AS rust-build +# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. 
+# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. +# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. +ARG RUSTC_WRAPPER=cachepot ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY From b445cf76658808dfbb1c440e663fb8a5b321d7aa Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 20 Jul 2022 22:13:05 +0300 Subject: [PATCH 12/29] Refactor test_unavailability (#2134) Now test_unavailability uses async instead of Process. The test is refactored to fix a possible race condition. --- test_runner/batch_others/test_wal_acceptor.py | 55 ------------------- .../batch_others/test_wal_acceptor_async.py | 52 ++++++++++++++++++ 2 files changed, 52 insertions(+), 55 deletions(-) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 9b876f780d..5014a7ad4e 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -203,61 +203,6 @@ def test_restarts(neon_env_builder: NeonEnvBuilder): assert cur.fetchone() == (500500, ) -start_delay_sec = 2 - - -def delayed_safekeeper_start(wa): - time.sleep(start_delay_sec) - wa.start() - - -# When majority of acceptors is offline, commits are expected to be frozen -def test_unavailability(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 2 - env = neon_env_builder.init_start() - - env.neon_cli.create_branch('test_safekeepers_unavailability') - pg = env.postgres.create_start('test_safekeepers_unavailability') - - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - pg_conn = pg.connect() - cur = pg_conn.cursor() - - # check basic work with table - cur.execute('CREATE TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t values (1, 'payload')") - - # shutdown one of two acceptors, that is, majority - 
env.safekeepers[0].stop() - - proc = Process(target=delayed_safekeeper_start, args=(env.safekeepers[0], )) - proc.start() - - start = time.time() - cur.execute("INSERT INTO t values (2, 'payload')") - # ensure that the query above was hanging while acceptor was down - assert (time.time() - start) >= start_delay_sec - proc.join() - - # for the world's balance, do the same with second acceptor - env.safekeepers[1].stop() - - proc = Process(target=delayed_safekeeper_start, args=(env.safekeepers[1], )) - proc.start() - - start = time.time() - cur.execute("INSERT INTO t values (3, 'payload')") - # ensure that the query above was hanging while acceptor was down - assert (time.time() - start) >= start_delay_sec - proc.join() - - cur.execute("INSERT INTO t values (4, 'payload')") - - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (10, ) - - # shut down random subset of acceptors, sleep, wake them up, rinse, repeat def xmas_garland(acceptors, stop): while not bool(stop.value): diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index d74ef8840a..9577c0980e 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -404,3 +404,55 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder): env.neon_cli.create_branch('test_concurrent_computes') asyncio.run(run_concurrent_computes(env)) + + +# Stop safekeeper and check that query cannot be executed while safekeeper is down. +# Query will insert a single row into a table. 
+async def check_unavailability(sk: Safekeeper, + conn: asyncpg.Connection, + key: int, + start_delay_sec: int = 2): + # shutdown one of two acceptors, that is, majority + sk.stop() + + bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')")) + + await asyncio.sleep(start_delay_sec) + # ensure that the query has not been executed yet + assert not bg_query.done() + + # start safekeeper and await the query + sk.start() + await bg_query + assert bg_query.done() + + +async def run_unavailability(env: NeonEnv, pg: Postgres): + conn = await pg.connect_async() + + # check basic work with table + await conn.execute('CREATE TABLE t(key int primary key, value text)') + await conn.execute("INSERT INTO t values (1, 'payload')") + + # stop safekeeper and check that query cannot be executed while safekeeper is down + await check_unavailability(env.safekeepers[0], conn, 2) + + # for the world's balance, do the same with second safekeeper + await check_unavailability(env.safekeepers[1], conn, 3) + + # check that we can execute queries after restart + await conn.execute("INSERT INTO t values (4, 'payload')") + + result_sum = await conn.fetchval('SELECT sum(key) FROM t') + assert result_sum == 10 + + +# When majority of acceptors is offline, commits are expected to be frozen +def test_unavailability(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 2 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_safekeepers_unavailability') + pg = env.postgres.create_start('test_safekeepers_unavailability') + + asyncio.run(run_unavailability(env, pg)) From 572ae743883df19f5ca9f32d7cdce7a7ca5cca4f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 21 Jul 2022 07:45:11 +0300 Subject: [PATCH 13/29] More precisely control size of inmem layer (#1927) * More precisely control size of inmem layer * Force recompaction of L0 layers if them contains large non-wallogged BLOBs to avoid too large layers * Add modified 
version of test_hot_update test (test_dup_key.py) which should generate large layers without large number of tables * Change test name in test_dup_key * Add Layer::get_max_key_range function * Add layer::key_iter method and implement new approach of splitting layers during compaction based on total size of all key values * Add test_large_schema test for checking layer file size after compaction * Make clippy happy * Restore checking LSN distance threshold for checkpoint in-memory layer * Optimize storage keys iterator * Update pageserver/src/layered_repository.rs Co-authored-by: Heikki Linnakangas * Update pageserver/src/layered_repository.rs Co-authored-by: Heikki Linnakangas * Update pageserver/src/layered_repository.rs Co-authored-by: Heikki Linnakangas * Update pageserver/src/layered_repository.rs Co-authored-by: Heikki Linnakangas * Update pageserver/src/layered_repository.rs Co-authored-by: Heikki Linnakangas * Fix code style * Reduce number of tables in test_large_schema to make it fit in timeout with debug build * Fix style of test_large_schema.py * Fix handling of duplicate layers Co-authored-by: Heikki Linnakangas --- pageserver/src/layered_repository.rs | 176 ++++++++++++++---- .../src/layered_repository/delta_layer.rs | 84 +++++++++ .../src/layered_repository/ephemeral_file.rs | 2 +- .../src/layered_repository/inmemory_layer.rs | 8 + .../src/layered_repository/storage_layer.rs | 6 + test_runner/batch_others/test_large_schema.py | 82 ++++++++ test_runner/performance/test_dup_key.py | 48 +++++ 7 files changed, 372 insertions(+), 34 deletions(-) create mode 100644 test_runner/batch_others/test_large_schema.py create mode 100644 test_runner/performance/test_dup_key.py diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 93acce912c..3830e4c1bd 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1734,30 +1734,43 @@ impl LayeredTimeline { /// pub fn
check_checkpoint_distance(self: &Arc) -> Result<()> { let last_lsn = self.get_last_record_lsn(); + let layers = self.layers.read().unwrap(); + if let Some(open_layer) = &layers.open_layer { + let open_layer_size = open_layer.size()?; + drop(layers); + let distance = last_lsn.widening_sub(self.last_freeze_at.load()); + // Checkpointing the open layer can be triggered by layer size or LSN range. + // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and + // we want to stay below that with a big margin. The LSN distance determines how + // much WAL the safekeepers need to store. + if distance >= self.get_checkpoint_distance().into() + || open_layer_size > self.get_checkpoint_distance() + { + info!( + "check_checkpoint_distance {}, layer size {}", + distance, open_layer_size + ); - // Has more than 'checkpoint_distance' of WAL been accumulated? - let distance = last_lsn.widening_sub(self.last_freeze_at.load()); - if distance >= self.get_checkpoint_distance().into() { - // Yes. Freeze the current in-memory layer. - self.freeze_inmem_layer(true); - self.last_freeze_at.store(last_lsn); + self.freeze_inmem_layer(true); + self.last_freeze_at.store(last_lsn); - // Launch a thread to flush the frozen layer to disk, unless - // a thread was already running. (If the thread was running - // at the time that we froze the layer, it must've seen the - // the layer we just froze before it exited; see comments - // in flush_frozen_layers()) - if let Ok(guard) = self.layer_flush_lock.try_lock() { - drop(guard); - let self_clone = Arc::clone(self); - thread_mgr::spawn( - thread_mgr::ThreadKind::LayerFlushThread, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush thread", - false, - move || self_clone.flush_frozen_layers(false), - )?; + // Launch a thread to flush the frozen layer to disk, unless + // a thread was already running. 
(If the thread was running + // at the time that we froze the layer, it must've seen the + // the layer we just froze before it exited; see comments + // in flush_frozen_layers()) + if let Ok(guard) = self.layer_flush_lock.try_lock() { + drop(guard); + let self_clone = Arc::clone(self); + thread_mgr::spawn( + thread_mgr::ThreadKind::LayerFlushThread, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush thread", + false, + move || self_clone.flush_frozen_layers(false), + )?; + } } } Ok(()) @@ -2211,9 +2224,59 @@ impl LayeredTimeline { } }); + // This iterator walks through all keys and is needed to calculate size used by each key + let mut all_keys_iter = deltas_to_compact + .iter() + .map(|l| l.key_iter()) + .kmerge_by(|a, b| { + let (a_key, a_lsn, _) = a; + let (b_key, b_lsn, _) = b; + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + }); + // Merge the contents of all the input delta layers into a new set // of delta layers, based on the current partitioning. // + // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. + // It's possible that there is a single key with so many page versions that storing all of them in a single layer file + // would be too large. In that case, we also split on the LSN dimension. 
+ // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key // TODO: this actually divides the layers into fixed-size chunks, not // based on the partitioning. // @@ -2222,29 +2285,76 @@ impl LayeredTimeline { let mut new_layers = Vec::new(); let mut prev_key: Option = None; let mut writer: Option = None; + let mut key_values_total_size = 0u64; + let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key + let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key for x in all_values_iter { let (key, lsn, value) = x?; - - if let Some(prev_key) = prev_key { - if key != prev_key && writer.is_some() { - let size = writer.as_mut().unwrap().size(); - if size > target_file_size { - new_layers.push(writer.take().unwrap().finish(prev_key.next())?); + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + // We need to check key boundaries once we reach next key or end of layer with the same key + if !same_key || lsn == dup_end_lsn { + let mut next_key_size = 0u64; + let is_dup_layer = dup_end_lsn.is_valid(); + dup_start_lsn = Lsn::INVALID; + if !same_key { + dup_end_lsn = Lsn::INVALID; + } + // Determine size occupied by this key. 
We stop at next key, or when size becomes larger than target_file_size + for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { + next_key_size = next_size; + if key != next_key { + if dup_end_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + break; + } + key_values_total_size += next_size; + if key_values_total_size > target_file_size { + // split key between multiple layers: such layer can contain only single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn + } else { + lsn + }; + dup_end_lsn = next_lsn; + break; + } + } + // handle case when loop reaches last key + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + // check if key cause layer overflow + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + { + new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?); writer = None; } } + key_values_total_size = next_key_size; } - if writer.is_none() { writer = Some(DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_id, key, - lsn_range.clone(), + if dup_end_lsn.is_valid() { + // this is a layer containing slice of values of the same key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, )?); } - writer.as_mut().unwrap().put_value(key, lsn, value)?; prev_key = Some(key); } @@ -2276,12 +2386,12 @@ impl LayeredTimeline { // Now that we have reshuffled the data to set of new delta layers, we can // delete the old ones let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len()); - for l in deltas_to_compact { + for l in &deltas_to_compact { l.delete()?; if let Some(path) = l.local_path() { 
layer_paths_do_delete.insert(path); } - layers.remove_historic(l); + layers.remove_historic(l.clone()); } drop(layers); diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index ed342c0cca..d622df531a 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -316,6 +316,18 @@ impl Layer for DeltaLayer { } } + fn key_iter<'a>(&'a self) -> Box + 'a> { + let inner = match self.load() { + Ok(inner) => inner, + Err(e) => panic!("Failed to load a delta layer: {e:?}"), + }; + + match DeltaKeyIter::new(inner) { + Ok(iter) => Box::new(iter), + Err(e) => panic!("Layer index is corrupted: {e:?}"), + } + } + fn delete(&self) -> Result<()> { // delete underlying file fs::remove_file(self.path())?; @@ -822,3 +834,75 @@ impl<'a> DeltaValueIter<'a> { } } } +/// +/// Iterator over all keys stored in a delta layer +/// +/// FIXME: This creates a Vector to hold all keys. +/// That takes up quite a lot of memory. Should do this in a more streaming +/// fashion. 
+/// +struct DeltaKeyIter { + all_keys: Vec<(DeltaKey, u64)>, + next_idx: usize, +} + +impl Iterator for DeltaKeyIter { + type Item = (Key, Lsn, u64); + + fn next(&mut self) -> Option { + if self.next_idx < self.all_keys.len() { + let (delta_key, size) = &self.all_keys[self.next_idx]; + + let key = delta_key.key(); + let lsn = delta_key.lsn(); + + self.next_idx += 1; + Some((key, lsn, *size)) + } else { + None + } + } +} + +impl<'a> DeltaKeyIter { + fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result { + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); + + let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new(); + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, value| { + let delta_key = DeltaKey::from_slice(key); + let pos = BlobRef(value).pos(); + if let Some(last) = all_keys.last_mut() { + if last.0.key() == delta_key.key() { + return true; + } else { + // subtract offset of new key BLOB and first blob of this key + // to get total size if values associated with this key + let first_pos = last.1; + last.1 = pos - first_pos; + } + } + all_keys.push((delta_key, pos)); + true + }, + )?; + if let Some(last) = all_keys.last_mut() { + // Last key occupies all space till end of layer + last.1 = std::fs::metadata(&file.file.path)?.len() - last.1; + } + let iter = DeltaKeyIter { + all_keys, + next_idx: 0, + }; + + Ok(iter) + } +} diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/layered_repository/ephemeral_file.rs index cdde9d5d13..299bb4e873 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/layered_repository/ephemeral_file.rs @@ -43,7 +43,7 @@ pub struct EphemeralFile { _timelineid: ZTimelineId, file: Arc, - size: u64, + pub size: u64, } impl EphemeralFile { diff --git a/pageserver/src/layered_repository/inmemory_layer.rs 
b/pageserver/src/layered_repository/inmemory_layer.rs index 87e6877520..1f89f333dd 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -233,6 +233,14 @@ impl Layer for InMemoryLayer { } impl InMemoryLayer { + /// + /// Get layer size on the disk + /// + pub fn size(&self) -> Result { + let inner = self.inner.read().unwrap(); + Ok(inner.file.size) + } + /// /// Create a new, empty, in-memory layer /// diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index aaf765b83d..e10330bdd3 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -139,6 +139,12 @@ pub trait Layer: Send + Sync { /// Iterate through all keys and values stored in the layer fn iter(&self) -> Box> + '_>; + /// Iterate through all keys stored in the layer. Returns key, lsn and value size + /// It is used only for compaction and so is currently implemented only for DeltaLayer + fn key_iter(&self) -> Box + '_> { + panic!("Not implemented") + } + /// Permanently remove this layer from disk. fn delete(&self) -> Result<()>; diff --git a/test_runner/batch_others/test_large_schema.py b/test_runner/batch_others/test_large_schema.py new file mode 100644 index 0000000000..18ae0614a9 --- /dev/null +++ b/test_runner/batch_others/test_large_schema.py @@ -0,0 +1,82 @@ +import time +import os +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.log_helper import log + + +# This test creates large number of tables which cause large catalog. +# Right now Neon serialize directory as single key-value storage entry and so +# it leads to layer filled mostly by one key. +# Originally Neon implementation of checkpoint and compaction is not able to split key which leads +# to large (several gigabytes) layer files (both ephemeral and delta layers). 
+# It may cause problems with uploading to S3 and also degrade performance because ephemeral file swapping. +# +def test_large_schema(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + pg = env.postgres.create_start('main') + + conn = pg.connect() + cur = conn.cursor() + + tables = 2 # 10 is too much for debug build + partitions = 1000 + for i in range(1, tables + 1): + print(f'iteration {i} / {tables}') + + # Restart compute. Restart is actually not strictly needed. + # It is done mostly because this test originally tries to model the problem reported by Ketteq. + pg.stop() + # Kill and restart the pageserver. + # env.pageserver.stop(immediate=True) + # env.pageserver.start() + pg.start() + + retry_sleep = 0.5 + max_retries = 200 + retries = 0 + while True: + try: + conn = pg.connect() + cur = conn.cursor() + cur.execute(f"CREATE TABLE if not exists t_{i}(pk integer) partition by range (pk)") + for j in range(1, partitions + 1): + cur.execute( + f"create table if not exists p_{i}_{j} partition of t_{i} for values from ({j}) to ({j + 1})" + ) + cur.execute(f"insert into t_{i} values (generate_series(1,{partitions}))") + cur.execute("vacuum full") + conn.close() + + except Exception as error: + # It's normal that it takes some time for the pageserver to + # restart, and for the connection to fail until it does. It + # should eventually recover, so retry until it succeeds. 
+ print(f'failed: {error}') + if retries < max_retries: + retries += 1 + print(f'retry {retries} / {max_retries}') + time.sleep(retry_sleep) + continue + else: + raise + break + + conn = pg.connect() + cur = conn.cursor() + + for i in range(1, tables + 1): + cur.execute(f"SELECT count(*) FROM t_{i}") + assert cur.fetchone() == (partitions, ) + + cur.execute("set enable_sort=off") + cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid") + + # Check layer file sizes + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant_id, timeline_id) + for filename in os.listdir(timeline_path): + if filename.startswith('00000'): + log.info(f'layer {filename} size is {os.path.getsize(timeline_path + filename)}') + assert os.path.getsize(timeline_path + filename) < 512_000_000 diff --git a/test_runner/performance/test_dup_key.py b/test_runner/performance/test_dup_key.py new file mode 100644 index 0000000000..a8caceb61a --- /dev/null +++ b/test_runner/performance/test_dup_key.py @@ -0,0 +1,48 @@ +import pytest +from contextlib import closing +from fixtures.compare_fixtures import PgCompare +from pytest_lazyfixture import lazy_fixture # type: ignore + + +@pytest.mark.parametrize( + "env", + [ + # The test is too slow to run in CI, but fast enough to run with remote tests + pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), + pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), + pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), + ]) +def test_dup_key(env: PgCompare): + # Update the same page many times, then measure read performance + + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SET synchronous_commit=off") + cur.execute("SET statement_timeout=0") + + # Write many updates to the 
same row + with env.record_duration('write'): + cur.execute("create table t (i integer, filler text);") + cur.execute('insert into t values (0);') + cur.execute(""" +do $$ +begin + for ivar in 1..5000000 loop + update t set i = ivar, filler = repeat('a', 50); + update t set i = ivar, filler = repeat('b', 50); + update t set i = ivar, filler = repeat('c', 50); + update t set i = ivar, filler = repeat('d', 50); + rollback; + end loop; +end; +$$; +""") + + # Write 3-4 MB to evict t from compute cache + cur.execute('create table f (i integer);') + cur.execute(f'insert into f values (generate_series(1,100000));') + + # Read + with env.record_duration('read'): + cur.execute('select * from t;') + cur.fetchall() From ed102f44d9cb101da57f6915afbbc96a14d23570 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Thu, 21 Jul 2022 12:08:26 -0400 Subject: [PATCH 14/29] Reduce memory allocations for page server (#2010) ## Overview This patch reduces the number of memory allocations when running the page server under a heavy write workload. This mostly helps improve the speed of WAL record ingestion. ## Changes - modified `DatadirModification` to allow reuse the struct's allocated memory after each modification - modified `decode_wal_record` to allow passing a `DecodedWALRecord` reference. 
This helps reuse the struct in each `decode_wal_record` call - added a reusable buffer for serializing object inside the `InMemoryLayer::put_value` function - added a performance test simulating a heavy write workload for testing the changes in this patch ### Semi-related changes - remove redundant serializations when calling `DeltaLayer::put_value` during `InMemoryLayer::write_to_disk` function call [1] - removed the info span `info_span!("processing record", lsn = %lsn)` during each WAL ingestion [2] ## Notes - [1]: in `InMemoryLayer::write_to_disk`, a deserialization is called ``` let val = Value::des(&buf)?; delta_layer_writer.put_value(key, *lsn, val)?; ``` `DeltaLayer::put_value` then creates a serialization based on the previous deserialization ``` let off = self.blob_writer.write_blob(&Value::ser(&val)?)?; ``` - [2]: related: https://github.com/neondatabase/neon/issues/733 --- pageserver/src/import_datadir.rs | 21 ++-- .../src/layered_repository/delta_layer.rs | 14 ++- .../src/layered_repository/inmemory_layer.rs | 22 +++- pageserver/src/pgdatadir_mapping.rs | 38 +++--- pageserver/src/walingest.rs | 112 +++++++++--------- .../src/walreceiver/walreceiver_connection.rs | 25 ++-- pageserver/src/walrecord.rs | 34 ++++-- .../batch_others/test_branch_and_gc.py | 6 + .../performance/test_compare_pg_stats.py | 33 ++++++ 9 files changed, 196 insertions(+), 109 deletions(-) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index f8a41e5b2b..6402657e05 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -16,6 +16,7 @@ use crate::reltag::{RelTag, SlruKind}; use crate::repository::Repository; use crate::repository::Timeline; use crate::walingest::WalIngest; +use crate::walrecord::DecodedWALRecord; use postgres_ffi::relfile_utils::*; use postgres_ffi::waldecoder::*; use postgres_ffi::xlog_utils::*; @@ -38,7 +39,7 @@ pub fn import_timeline_from_postgres_datadir( // TODO this shoud be start_lsn, which is 
not necessarily equal to end_lsn (aka lsn) // Then fishing out pg_control would be unnecessary - let mut modification = tline.begin_modification(lsn); + let mut modification = tline.begin_modification(); modification.init_empty()?; // Import all but pg_wal @@ -57,12 +58,12 @@ pub fn import_timeline_from_postgres_datadir( if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? { pg_control = Some(control_file); } - modification.flush()?; + modification.flush(lsn)?; } } // We're done importing all the data files. - modification.commit()?; + modification.commit(lsn)?; // We expect the Postgres server to be shut down cleanly. let pg_control = pg_control.context("pg_control file not found")?; @@ -268,9 +269,11 @@ fn import_wal( waldecoder.feed_bytes(&buf); let mut nrecords = 0; + let mut modification = tline.begin_modification(); + let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - walingest.ingest_record(tline, recdata, lsn)?; + walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; last_lsn = lsn; nrecords += 1; @@ -300,7 +303,7 @@ pub fn import_basebackup_from_tar( base_lsn: Lsn, ) -> Result<()> { info!("importing base at {}", base_lsn); - let mut modification = tline.begin_modification(base_lsn); + let mut modification = tline.begin_modification(); modification.init_empty()?; let mut pg_control: Option = None; @@ -318,7 +321,7 @@ pub fn import_basebackup_from_tar( // We found the pg_control file. 
pg_control = Some(res); } - modification.flush()?; + modification.flush(base_lsn)?; } tar::EntryType::Directory => { debug!("directory {:?}", file_path); @@ -332,7 +335,7 @@ pub fn import_basebackup_from_tar( // sanity check: ensure that pg_control is loaded let _pg_control = pg_control.context("pg_control file not found")?; - modification.commit()?; + modification.commit(base_lsn)?; Ok(()) } @@ -384,9 +387,11 @@ pub fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); + let mut modification = tline.begin_modification(); + let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - walingest.ingest_record(tline, recdata, lsn)?; + walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index d622df531a..ce5cb57745 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -672,11 +672,21 @@ impl DeltaLayerWriter { /// The values must be appended in key, lsn order. 
/// pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init()) + } + + pub fn put_value_bytes( + &mut self, + key: Key, + lsn: Lsn, + val: &[u8], + will_init: bool, + ) -> Result<()> { assert!(self.lsn_range.start <= lsn); - let off = self.blob_writer.write_blob(&Value::ser(&val)?)?; + let off = self.blob_writer.write_blob(val)?; - let blob_ref = BlobRef::new(off, val.will_init()); + let blob_ref = BlobRef::new(off, will_init); let delta_key = DeltaKey::from_key_lsn(&key, lsn); self.tree.append(&delta_key.0, blob_ref.0)?; diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 1f89f333dd..5f269a868f 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -15,6 +15,7 @@ use crate::layered_repository::storage_layer::{ use crate::repository::{Key, Value}; use crate::walrecord; use anyhow::{bail, ensure, Result}; +use std::cell::RefCell; use std::collections::HashMap; use tracing::*; use utils::{ @@ -30,6 +31,12 @@ use std::ops::Range; use std::path::PathBuf; use std::sync::RwLock; +thread_local! { + /// A buffer for serializing object during [`InMemoryLayer::put_value`]. + /// This buffer is reused for each serialization to avoid additional malloc calls. 
+ static SER_BUFFER: RefCell> = RefCell::new(Vec::new()); +} + pub struct InMemoryLayer { conf: &'static PageServerConf, tenantid: ZTenantId, @@ -278,10 +285,17 @@ impl InMemoryLayer { pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timelineid, lsn); let mut inner = self.inner.write().unwrap(); - inner.assert_writeable(); - let off = inner.file.write_blob(&Value::ser(val)?)?; + let off = { + SER_BUFFER.with(|x| -> Result<_> { + let mut buf = x.borrow_mut(); + buf.clear(); + val.ser_into(&mut (*buf))?; + let off = inner.file.write_blob(&buf)?; + Ok(off) + })? + }; let vec_map = inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, off).unwrap().0; @@ -350,8 +364,8 @@ impl InMemoryLayer { // Write all page versions for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf)?; - let val = Value::des(&buf)?; - delta_layer_writer.put_value(key, *lsn, val)?; + let will_init = Value::des(&buf)?.will_init(); + delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?; } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f696c1f411..788c9de29e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -80,23 +80,25 @@ impl DatadirTimeline { /// the timeline. /// /// This provides a transaction-like interface to perform a bunch - /// of modifications atomically, all stamped with one LSN. + /// of modifications atomically. /// - /// To ingest a WAL record, call begin_modification(lsn) to get a + /// To ingest a WAL record, call begin_modification() to get a /// DatadirModification object. Use the functions in the object to /// modify the repository state, updating all the pages and metadata - /// that the WAL record affects. When you're done, call commit() to - /// commit the changes. + /// that the WAL record affects. 
When you're done, call commit(lsn) to + /// commit the changes. All the changes will be stamped with the specified LSN. + /// + /// Calling commit(lsn) will flush all the changes and reset the state, + /// so the `DatadirModification` struct can be reused to perform the next modification. /// /// Note that any pending modifications you make through the /// modification object won't be visible to calls to the 'get' and list /// functions of the timeline until you finish! And if you update the /// same page twice, the last update wins. /// - pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification { + pub fn begin_modification(&self) -> DatadirModification { DatadirModification { tline: self, - lsn, pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, @@ -533,8 +535,6 @@ pub struct DatadirModification<'a, R: Repository> { /// in the state in 'tline' yet. pub tline: &'a DatadirTimeline, - lsn: Lsn, - // The modifications are not applied directly to the underlying key-value store. // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. @@ -920,7 +920,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub fn flush(&mut self) -> Result<()> { + pub fn flush(&mut self, lsn: Lsn) -> Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. 
let pending_nblocks = self.pending_nblocks; @@ -934,7 +934,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { let mut result: Result<()> = Ok(()); self.pending_updates.retain(|&key, value| { if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) { - result = writer.put(key, self.lsn, value); + result = writer.put(key, lsn, value); false } else { true @@ -956,20 +956,22 @@ impl<'a, R: Repository> DatadirModification<'a, R> { /// /// Finish this atomic update, writing all the updated keys to the /// underlying timeline. + /// All the modifications in this atomic update are stamped by the specified LSN. /// - pub fn commit(self) -> Result<()> { + pub fn commit(&mut self, lsn: Lsn) -> Result<()> { let writer = self.tline.tline.writer(); let pending_nblocks = self.pending_nblocks; + self.pending_nblocks = 0; - for (key, value) in self.pending_updates { - writer.put(key, self.lsn, &value)?; + for (key, value) in self.pending_updates.drain() { + writer.put(key, lsn, &value)?; } - for key_range in self.pending_deletions { - writer.delete(key_range.clone(), self.lsn)?; + for key_range in self.pending_deletions.drain(..) 
{ + writer.delete(key_range, lsn)?; } - writer.finish_write(self.lsn); + writer.finish_write(lsn); if pending_nblocks != 0 { self.tline.current_logical_size.fetch_add( @@ -1407,9 +1409,9 @@ pub fn create_test_timeline( ) -> Result>> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; let tline = DatadirTimeline::new(tline, 256 * 1024); - let mut m = tline.begin_modification(Lsn(8)); + let mut m = tline.begin_modification(); m.init_empty()?; - m.commit()?; + m.commit(Lsn(8))?; Ok(Arc::new(tline)) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 2f39007e9f..adc24328ae 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -78,13 +78,13 @@ impl<'a, R: Repository> WalIngest<'a, R> { /// pub fn ingest_record( &mut self, - timeline: &DatadirTimeline, recdata: Bytes, lsn: Lsn, + modification: &mut DatadirModification, + decoded: &mut DecodedWALRecord, ) -> Result<()> { - let mut modification = timeline.begin_modification(lsn); + decode_wal_record(recdata, decoded).context("failed decoding wal record")?; - let mut decoded = decode_wal_record(recdata).context("failed decoding wal record")?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -98,7 +98,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, &mut modification, &mut decoded)?; + self.ingest_heapam_record(&mut buf, modification, decoded)?; } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID @@ -106,19 +106,19 @@ impl<'a, R: Repository> WalIngest<'a, R> { == pg_constants::XLOG_SMGR_CREATE { let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(&mut modification, &create)?; + self.ingest_xlog_smgr_create(modification, &create)?; } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == 
pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(&mut modification, &truncate)?; + self.ingest_xlog_smgr_truncate(modification, &truncate)?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE { let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(&mut modification, &createdb)?; + self.ingest_xlog_dbase_create(modification, &createdb)?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_DROP { @@ -137,7 +137,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( - &mut modification, + modification, SlruKind::Clog, segno, rpageno, @@ -146,7 +146,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(&mut modification, &xlrec)?; + self.ingest_clog_truncate_record(modification, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; @@ -154,7 +154,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); self.ingest_xact_record( - &mut modification, + modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, )?; @@ -164,7 +164,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); self.ingest_xact_record( - &mut modification, + modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, )?; @@ -187,7 +187,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let 
rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( - &mut modification, + modification, SlruKind::MultiXactOffsets, segno, rpageno, @@ -198,7 +198,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( - &mut modification, + modification, SlruKind::MultiXactMembers, segno, rpageno, @@ -206,14 +206,14 @@ impl<'a, R: Repository> WalIngest<'a, R> { )?; } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut buf); - self.ingest_multixact_create_record(&mut modification, &xlrec)?; + self.ingest_multixact_create_record(modification, &xlrec)?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(&mut modification, &xlrec)?; + self.ingest_multixact_truncate_record(modification, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(&mut modification, &xlrec, &decoded)?; + self.ingest_relmap_page(modification, &xlrec, decoded)?; } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_NEXTOID { @@ -248,7 +248,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
for blk in decoded.blocks.iter() { - self.ingest_decoded_block(&mut modification, lsn, &decoded, blk)?; + self.ingest_decoded_block(modification, lsn, decoded, blk)?; } // If checkpoint data was updated, store the new version in the repository @@ -261,7 +261,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - modification.commit()?; + modification.commit(lsn)?; Ok(()) } @@ -1069,10 +1069,10 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); fn init_walingest_test(tline: &DatadirTimeline) -> Result> { - let mut m = tline.begin_modification(Lsn(0x10)); + let mut m = tline.begin_modification(); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file - m.commit()?; + m.commit(Lsn(0x10))?; let walingest = WalIngest::new(tline, Lsn(0x10))?; Ok(walingest) @@ -1084,19 +1084,19 @@ mod tests { let tline = create_test_timeline(repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&tline)?; - let mut m = tline.begin_modification(Lsn(0x20)); + let mut m = tline.begin_modification(); walingest.put_rel_creation(&mut m, TESTREL_A)?; walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; - m.commit()?; - let mut m = tline.begin_modification(Lsn(0x30)); + m.commit(Lsn(0x20))?; + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?; - m.commit()?; - let mut m = tline.begin_modification(Lsn(0x40)); + m.commit(Lsn(0x30))?; + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?; - m.commit()?; - let mut m = tline.begin_modification(Lsn(0x50)); + m.commit(Lsn(0x40))?; + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 
5"))?; - m.commit()?; + m.commit(Lsn(0x50))?; assert_current_logical_size(&tline, Lsn(0x50)); @@ -1142,9 +1142,9 @@ mod tests { ); // Truncate last block - let mut m = tline.begin_modification(Lsn(0x60)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?; - m.commit()?; + m.commit(Lsn(0x60))?; assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation @@ -1166,15 +1166,15 @@ mod tests { ); // Truncate to zero length - let mut m = tline.begin_modification(Lsn(0x68)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?; - m.commit()?; + m.commit(Lsn(0x68))?; assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68))?, 0); // Extend from 0 to 2 blocks, leaving a gap - let mut m = tline.begin_modification(Lsn(0x70)); + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; - m.commit()?; + m.commit(Lsn(0x70))?; assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70))?, 2); assert_eq!( tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, @@ -1186,9 +1186,9 @@ mod tests { ); // Extend a lot more, leaving a big gap that spans across segments - let mut m = tline.begin_modification(Lsn(0x80)); + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; - m.commit()?; + m.commit(Lsn(0x80))?; assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, 1501); for blk in 2..1500 { assert_eq!( @@ -1212,18 +1212,18 @@ mod tests { let tline = create_test_timeline(repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&tline)?; - let mut m = tline.begin_modification(Lsn(0x20)); + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; - m.commit()?; + m.commit(Lsn(0x20))?; // Check that rel exists and size is correct assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); 
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1); // Drop rel - let mut m = tline.begin_modification(Lsn(0x30)); + let mut m = tline.begin_modification(); walingest.put_rel_drop(&mut m, TESTREL_A)?; - m.commit()?; + m.commit(Lsn(0x30))?; // Check that rel is not visible anymore assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false); @@ -1232,9 +1232,9 @@ mod tests { //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30))?.is_none()); // Re-create it - let mut m = tline.begin_modification(Lsn(0x40)); + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?; - m.commit()?; + m.commit(Lsn(0x40))?; // Check that rel exists and size is correct assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true); @@ -1254,12 +1254,12 @@ mod tests { // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; - let mut m = tline.begin_modification(Lsn(0x20)); + let mut m = tline.begin_modification(); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; } - m.commit()?; + m.commit(Lsn(0x20))?; // The relation was created at LSN 20, not visible at LSN 1 yet. assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); @@ -1280,9 +1280,9 @@ mod tests { // Truncate relation so that second segment was dropped // - only leave one page - let mut m = tline.begin_modification(Lsn(0x60)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, 1)?; - m.commit()?; + m.commit(Lsn(0x60))?; // Check reported size and contents after truncation assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 1); @@ -1310,12 +1310,12 @@ mod tests { // Extend relation again. 
// Add enough blocks to create second segment let lsn = Lsn(0x80); - let mut m = tline.begin_modification(lsn); + let mut m = tline.begin_modification(); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; } - m.commit()?; + m.commit(lsn)?; assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true); assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, relsize); @@ -1343,10 +1343,10 @@ mod tests { let mut lsn = 0x10; for blknum in 0..pg_constants::RELSEG_SIZE + 1 { lsn += 0x10; - let mut m = tline.begin_modification(Lsn(lsn)); + let mut m = tline.begin_modification(); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?; - m.commit()?; + m.commit(Lsn(lsn))?; } assert_current_logical_size(&tline, Lsn(lsn)); @@ -1358,9 +1358,9 @@ mod tests { // Truncate one block lsn += 0x10; - let mut m = tline.begin_modification(Lsn(lsn)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE)?; - m.commit()?; + m.commit(Lsn(lsn))?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, pg_constants::RELSEG_SIZE @@ -1369,9 +1369,9 @@ mod tests { // Truncate another block lsn += 0x10; - let mut m = tline.begin_modification(Lsn(lsn)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?; - m.commit()?; + m.commit(Lsn(lsn))?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, pg_constants::RELSEG_SIZE - 1 @@ -1383,9 +1383,9 @@ mod tests { let mut size: i32 = 3000; while size >= 0 { lsn += 0x10; - let mut m = tline.begin_modification(Lsn(lsn)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?; - m.commit()?; + m.commit(Lsn(lsn))?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, size as 
BlockNumber diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 0c8c0ae2f6..cc1a9cc5eb 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -23,6 +23,7 @@ use crate::{ repository::{Repository, Timeline}, tenant_mgr, walingest::WalIngest, + walrecord::DecodedWALRecord, }; use postgres_ffi::waldecoder::WalStreamDecoder; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; @@ -150,19 +151,25 @@ pub async fn handle_walreceiver_connection( waldecoder.feed_bytes(data); - while let Some((lsn, recdata)) = waldecoder.poll_decode()? { - let _enter = info_span!("processing record", lsn = %lsn).entered(); + { + let mut decoded = DecodedWALRecord::default(); + let mut modification = timeline.begin_modification(); + while let Some((lsn, recdata)) = waldecoder.poll_decode()? { + // let _enter = info_span!("processing record", lsn = %lsn).entered(); - // It is important to deal with the aligned records as lsn in getPage@LSN is - // aligned and can be several bytes bigger. Without this alignment we are - // at risk of hitting a deadlock. - ensure!(lsn.is_aligned()); + // It is important to deal with the aligned records as lsn in getPage@LSN is + // aligned and can be several bytes bigger. Without this alignment we are + // at risk of hitting a deadlock. 
+ ensure!(lsn.is_aligned()); - walingest.ingest_record(&timeline, recdata, lsn)?; + walingest + .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .context("could not ingest record at {lsn}")?; - fail_point!("walreceiver-after-ingest"); + fail_point!("walreceiver-after-ingest"); - last_rec_lsn = lsn; + last_rec_lsn = lsn; + } } if !caught_up && endlsn >= end_of_wal { diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 5a384360e2..6b01d52005 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -96,6 +96,7 @@ impl DecodedBkpBlock { } } +#[derive(Default)] pub struct DecodedWALRecord { pub xl_xid: TransactionId, pub xl_info: u8, @@ -505,7 +506,17 @@ impl XlMultiXactTruncate { // block data // ... // main data -pub fn decode_wal_record(record: Bytes) -> Result { +// +// +// For performance reasons, the caller provides the DecodedWALRecord struct and the function just fills it in. +// It would be more natural for this function to return a DecodedWALRecord as return value, +// but reusing the caller-supplied struct avoids an allocation. +// This code is in the hot path for digesting incoming WAL, and is very performance sensitive. +// +pub fn decode_wal_record( + record: Bytes, + decoded: &mut DecodedWALRecord, +) -> Result<(), DeserializeError> { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; @@ -534,7 +545,7 @@ pub fn decode_wal_record(record: Bytes) -> Result = Vec::new(); + decoded.blocks.clear(); // 2. Decode the headers. 
// XLogRecordBlockHeaders if any, @@ -713,7 +724,7 @@ pub fn decode_wal_record(record: Bytes) -> Result { @@ -724,7 +735,7 @@ pub fn decode_wal_record(record: Bytes) -> Result Result Date: Tue, 12 Jul 2022 23:07:26 +0300 Subject: [PATCH 15/29] register tenants task thread pool threads in thread_mgr needed to avoid this warning: is_shutdown_requested() called in an unexpected thread --- pageserver/src/tenant_tasks.rs | 4 ++ pageserver/src/thread_mgr.rs | 70 +++++++++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index b0bb4953ca..e51744d3cc 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -120,6 +120,10 @@ pub fn init_tenant_task_pool() -> anyhow::Result<()> { let runtime = tokio::runtime::Builder::new_multi_thread() .thread_name("tenant-task-worker") .enable_all() + .on_thread_start(|| { + thread_mgr::register(ThreadKind::TenantTaskWorker, "tenant-task-worker") + }) + .on_thread_stop(thread_mgr::deregister) .build()?; let (gc_send, mut gc_recv) = mpsc::channel::(100); diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index ab0d894c70..6dd2e4b00b 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -97,6 +97,9 @@ pub enum ThreadKind { // Thread that schedules new compaction and gc jobs TenantTaskManager, + // Worker thread for tenant tasks thread pool + TenantTaskWorker, + // Thread that flushes frozen in-memory layers to disk LayerFlushThread, @@ -105,18 +108,20 @@ pub enum ThreadKind { StorageSync, } +#[derive(Default)] struct MutableThreadState { /// Tenant and timeline that this thread is associated with. tenant_id: Option, timeline_id: Option, /// Handle for waiting for the thread to exit. It can be None, if the - /// the thread has already exited. + /// the thread has already exited. 
OR if this thread is managed externally + /// and was not spawned through thread_mgr.rs::spawn function. join_handle: Option>, } struct PageServerThread { - _thread_id: u64, + thread_id: u64, kind: ThreadKind, @@ -147,7 +152,7 @@ where let (shutdown_tx, shutdown_rx) = watch::channel(()); let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); let thread = Arc::new(PageServerThread { - _thread_id: thread_id, + thread_id, kind, name: name.to_string(), shutdown_requested: AtomicBool::new(false), @@ -315,8 +320,10 @@ pub fn shutdown_threads( drop(thread_mut); let _ = join_handle.join(); } else { - // The thread had not even fully started yet. Or it was shut down - // concurrently and already exited + // Possibly one of: + // * The thread had not even fully started yet. + // * It was shut down concurrently and already exited + // * Is managed through `register`/`deregister` fns without providing a join handle } } } @@ -348,3 +355,56 @@ pub fn is_shutdown_requested() -> bool { } }) } + +/// Needed to register threads that were not spawned through spawn function. +/// For example tokio blocking threads. This function is expected to be used +/// in tandem with `deregister`. 
+/// NOTE: threads registered through this function cannot be joined +pub fn register(kind: ThreadKind, name: &str) { + CURRENT_THREAD.with(|ct| { + let mut borrowed = ct.borrow_mut(); + if borrowed.is_some() { + panic!("thread already registered") + }; + let (shutdown_tx, shutdown_rx) = watch::channel(()); + let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); + + let thread = Arc::new(PageServerThread { + thread_id, + kind, + name: name.to_owned(), + shutdown_requested: AtomicBool::new(false), + shutdown_tx, + mutable: Mutex::new(MutableThreadState { + tenant_id: None, + timeline_id: None, + join_handle: None, + }), + }); + + *borrowed = Some(Arc::clone(&thread)); + + SHUTDOWN_RX.with(|rx| { + *rx.borrow_mut() = Some(shutdown_rx); + }); + + THREADS.lock().unwrap().insert(thread_id, thread); + }); +} + +// Expected to be used in tandem with `register`. See the doc for `register` for more details +pub fn deregister() { + CURRENT_THREAD.with(|ct| { + let mut borrowed = ct.borrow_mut(); + let thread = match borrowed.take() { + Some(thread) => thread, + None => panic!("calling deregister on unregistered thread"), + }; + + SHUTDOWN_RX.with(|rx| { + *rx.borrow_mut() = None; + }); + + THREADS.lock().unwrap().remove(&thread.thread_id) + }); +} From 9dcb9ca3da358a678daf040eda2c94b0b8dd9fab Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 22 Jul 2022 11:00:05 +0100 Subject: [PATCH 16/29] test/performance: ensure we don't have tables that we're creating (#2135) --- test_runner/performance/test_dup_key.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/performance/test_dup_key.py b/test_runner/performance/test_dup_key.py index a8caceb61a..ee867a9845 100644 --- a/test_runner/performance/test_dup_key.py +++ b/test_runner/performance/test_dup_key.py @@ -17,6 +17,8 @@ def test_dup_key(env: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: + cur.execute('drop table if exists t, f;') + cur.execute("SET 
synchronous_commit=off") cur.execute("SET statement_timeout=0") From 39c59b8df5069efb9364280cf64b8f9ecf4241b3 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Fri, 22 Jul 2022 07:44:20 -0400 Subject: [PATCH 17/29] Fix flaky test_branch_creation_before_gc test (#2142) --- test_runner/batch_others/test_branch_and_gc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/batch_others/test_branch_and_gc.py b/test_runner/batch_others/test_branch_and_gc.py index b8ce63b069..901b3f3d0f 100644 --- a/test_runner/batch_others/test_branch_and_gc.py +++ b/test_runner/batch_others/test_branch_and_gc.py @@ -139,7 +139,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): 'image_creation_threshold': '1', # set PITR interval to be small, so we can do GC - 'pitr_interval': '1 s' + 'pitr_interval': '0 s' }) b0 = env.neon_cli.create_branch('b0', tenant_id=tenant) From 5f4ccae5c5d426d8587ac9f91b251f8f842f4333 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Mon, 25 Jul 2022 17:23:10 +0300 Subject: [PATCH 18/29] [proxy] Add the `password hack` authentication flow (#2095) [proxy] Add the `password hack` authentication flow This lets us authenticate users which can use neither SNI (due to old libpq) nor connection string `options` (due to restrictions in other client libraries). Note: `PasswordHack` will accept passwords which are not encoded in base64 via the "password" field. The assumption is that most user passwords will be valid utf-8 strings, and the rest may still be passed via "password_". 
--- libs/utils/src/pq_proto.rs | 4 +- proxy/src/auth.rs | 12 +- proxy/src/auth/backend.rs | 186 ++++++++-- proxy/src/auth/backend/console.rs | 91 ++--- proxy/src/auth/backend/legacy_console.rs | 44 ++- proxy/src/auth/backend/link.rs | 4 +- proxy/src/auth/backend/postgres.rs | 35 +- proxy/src/auth/credentials.rs | 431 ++++++++--------------- proxy/src/auth/flow.rs | 39 +- proxy/src/auth/password_hack.rs | 102 ++++++ proxy/src/compute.rs | 104 ++++-- proxy/src/config.rs | 36 +- proxy/src/error.rs | 7 + proxy/src/main.rs | 8 +- proxy/src/proxy.rs | 91 ++--- proxy/src/stream.rs | 8 + test_runner/batch_others/test_proxy.py | 32 +- test_runner/fixtures/neon_fixtures.py | 66 ++-- 18 files changed, 750 insertions(+), 550 deletions(-) create mode 100644 proxy/src/auth/password_hack.rs diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index 0a320f123c..3dcae4d0af 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -47,10 +47,12 @@ pub enum FeStartupPacket { StartupMessage { major_version: u32, minor_version: u32, - params: HashMap, + params: StartupMessageParams, }, } +pub type StartupMessageParams = HashMap; + #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct CancelKeyData { pub backend_pid: i32, diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 9bddd58fce..61c7458e16 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,11 +1,14 @@ //! Client authentication mechanisms. pub mod backend; -pub use backend::DatabaseInfo; +pub use backend::{BackendType, DatabaseInfo}; mod credentials; pub use credentials::ClientCredentials; +mod password_hack; +use password_hack::PasswordHackPayload; + mod flow; pub use flow::*; @@ -29,9 +32,8 @@ pub enum AuthErrorImpl { #[error(transparent)] Sasl(#[from] crate::sasl::Error), - /// For passwords that couldn't be processed by [`backend::legacy_console::parse_password`]. 
- #[error("Malformed password message")] - MalformedPassword, + #[error("Malformed password message: {0}")] + MalformedPassword(&'static str), /// Errors produced by [`crate::stream::PqStream`]. #[error(transparent)] @@ -76,7 +78,7 @@ impl UserFacingError for AuthError { Console(e) => e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), Sasl(e) => e.to_string_client(), - MalformedPassword => self.to_string(), + MalformedPassword(_) => self.to_string(), _ => "Internal error".to_string(), } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 1d41f7f932..5e87059c86 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,16 +1,14 @@ -mod legacy_console; mod link; mod postgres; pub mod console; +mod legacy_console; pub use legacy_console::{AuthError, AuthErrorImpl}; -use super::ClientCredentials; use crate::{ - compute, - config::{AuthBackendType, ProxyConfig}, - mgmt, + auth::{self, AuthFlow, ClientCredentials}, + compute, config, mgmt, stream::PqStream, waiters::{self, Waiter, Waiters}, }; @@ -78,32 +76,158 @@ impl From for tokio_postgres::Config { } } -pub(super) async fn handle_user( - config: &ProxyConfig, - client: &mut PqStream, - creds: ClientCredentials, -) -> super::Result { - use AuthBackendType::*; - match config.auth_backend { - LegacyConsole => { - legacy_console::handle_user( - &config.auth_endpoint, - &config.auth_link_uri, - client, - &creds, - ) - .await +/// This type serves two purposes: +/// +/// * When `T` is `()`, it's just a regular auth backend selector +/// which we use in [`crate::config::ProxyConfig`]. +/// +/// * However, when we substitute `T` with [`ClientCredentials`], +/// this helps us provide the credentials only to those auth +/// backends which require them for the authentication process. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum BackendType { + /// Legacy Cloud API (V1) + link auth. + LegacyConsole(T), + /// Current Cloud API (V2). 
+ Console(T), + /// Local mock of Cloud API (V2). + Postgres(T), + /// Authentication via a web browser. + Link, +} + +impl BackendType { + /// Very similar to [`std::option::Option::map`]. + /// Maps [`BackendType`] to [`BackendType`] by applying + /// a function to a contained value. + pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType { + use BackendType::*; + match self { + LegacyConsole(x) => LegacyConsole(f(x)), + Console(x) => Console(f(x)), + Postgres(x) => Postgres(f(x)), + Link => Link, + } + } +} + +impl BackendType> { + /// Very similar to [`std::option::Option::transpose`]. + /// This is most useful for error handling. + pub fn transpose(self) -> Result, E> { + use BackendType::*; + match self { + LegacyConsole(x) => x.map(LegacyConsole), + Console(x) => x.map(Console), + Postgres(x) => x.map(Postgres), + Link => Ok(Link), + } + } +} + +impl BackendType { + /// Authenticate the client via the requested backend, possibly using credentials. + pub async fn authenticate( + mut self, + urls: &config::AuthUrls, + client: &mut PqStream, + ) -> super::Result { + use BackendType::*; + + if let Console(creds) | Postgres(creds) = &mut self { + // If there's no project so far, that entails that client doesn't + // support SNI or other means of passing the project name. + // We now expect to see a very specific payload in the place of password. + if creds.project().is_none() { + let payload = AuthFlow::new(client) + .begin(auth::PasswordHack) + .await? + .authenticate() + .await?; + + // Finally we may finish the initialization of `creds`. + // TODO: add missing type safety to ClientCredentials. + creds.project = Some(payload.project); + + let mut config = match &self { + Console(creds) => { + console::Api::new(&urls.auth_endpoint, creds) + .wake_compute() + .await? + } + Postgres(creds) => { + postgres::Api::new(&urls.auth_endpoint, creds) + .wake_compute() + .await? 
+ } + _ => unreachable!("see the patterns above"), + }; + + // We should use a password from payload as well. + config.password(payload.password); + + return Ok(compute::NodeInfo { + reported_auth_ok: false, + config, + }); + } + } + + match self { + LegacyConsole(creds) => { + legacy_console::handle_user( + &urls.auth_endpoint, + &urls.auth_link_uri, + &creds, + client, + ) + .await + } + Console(creds) => { + console::Api::new(&urls.auth_endpoint, &creds) + .handle_user(client) + .await + } + Postgres(creds) => { + postgres::Api::new(&urls.auth_endpoint, &creds) + .handle_user(client) + .await + } + // NOTE: this auth backend doesn't use client credentials. + Link => link::handle_user(&urls.auth_link_uri, client).await, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_backend_type_map() { + let values = [ + BackendType::LegacyConsole(0), + BackendType::Console(0), + BackendType::Postgres(0), + BackendType::Link, + ]; + + for value in values { + assert_eq!(value.map(|x| x), value); + } + } + + #[test] + fn test_backend_type_transpose() { + let values = [ + BackendType::LegacyConsole(Ok::<_, ()>(0)), + BackendType::Console(Ok(0)), + BackendType::Postgres(Ok(0)), + BackendType::Link, + ]; + + for value in values { + assert_eq!(value.map(Result::unwrap), value.transpose().unwrap()); } - Console => { - console::Api::new(&config.auth_endpoint, &creds)? - .handle_user(client) - .await - } - Postgres => { - postgres::Api::new(&config.auth_endpoint, &creds)? - .handle_user(client) - .await - } - Link => link::handle_user(&config.auth_link_uri, client).await, } } diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index 3085f0b0e4..a8ff1a3522 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -1,18 +1,17 @@ //! Cloud API V2. 
use crate::{ - auth::{self, AuthFlow, ClientCredentials, DatabaseInfo}, - compute, - error::UserFacingError, + auth::{self, AuthFlow, ClientCredentials}, + compute::{self, ComputeConnCfg}, + error::{io_error, UserFacingError}, scram, stream::PqStream, url::ApiUrl, }; use serde::{Deserialize, Serialize}; -use std::{future::Future, io}; +use std::future::Future; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; pub type Result = std::result::Result; @@ -84,8 +83,8 @@ pub(super) struct Api<'a> { impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. - pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result { - Ok(Self { endpoint, creds }) + pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self { + Self { endpoint, creds } } /// Authenticate the existing user or throw an error. @@ -100,7 +99,7 @@ impl<'a> Api<'a> { let mut url = self.endpoint.clone(); url.path_segments_mut().push("proxy_get_role_secret"); url.query_pairs_mut() - .append_pair("project", self.creds.project_name.as_ref()?) + .append_pair("project", self.creds.project().expect("impossible")) .append_pair("role", &self.creds.user); // TODO: use a proper logger @@ -120,11 +119,11 @@ impl<'a> Api<'a> { } /// Wake up the compute node and return the corresponding connection info. 
- async fn wake_compute(&self) -> Result { + pub(super) async fn wake_compute(&self) -> Result { let mut url = self.endpoint.clone(); url.path_segments_mut().push("proxy_wake_compute"); - let project_name = self.creds.project_name.as_ref()?; - url.query_pairs_mut().append_pair("project", project_name); + url.query_pairs_mut() + .append_pair("project", self.creds.project().expect("impossible")); // TODO: use a proper logger println!("cplane request: {url}"); @@ -137,16 +136,20 @@ impl<'a> Api<'a> { let response: GetWakeComputeResponse = serde_json::from_str(&resp.text().await.map_err(io_error)?)?; - let (host, port) = parse_host_port(&response.address) - .ok_or(ConsoleAuthError::BadComputeAddress(response.address))?; + // Unfortunately, ownership won't let us use `Option::ok_or` here. + let (host, port) = match parse_host_port(&response.address) { + None => return Err(ConsoleAuthError::BadComputeAddress(response.address)), + Some(x) => x, + }; - Ok(DatabaseInfo { - host, - port, - dbname: self.creds.dbname.to_owned(), - user: self.creds.user.to_owned(), - password: None, - }) + let mut config = ComputeConnCfg::new(); + config + .host(host) + .port(port) + .dbname(&self.creds.dbname) + .user(&self.creds.user); + + Ok(config) } } @@ -160,7 +163,7 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>( ) -> auth::Result where GetAuthInfo: Future>, - WakeCompute: Future>, + WakeCompute: Future>, { let auth_info = get_auth_info(endpoint).await?; @@ -179,48 +182,18 @@ where } }; - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; + let mut config = wake_compute(endpoint).await?; + if let Some(keys) = scram_keys { + config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys)); + } Ok(compute::NodeInfo { - db_info: wake_compute(endpoint).await?, - scram_keys, + reported_auth_ok: false, + config, }) } -/// Upcast (almost) any error into an opaque [`io::Error`]. 
-pub(super) fn io_error(e: impl Into>) -> io::Error { - io::Error::new(io::ErrorKind::Other, e) -} - -fn parse_host_port(input: &str) -> Option<(String, u16)> { +fn parse_host_port(input: &str) -> Option<(&str, u16)> { let (host, port) = input.split_once(':')?; - Some((host.to_owned(), port.parse().ok()?)) -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn parse_db_info() -> anyhow::Result<()> { - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "password": "password", - }))?; - - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - }))?; - - Ok(()) - } + Some((host, port.parse().ok()?)) } diff --git a/proxy/src/auth/backend/legacy_console.rs b/proxy/src/auth/backend/legacy_console.rs index 467da63a98..7a5e9b6f62 100644 --- a/proxy/src/auth/backend/legacy_console.rs +++ b/proxy/src/auth/backend/legacy_console.rs @@ -11,7 +11,7 @@ use crate::{ use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; +use utils::pq_proto::BeMessage as Be; #[derive(Debug, Error)] pub enum AuthErrorImpl { @@ -76,6 +76,12 @@ enum ProxyAuthResponse { NotReady { ready: bool }, // TODO: get rid of `ready` } +impl ClientCredentials { + fn is_existing_user(&self) -> bool { + self.user.ends_with("@zenith") + } +} + async fn authenticate_proxy_client( auth_endpoint: &reqwest::Url, creds: &ClientCredentials, @@ -100,7 +106,7 @@ async fn authenticate_proxy_client( } let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?; - println!("got auth info: #{:?}", auth_info); + println!("got auth info: {:?}", auth_info); use ProxyAuthResponse::*; let db_info = match auth_info { @@ -128,7 +134,9 @@ async fn handle_existing_user( // Read client's password 
hash let msg = client.read_password_message().await?; - let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword)?; + let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword( + "the password should be a valid null-terminated utf-8 string", + ))?; let db_info = authenticate_proxy_client( auth_endpoint, @@ -139,21 +147,17 @@ async fn handle_existing_user( ) .await?; - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; - Ok(compute::NodeInfo { - db_info, - scram_keys: None, + reported_auth_ok: false, + config: db_info.into(), }) } pub async fn handle_user( auth_endpoint: &reqwest::Url, auth_link_uri: &reqwest::Url, - client: &mut PqStream, creds: &ClientCredentials, + client: &mut PqStream, ) -> auth::Result { if creds.is_existing_user() { handle_existing_user(auth_endpoint, client, creds).await @@ -201,4 +205,24 @@ mod tests { .unwrap(); assert!(matches!(auth, ProxyAuthResponse::NotReady { .. 
})); } + + #[test] + fn parse_db_info() -> anyhow::Result<()> { + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + }))?; + + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + }))?; + + Ok(()) + } } diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 669c9e00e9..d658a34825 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -41,7 +41,7 @@ pub async fn handle_user( client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; Ok(compute::NodeInfo { - db_info, - scram_keys: None, + reported_auth_ok: true, + config: db_info.into(), }) } diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs index 721b9db095..1d7ab8f249 100644 --- a/proxy/src/auth/backend/postgres.rs +++ b/proxy/src/auth/backend/postgres.rs @@ -3,10 +3,12 @@ use crate::{ auth::{ self, - backend::console::{self, io_error, AuthInfo, Result}, - ClientCredentials, DatabaseInfo, + backend::console::{self, AuthInfo, Result}, + ClientCredentials, }, - compute, scram, + compute::{self, ComputeConnCfg}, + error::io_error, + scram, stream::PqStream, url::ApiUrl, }; @@ -20,8 +22,8 @@ pub(super) struct Api<'a> { impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. - pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result { - Ok(Self { endpoint, creds }) + pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self { + Self { endpoint, creds } } /// Authenticate the existing user or throw an error. @@ -56,7 +58,10 @@ impl<'a> Api<'a> { // We shouldn't get more than one row anyway. [row, ..] 
=> { - let entry = row.try_get(0).map_err(io_error)?; + let entry = row + .try_get("rolpassword") + .map_err(|e| io_error(format!("failed to read user's password: {e}")))?; + scram::ServerSecret::parse(entry) .map(AuthInfo::Scram) .or_else(|| { @@ -75,14 +80,14 @@ impl<'a> Api<'a> { } /// We don't need to wake anything locally, so we just return the connection info. - async fn wake_compute(&self) -> Result { - Ok(DatabaseInfo { - // TODO: handle that near CLI params parsing - host: self.endpoint.host_str().unwrap_or("localhost").to_owned(), - port: self.endpoint.port().unwrap_or(5432), - dbname: self.creds.dbname.to_owned(), - user: self.creds.user.to_owned(), - password: None, - }) + pub(super) async fn wake_compute(&self) -> Result { + let mut config = ComputeConnCfg::new(); + config + .host(self.endpoint.host_str().unwrap_or("localhost")) + .port(self.endpoint.port().unwrap_or(5432)) + .dbname(&self.creds.dbname) + .user(&self.creds.user); + + Ok(config) } } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index b5312fbe1f..4c72da1c48 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,39 +1,25 @@ //! User credentials used in authentication. -use crate::compute; -use crate::config::ProxyConfig; use crate::error::UserFacingError; -use crate::stream::PqStream; -use std::collections::HashMap; use thiserror::Error; -use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::StartupMessageParams; #[derive(Debug, Error, PartialEq, Eq, Clone)] pub enum ClientCredsParseError { - #[error("Parameter `{0}` is missing in startup packet.")] + #[error("Parameter '{0}' is missing in startup packet.")] MissingKey(&'static str), - #[error( - "Project name is not specified. \ - EITHER please upgrade the postgres client library (libpq) for SNI support \ - OR pass the project name as a parameter: '&options=project%3D'." 
- )] - MissingSNIAndProjectName, - #[error("Inconsistent project name inferred from SNI ('{0}') and project option ('{1}').")] - InconsistentProjectNameAndSNI(String, String), - - #[error("Common name is not set.")] - CommonNameNotSet, + InconsistentProjectNames(String, String), #[error( "SNI ('{1}') inconsistently formatted with respect to common name ('{0}'). \ - SNI should be formatted as '.'." + SNI should be formatted as '.{0}'." )] - InconsistentCommonNameAndSNI(String, String), + InconsistentSni(String, String), - #[error("Project name ('{0}') must contain only alphanumeric characters and hyphens ('-').")] - ProjectNameContainsIllegalChars(String), + #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] + MalformedProjectName(String), } impl UserFacingError for ClientCredsParseError {} @@ -44,286 +30,171 @@ impl UserFacingError for ClientCredsParseError {} pub struct ClientCredentials { pub user: String, pub dbname: String, - pub project_name: Result, + pub project: Option, } impl ClientCredentials { - pub fn is_existing_user(&self) -> bool { - // This logic will likely change in the future. - self.user.ends_with("@zenith") + pub fn project(&self) -> Option<&str> { + self.project.as_deref() } +} +impl ClientCredentials { pub fn parse( - mut options: HashMap, - sni_data: Option<&str>, + mut options: StartupMessageParams, + sni: Option<&str>, common_name: Option<&str>, ) -> Result { - let mut get_param = |key| { - options - .remove(key) - .ok_or(ClientCredsParseError::MissingKey(key)) - }; + use ClientCredsParseError::*; + // Some parameters are absolutely necessary, others not so much. + let mut get_param = |key| options.remove(key).ok_or(MissingKey(key)); + + // Some parameters are stored in the startup message. 
let user = get_param("user")?; let dbname = get_param("database")?; - let project_name = get_param("project").ok(); - let project_name = get_project_name(sni_data, common_name, project_name.as_deref()); + let project_a = get_param("project").ok(); + + // Alternative project name is in fact a subdomain from SNI. + // NOTE: we do not consider SNI if `common_name` is missing. + let project_b = sni + .zip(common_name) + .map(|(sni, cn)| { + // TODO: what if SNI is present but just a common name? + subdomain_from_sni(sni, cn) + .ok_or_else(|| InconsistentSni(sni.to_owned(), cn.to_owned())) + }) + .transpose()?; + + let project = match (project_a, project_b) { + // Invariant: if we have both project name variants, they should match. + (Some(a), Some(b)) if a != b => Some(Err(InconsistentProjectNames(a, b))), + (a, b) => a.or(b).map(|name| { + // Invariant: project name may not contain certain characters. + check_project_name(name).map_err(MalformedProjectName) + }), + } + .transpose()?; Ok(Self { user, dbname, - project_name, + project, }) } +} - /// Use credentials to authenticate the user. - pub async fn authenticate( - self, - config: &ProxyConfig, - client: &mut PqStream, - ) -> super::Result { - // This method is just a convenient facade for `handle_user` - super::backend::handle_user(config, client, self).await +fn check_project_name(name: String) -> Result { + if name.chars().all(|c| c.is_alphanumeric() || c == '-') { + Ok(name) + } else { + Err(name) } } -/// Inferring project name from sni_data. 
-fn project_name_from_sni_data( - sni_data: &str, - common_name: &str, -) -> Result { - let common_name_with_dot = format!(".{common_name}"); - // check that ".{common_name_with_dot}" is the actual suffix in sni_data - if !sni_data.ends_with(&common_name_with_dot) { - return Err(ClientCredsParseError::InconsistentCommonNameAndSNI( - common_name.to_string(), - sni_data.to_string(), +fn subdomain_from_sni(sni: &str, common_name: &str) -> Option { + sni.strip_suffix(common_name)? + .strip_suffix('.') + .map(str::to_owned) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_options<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> StartupMessageParams { + StartupMessageParams::from(pairs.map(|(k, v)| (k.to_owned(), v.to_owned()))) + } + + #[test] + #[ignore = "TODO: fix how database is handled"] + fn parse_bare_minimum() -> anyhow::Result<()> { + // According to postgresql, only `user` should be required. + let options = make_options([("user", "john_doe")]); + + // TODO: check that `creds.dbname` is None. 
+ let creds = ClientCredentials::parse(options, None, None)?; + assert_eq!(creds.user, "john_doe"); + + Ok(()) + } + + #[test] + fn parse_missing_project() -> anyhow::Result<()> { + let options = make_options([("user", "john_doe"), ("database", "world")]); + + let creds = ClientCredentials::parse(options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project, None); + + Ok(()) + } + + #[test] + fn parse_project_from_sni() -> anyhow::Result<()> { + let options = make_options([("user", "john_doe"), ("database", "world")]); + + let sni = Some("foo.localhost"); + let common_name = Some("localhost"); + + let creds = ClientCredentials::parse(options, sni, common_name)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project.as_deref(), Some("foo")); + + Ok(()) + } + + #[test] + fn parse_project_from_options() -> anyhow::Result<()> { + let options = make_options([ + ("user", "john_doe"), + ("database", "world"), + ("project", "bar"), + ]); + + let creds = ClientCredentials::parse(options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project.as_deref(), Some("bar")); + + Ok(()) + } + + #[test] + fn parse_projects_identical() -> anyhow::Result<()> { + let options = make_options([ + ("user", "john_doe"), + ("database", "world"), + ("project", "baz"), + ]); + + let sni = Some("baz.localhost"); + let common_name = Some("localhost"); + + let creds = ClientCredentials::parse(options, sni, common_name)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project.as_deref(), Some("baz")); + + Ok(()) + } + + #[test] + fn parse_projects_different() { + let options = make_options([ + ("user", "john_doe"), + ("database", "world"), + ("project", "first"), + ]); + + let sni = Some("second.localhost"); + let common_name = Some("localhost"); + + assert!(matches!( + 
ClientCredentials::parse(options, sni, common_name).expect_err("should fail"), + ClientCredsParseError::InconsistentProjectNames(_, _) )); } - // return sni_data without the common name suffix. - Ok(sni_data - .strip_suffix(&common_name_with_dot) - .unwrap() - .to_string()) -} - -#[cfg(test)] -mod tests_for_project_name_from_sni_data { - use super::*; - - #[test] - fn passing() { - let target_project_name = "my-project-123"; - let common_name = "localtest.me"; - let sni_data = format!("{target_project_name}.{common_name}"); - assert_eq!( - project_name_from_sni_data(&sni_data, common_name), - Ok(target_project_name.to_string()) - ); - } - - #[test] - fn throws_inconsistent_common_name_and_sni_data() { - let target_project_name = "my-project-123"; - let common_name = "localtest.me"; - let wrong_suffix = "wrongtest.me"; - assert_eq!(common_name.len(), wrong_suffix.len()); - let wrong_common_name = format!("wrong{wrong_suffix}"); - let sni_data = format!("{target_project_name}.{wrong_common_name}"); - assert_eq!( - project_name_from_sni_data(&sni_data, common_name), - Err(ClientCredsParseError::InconsistentCommonNameAndSNI( - common_name.to_string(), - sni_data - )) - ); - } -} - -/// Determine project name from SNI or from project_name parameter from options argument. -fn get_project_name( - sni_data: Option<&str>, - common_name: Option<&str>, - project_name: Option<&str>, -) -> Result { - // determine the project name from sni_data if it exists, otherwise from project_name. 
- let ret = match sni_data { - Some(sni_data) => { - let common_name = common_name.ok_or(ClientCredsParseError::CommonNameNotSet)?; - let project_name_from_sni = project_name_from_sni_data(sni_data, common_name)?; - // check invariant: project name from options and from sni should match - if let Some(project_name) = &project_name { - if !project_name_from_sni.eq(project_name) { - return Err(ClientCredsParseError::InconsistentProjectNameAndSNI( - project_name_from_sni, - project_name.to_string(), - )); - } - } - project_name_from_sni - } - None => project_name - .ok_or(ClientCredsParseError::MissingSNIAndProjectName)? - .to_string(), - }; - - // check formatting invariant: project name must contain only alphanumeric characters and hyphens. - if !ret.chars().all(|x: char| x.is_alphanumeric() || x == '-') { - return Err(ClientCredsParseError::ProjectNameContainsIllegalChars(ret)); - } - - Ok(ret) -} - -#[cfg(test)] -mod tests_for_project_name_only { - use super::*; - - #[test] - fn passing_from_sni_data_only() { - let target_project_name = "my-project-123"; - let common_name = "localtest.me"; - let sni_data = format!("{target_project_name}.{common_name}"); - assert_eq!( - get_project_name(Some(&sni_data), Some(common_name), None), - Ok(target_project_name.to_string()) - ); - } - - #[test] - fn throws_project_name_contains_illegal_chars_from_sni_data_only() { - let project_name_prefix = "my-project"; - let project_name_suffix = "123"; - let common_name = "localtest.me"; - - for illegal_char_id in 0..256 { - let illegal_char = char::from_u32(illegal_char_id).unwrap(); - if !(illegal_char.is_alphanumeric() || illegal_char == '-') - && illegal_char.to_string().len() == 1 - { - let target_project_name = - format!("{project_name_prefix}{illegal_char}{project_name_suffix}"); - let sni_data = format!("{target_project_name}.{common_name}"); - assert_eq!( - get_project_name(Some(&sni_data), Some(common_name), None), - Err(ClientCredsParseError::ProjectNameContainsIllegalChars( 
- target_project_name - )) - ); - } - } - } - - #[test] - fn passing_from_project_name_only() { - let target_project_name = "my-project-123"; - let common_names = [Some("localtest.me"), None]; - for common_name in common_names { - assert_eq!( - get_project_name(None, common_name, Some(target_project_name)), - Ok(target_project_name.to_string()) - ); - } - } - - #[test] - fn throws_project_name_contains_illegal_chars_from_project_name_only() { - let project_name_prefix = "my-project"; - let project_name_suffix = "123"; - let common_names = [Some("localtest.me"), None]; - - for common_name in common_names { - for illegal_char_id in 0..256 { - let illegal_char: char = char::from_u32(illegal_char_id).unwrap(); - if !(illegal_char.is_alphanumeric() || illegal_char == '-') - && illegal_char.to_string().len() == 1 - { - let target_project_name = - format!("{project_name_prefix}{illegal_char}{project_name_suffix}"); - assert_eq!( - get_project_name(None, common_name, Some(&target_project_name)), - Err(ClientCredsParseError::ProjectNameContainsIllegalChars( - target_project_name - )) - ); - } - } - } - } - - #[test] - fn passing_from_sni_data_and_project_name() { - let target_project_name = "my-project-123"; - let common_name = "localtest.me"; - let sni_data = format!("{target_project_name}.{common_name}"); - assert_eq!( - get_project_name( - Some(&sni_data), - Some(common_name), - Some(target_project_name) - ), - Ok(target_project_name.to_string()) - ); - } - - #[test] - fn throws_inconsistent_project_name_and_sni() { - let project_name_param = "my-project-123"; - let wrong_project_name = "not-my-project-123"; - let common_name = "localtest.me"; - let sni_data = format!("{wrong_project_name}.{common_name}"); - assert_eq!( - get_project_name(Some(&sni_data), Some(common_name), Some(project_name_param)), - Err(ClientCredsParseError::InconsistentProjectNameAndSNI( - wrong_project_name.to_string(), - project_name_param.to_string() - )) - ); - } - - #[test] - fn 
throws_common_name_not_set() { - let target_project_name = "my-project-123"; - let wrong_project_name = "not-my-project-123"; - let common_name = "localtest.me"; - let sni_datas = [ - Some(format!("{wrong_project_name}.{common_name}")), - Some(format!("{target_project_name}.{common_name}")), - ]; - let project_names = [None, Some(target_project_name)]; - for sni_data in sni_datas { - for project_name_param in project_names { - assert_eq!( - get_project_name(sni_data.as_deref(), None, project_name_param), - Err(ClientCredsParseError::CommonNameNotSet) - ); - } - } - } - - #[test] - fn throws_inconsistent_common_name_and_sni_data() { - let target_project_name = "my-project-123"; - let wrong_project_name = "not-my-project-123"; - let common_name = "localtest.me"; - let wrong_suffix = "wrongtest.me"; - assert_eq!(common_name.len(), wrong_suffix.len()); - let wrong_common_name = format!("wrong{wrong_suffix}"); - let sni_datas = [ - Some(format!("{wrong_project_name}.{wrong_common_name}")), - Some(format!("{target_project_name}.{wrong_common_name}")), - ]; - let project_names = [None, Some(target_project_name)]; - for project_name_param in project_names { - for sni_data in &sni_datas { - assert_eq!( - get_project_name(sni_data.as_deref(), Some(common_name), project_name_param), - Err(ClientCredsParseError::InconsistentCommonNameAndSNI( - common_name.to_string(), - sni_data.clone().unwrap().to_string() - )) - ); - } - } - } } diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 7efff13bfc..705f1e3807 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,8 +1,7 @@ //! Main authentication flow. 
-use super::AuthErrorImpl; -use crate::stream::PqStream; -use crate::{sasl, scram}; +use super::{AuthErrorImpl, PasswordHackPayload}; +use crate::{sasl, scram, stream::PqStream}; use std::io; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; @@ -27,6 +26,17 @@ impl AuthMethod for Scram<'_> { } } +/// Use an ad hoc auth flow (for clients which don't support SNI) proposed in +/// . +pub struct PasswordHack; + +impl AuthMethod for PasswordHack { + #[inline(always)] + fn first_message(&self) -> BeMessage<'_> { + Be::AuthenticationCleartextPassword + } +} + /// This wrapper for [`PqStream`] performs client authentication. #[must_use] pub struct AuthFlow<'a, Stream, State> { @@ -57,13 +67,34 @@ impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { } } +impl AuthFlow<'_, S, PasswordHack> { + /// Perform user authentication. Raise an error in case authentication failed. + pub async fn authenticate(self) -> super::Result { + let msg = self.stream.read_password_message().await?; + let password = msg + .strip_suffix(&[0]) + .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + + // The so-called "password" should contain a base64-encoded json. + // We will use it later to route the client to their project. + let bytes = base64::decode(password) + .map_err(|_| AuthErrorImpl::MalformedPassword("bad encoding"))?; + + let payload = serde_json::from_slice(&bytes) + .map_err(|_| AuthErrorImpl::MalformedPassword("invalid payload"))?; + + Ok(payload) + } +} + /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. pub async fn authenticate(self) -> super::Result { // Initial client message contains the chosen auth method's name. 
let msg = self.stream.read_password_message().await?; - let sasl = sasl::FirstMessage::parse(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; + let sasl = sasl::FirstMessage::parse(&msg) + .ok_or(AuthErrorImpl::MalformedPassword("bad sasl message"))?; // Currently, the only supported SASL method is SCRAM. if !scram::METHODS.contains(&sasl.method) { diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs new file mode 100644 index 0000000000..6a1258ab31 --- /dev/null +++ b/proxy/src/auth/password_hack.rs @@ -0,0 +1,102 @@ +//! Payload for ad hoc authentication method for clients that don't support SNI. +//! See the `impl` for [`super::backend::BackendType`]. +//! Read more: . + +use serde::{de, Deserialize, Deserializer}; +use std::fmt; + +#[derive(Deserialize)] +#[serde(untagged)] +pub enum Password { + /// A regular string for utf-8 encoded passwords. + Simple { password: String }, + + /// Password is base64-encoded because it may contain arbitrary byte sequences. + Encoded { + #[serde(rename = "password_", deserialize_with = "deserialize_base64")] + password: Vec, + }, +} + +impl AsRef<[u8]> for Password { + fn as_ref(&self) -> &[u8] { + match self { + Password::Simple { password } => password.as_ref(), + Password::Encoded { password } => password.as_ref(), + } + } +} + +#[derive(Deserialize)] +pub struct PasswordHackPayload { + pub project: String, + + #[serde(flatten)] + pub password: Password, +} + +fn deserialize_base64<'a, D: Deserializer<'a>>(des: D) -> Result, D::Error> { + // It's very tempting to replace this with + // + // ``` + // let base64: &str = Deserialize::deserialize(des)?; + // base64::decode(base64).map_err(serde::de::Error::custom) + // ``` + // + // Unfortunately, we can't always deserialize into `&str`, so we'd + // have to use an allocating `String` instead. Thus, visitor is better. 
+ struct Visitor; + + impl<'de> de::Visitor<'de> for Visitor { + type Value = Vec; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("a string") + } + + fn visit_str(self, v: &str) -> Result { + base64::decode(v).map_err(de::Error::custom) + } + } + + des.deserialize_str(Visitor) +} + +#[cfg(test)] +mod tests { + use super::*; + use rstest::rstest; + use serde_json::json; + + #[test] + fn parse_password() -> anyhow::Result<()> { + let password: Password = serde_json::from_value(json!({ + "password": "foo", + }))?; + assert_eq!(password.as_ref(), "foo".as_bytes()); + + let password: Password = serde_json::from_value(json!({ + "password_": base64::encode("foo"), + }))?; + assert_eq!(password.as_ref(), "foo".as_bytes()); + + Ok(()) + } + + #[rstest] + #[case("password", str::to_owned)] + #[case("password_", base64::encode)] + fn parse(#[case] key: &str, #[case] encode: fn(&'static str) -> String) -> anyhow::Result<()> { + let (password, project) = ("password", "pie-in-the-sky"); + let payload = json!({ + "project": project, + key: encode(password), + }); + + let payload: PasswordHackPayload = serde_json::from_value(payload)?; + assert_eq!(payload.password.as_ref(), password.as_bytes()); + assert_eq!(payload.project, project); + + Ok(()) + } +} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index cccd6e60d4..896ef3588d 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,8 +1,6 @@ -use crate::auth::DatabaseInfo; -use crate::cancellation::CancelClosure; -use crate::error::UserFacingError; -use std::io; -use std::net::SocketAddr; +use crate::{cancellation::CancelClosure, error::UserFacingError}; +use futures::TryFutureExt; +use std::{io, net::SocketAddr}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::NoTls; @@ -21,44 +19,96 @@ pub enum ConnectionError { FailedToFetchPgVersion, } -impl UserFacingError for ConnectionError {} - -/// PostgreSQL version as [`String`]. 
-pub type Version = String; +impl UserFacingError for ConnectionError { + fn to_string_client(&self) -> String { + use ConnectionError::*; + match self { + // This helps us drop irrelevant library-specific prefixes. + // TODO: propagate severity level and other parameters. + Postgres(err) => match err.as_db_error() { + Some(err) => err.message().to_string(), + None => err.to_string(), + }, + other => other.to_string(), + } + } +} /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; -/// Compute node connection params. +pub type ComputeConnCfg = tokio_postgres::Config; + +/// Various compute node info for establishing connection etc. pub struct NodeInfo { - pub db_info: DatabaseInfo, - pub scram_keys: Option, + /// Did we send [`utils::pq_proto::BeMessage::AuthenticationOk`]? + pub reported_auth_ok: bool, + /// Compute node connection params. + pub config: tokio_postgres::Config, } impl NodeInfo { async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { - let host_port = (self.db_info.host.as_str(), self.db_info.port); - let socket = TcpStream::connect(host_port).await?; - let socket_addr = socket.peer_addr()?; - socket2::SockRef::from(&socket).set_keepalive(true)?; + use tokio_postgres::config::Host; - Ok((socket_addr, socket)) + let connect_once = |host, port| { + TcpStream::connect((host, port)).and_then(|socket| async { + let socket_addr = socket.peer_addr()?; + // This prevents load balancer from severing the connection. + socket2::SockRef::from(&socket).set_keepalive(true)?; + Ok((socket_addr, socket)) + }) + }; + + // We can't reuse connection establishing logic from `tokio_postgres` here, + // because it has no means for extracting the underlying socket which we + // require for our business. 
+ let mut connection_error = None; + let ports = self.config.get_ports(); + for (i, host) in self.config.get_hosts().iter().enumerate() { + let port = ports.get(i).or_else(|| ports.get(0)).unwrap_or(&5432); + let host = match host { + Host::Tcp(host) => host.as_str(), + Host::Unix(_) => continue, // unix sockets are not welcome here + }; + + // TODO: maybe we should add a timeout. + match connect_once(host, *port).await { + Ok(socket) => return Ok(socket), + Err(err) => { + // We can't throw an error here, as there might be more hosts to try. + println!("failed to connect to compute `{host}:{port}`: {err}"); + connection_error = Some(err); + } + } + } + + Err(connection_error.unwrap_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + format!("couldn't connect: bad compute config: {:?}", self.config), + ) + })) } +} +pub struct PostgresConnection { + /// Socket connected to a compute node. + pub stream: TcpStream, + /// PostgreSQL version of this instance. + pub version: String, +} + +impl NodeInfo { /// Connect to a corresponding compute node. - pub async fn connect(self) -> Result<(TcpStream, Version, CancelClosure), ConnectionError> { - let (socket_addr, mut socket) = self + pub async fn connect(&self) -> Result<(PostgresConnection, CancelClosure), ConnectionError> { + let (socket_addr, mut stream) = self .connect_raw() .await .map_err(|_| ConnectionError::FailedToConnectToCompute)?; - let mut config = tokio_postgres::Config::from(self.db_info); - if let Some(scram_keys) = self.scram_keys { - config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(scram_keys)); - } - // TODO: establish a secure connection to the DB - let (client, conn) = config.connect_raw(&mut socket, NoTls).await?; + let (client, conn) = self.config.connect_raw(&mut stream, NoTls).await?; let version = conn .parameter("server_version") .ok_or(ConnectionError::FailedToFetchPgVersion)? 
@@ -66,6 +116,8 @@ impl NodeInfo { let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); - Ok((socket, version, cancel_closure)) + let db = PostgresConnection { stream, version }; + + Ok((db, cancel_closure)) } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index df3923de1a..1f01c25734 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,28 +1,16 @@ -use crate::url::ApiUrl; +use crate::{auth, url::ApiUrl}; use anyhow::{bail, ensure, Context}; use std::{str::FromStr, sync::Arc}; -#[derive(Debug)] -pub enum AuthBackendType { - /// Legacy Cloud API (V1). - LegacyConsole, - /// Authentication via a web browser. - Link, - /// Current Cloud API (V2). - Console, - /// Local mock of Cloud API (V2). - Postgres, -} - -impl FromStr for AuthBackendType { +impl FromStr for auth::BackendType<()> { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { - use AuthBackendType::*; + use auth::BackendType::*; Ok(match s { - "legacy" => LegacyConsole, - "console" => Console, - "postgres" => Postgres, + "legacy" => LegacyConsole(()), + "console" => Console(()), + "postgres" => Postgres(()), "link" => Link, _ => bail!("Invalid option `{s}` for auth method"), }) @@ -31,7 +19,11 @@ impl FromStr for AuthBackendType { pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: AuthBackendType, + pub auth_backend: auth::BackendType<()>, + pub auth_urls: AuthUrls, +} + +pub struct AuthUrls { pub auth_endpoint: ApiUrl, pub auth_link_uri: ApiUrl, } @@ -87,10 +79,8 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result>) -> io::Error { + io::Error::new(io::ErrorKind::Other, e) +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index b68b2440dd..2521f2af21 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -118,11 +118,15 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; let http_address: SocketAddr = 
arg_matches.value_of("http").unwrap().parse()?; + let auth_urls = config::AuthUrls { + auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, + auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, + }; + let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend: arg_matches.value_of("auth-backend").unwrap().parse()?, - auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, - auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, + auth_urls, })); println!("Version: {GIT_VERSION}"); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 7e364b5e9c..f202782109 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -82,11 +82,22 @@ async fn handle_client( } let tls = config.tls_config.as_ref(); - let (stream, creds) = match handshake(stream, tls, cancel_map).await? { + let (mut stream, params) = match handshake(stream, tls, cancel_map).await? { Some(x) => x, None => return Ok(()), // it's a cancellation request }; + let creds = { + let sni = stream.get_ref().sni_hostname(); + let common_name = tls.and_then(|tls| tls.common_name.as_deref()); + let result = config + .auth_backend + .map(|_| auth::ClientCredentials::parse(params, sni, common_name)) + .transpose(); + + async { result }.or_else(|e| stream.throw_error(e)).await? 
+ }; + let client = Client::new(stream, creds); cancel_map .with_session(|session| client.connect_to_db(config, session)) @@ -101,12 +112,10 @@ async fn handshake( stream: S, mut tls: Option<&TlsConfig>, cancel_map: &CancelMap, -) -> anyhow::Result>, auth::ClientCredentials)>> { +) -> anyhow::Result>, StartupMessageParams)>> { // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); - let common_name = tls.and_then(|cfg| cfg.common_name.as_deref()); - let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; @@ -147,18 +156,7 @@ async fn handshake( stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; } - // Get SNI info when available - let sni_data = match stream.get_ref() { - Stream::Tls { tls } => tls.get_ref().1.sni_hostname().map(|s| s.to_owned()), - _ => None, - }; - - // Construct credentials - let creds = - auth::ClientCredentials::parse(params, sni_data.as_deref(), common_name); - let creds = async { creds }.or_else(|e| stream.throw_error(e)).await?; - - break Ok(Some((stream, creds))); + break Ok(Some((stream, params))); } CancelRequest(cancel_key_data) => { cancel_map.cancel_session(cancel_key_data).await?; @@ -174,12 +172,12 @@ struct Client { /// The underlying libpq protocol stream. stream: PqStream, /// Client credentials that we care about. - creds: auth::ClientCredentials, + creds: auth::BackendType, } impl Client { /// Construct a new connection context. - fn new(stream: PqStream, creds: auth::ClientCredentials) -> Self { + fn new(stream: PqStream, creds: auth::BackendType) -> Self { Self { stream, creds } } } @@ -194,16 +192,22 @@ impl Client { let Self { mut stream, creds } = self; // Authenticate and connect to a compute node. 
- let auth = creds.authenticate(config, &mut stream).await; + let auth = creds.authenticate(&config.auth_urls, &mut stream).await; let node = async { auth }.or_else(|e| stream.throw_error(e)).await?; - let (db, version, cancel_closure) = - node.connect().or_else(|e| stream.throw_error(e)).await?; + let (db, cancel_closure) = node.connect().or_else(|e| stream.throw_error(e)).await?; let cancel_key_data = session.enable_cancellation(cancel_closure); + // Report authentication success if we haven't done this already. + if !node.reported_auth_ok { + stream + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())?; + } + stream .write_message_noflush(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion(&version), + BeParameterStatusMessage::ServerVersion(&db.version), ))? .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? .write_message(&BeMessage::ReadyForQuery) @@ -217,7 +221,7 @@ impl Client { } // Starting from here we only proxy the client's traffic. - let mut db = MetricsStream::new(db, inc_proxied); + let mut db = MetricsStream::new(db.stream, inc_proxied); let mut client = MetricsStream::new(stream.into_inner(), inc_proxied); let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; @@ -279,9 +283,13 @@ mod tests { let config = rustls::ServerConfig::builder() .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![cert], key)?; + .with_single_cert(vec![cert], key)? 
+ .into(); - config.into() + TlsConfig { + config, + common_name: Some(common_name.to_string()), + } }; let client_config = { @@ -297,11 +305,6 @@ mod tests { ClientConfig { config, hostname } }; - let tls_config = TlsConfig { - config: tls_config, - common_name: Some(common_name.to_string()), - }; - Ok((client_config, tls_config)) } @@ -357,7 +360,7 @@ mod tests { auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let cancel_map = CancelMap::default(); - let (mut stream, _creds) = handshake(client, tls.as_ref(), &cancel_map) + let (mut stream, _params) = handshake(client, tls.as_ref(), &cancel_map) .await? .context("handshake failed")?; @@ -436,32 +439,6 @@ mod tests { proxy.await? } - #[tokio::test] - async fn give_user_an_error_for_bad_creds() -> anyhow::Result<()> { - let (client, server) = tokio::io::duplex(1024); - - let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); - - let client_err = tokio_postgres::Config::new() - .ssl_mode(SslMode::Disable) - .connect_raw(server, NoTls) - .await - .err() // -> Option - .context("client shouldn't be able to connect")?; - - // TODO: this is ugly, but `format!` won't allow us to extract fmt string - assert!(client_err.to_string().contains("missing in startup packet")); - - let server_err = proxy - .await? - .err() // -> Option - .context("server shouldn't accept client")?; - - assert!(client_err.to_string().contains(&server_err.to_string())); - - Ok(()) - } - #[tokio::test] async fn keepalive_is_inherited() -> anyhow::Result<()> { use tokio::net::{TcpListener, TcpStream}; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 42b0185fde..54ff8bcc07 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -145,6 +145,14 @@ impl Stream { pub fn from_raw(raw: S) -> Self { Self::Raw { raw } } + + /// Return SNI hostname when it's available. + pub fn sni_hostname(&self) -> Option<&str> { + match self { + Stream::Raw { .. 
} => None, + Stream::Tls { tls } => tls.get_ref().1.sni_hostname(), + } + } } #[derive(Debug, Error)] diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py index ebeede8df7..92c8475e69 100644 --- a/test_runner/batch_others/test_proxy.py +++ b/test_runner/batch_others/test_proxy.py @@ -1,8 +1,34 @@ import pytest +import json +import base64 def test_proxy_select_1(static_proxy): - static_proxy.safe_psql("select 1;", options="project=generic-project-name") + static_proxy.safe_psql('select 1', options='project=generic-project-name') + + +def test_password_hack(static_proxy): + user = 'borat' + password = 'password' + static_proxy.safe_psql(f"create role {user} with login password '{password}'", + options='project=irrelevant') + + def encode(s: str) -> str: + return base64.b64encode(s.encode('utf-8')).decode('utf-8') + + magic = encode(json.dumps({ + 'project': 'irrelevant', + 'password': password, + })) + + static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic) + + magic = encode(json.dumps({ + 'project': 'irrelevant', + 'password_': encode(password), + })) + + static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic) # Pass extra options to the server. 
@@ -11,8 +37,8 @@ def test_proxy_select_1(static_proxy): # See https://github.com/neondatabase/neon/issues/1287 @pytest.mark.xfail def test_proxy_options(static_proxy): - with static_proxy.connect(options="-cproxytest.option=value") as conn: + with static_proxy.connect(options='-cproxytest.option=value') as conn: with conn.cursor() as cur: - cur.execute("SHOW proxytest.option;") + cur.execute('SHOW proxytest.option') value = cur.fetchall()[0][0] assert value == 'value' diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3a6a233208..b1fba29e3b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -30,7 +30,7 @@ from dataclasses import dataclass # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import make_dsn, parse_dsn -from typing import Any, Callable, Dict, Iterator, List, Optional, Type, TypeVar, cast, Union, Tuple +from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple from typing_extensions import Literal import requests @@ -280,20 +280,18 @@ class PgProtocol: return str(make_dsn(**self.conn_options(**kwargs))) def conn_options(self, **kwargs): - conn_options = self.default_options.copy() + result = self.default_options.copy() if 'dsn' in kwargs: - conn_options.update(parse_dsn(kwargs['dsn'])) - conn_options.update(kwargs) + result.update(parse_dsn(kwargs['dsn'])) + result.update(kwargs) # Individual statement timeout in seconds. 2 minutes should be # enough for our tests, but if you need a longer, you can # change it by calling "SET statement_timeout" after # connecting. 
- if 'options' in conn_options: - conn_options['options'] = f"-cstatement_timeout=120s " + conn_options['options'] - else: - conn_options['options'] = "-cstatement_timeout=120s" - return conn_options + options = result.get('options', '') + result['options'] = f'-cstatement_timeout=120s {options}' + return result # autocommit=True here by default because that's what we need most of the time def connect(self, autocommit=True, **kwargs) -> PgConnection: @@ -1514,29 +1512,25 @@ def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: class NeonProxy(PgProtocol): - def __init__(self, port: int, pg_port: int): - super().__init__(host="127.0.0.1", - user="proxy_user", - password="pytest2", - port=port, - dbname='postgres') - self.http_port = 7001 - self.host = "127.0.0.1" - self.port = port - self.pg_port = pg_port + def __init__(self, proxy_port: int, http_port: int, auth_endpoint: str): + super().__init__(dsn=auth_endpoint, port=proxy_port) + self.host = '127.0.0.1' + self.http_port = http_port + self.proxy_port = proxy_port + self.auth_endpoint = auth_endpoint self._popen: Optional[subprocess.Popen[bytes]] = None def start(self) -> None: assert self._popen is None # Start proxy - bin_proxy = os.path.join(str(neon_binpath), 'proxy') - args = [bin_proxy] - args.extend(["--http", f"{self.host}:{self.http_port}"]) - args.extend(["--proxy", f"{self.host}:{self.port}"]) - args.extend(["--auth-backend", "postgres"]) - args.extend( - ["--auth-endpoint", f"postgres://proxy_auth:pytest1@localhost:{self.pg_port}/postgres"]) + args = [ + os.path.join(str(neon_binpath), 'proxy'), + *["--http", f"{self.host}:{self.http_port}"], + *["--proxy", f"{self.host}:{self.proxy_port}"], + *["--auth-backend", "postgres"], + *["--auth-endpoint", self.auth_endpoint], + ] self._popen = subprocess.Popen(args) self._wait_until_ready() @@ -1557,13 +1551,21 @@ class NeonProxy(PgProtocol): @pytest.fixture(scope='function') def static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]: 
"""Neon proxy that routes directly to vanilla postgres.""" - vanilla_pg.start() - vanilla_pg.safe_psql("create user proxy_auth with password 'pytest1' superuser") - vanilla_pg.safe_psql("create user proxy_user with password 'pytest2'") - port = port_distributor.get_port() - pg_port = vanilla_pg.default_options['port'] - with NeonProxy(port, pg_port) as proxy: + # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql` + vanilla_pg.start() + vanilla_pg.safe_psql("create user proxy with login superuser password 'password'") + + port = vanilla_pg.default_options['port'] + host = vanilla_pg.default_options['host'] + dbname = vanilla_pg.default_options['dbname'] + auth_endpoint = f'postgres://proxy:password@{host}:{port}/{dbname}' + + proxy_port = port_distributor.get_port() + http_port = port_distributor.get_port() + + with NeonProxy(proxy_port=proxy_port, http_port=http_port, + auth_endpoint=auth_endpoint) as proxy: proxy.start() yield proxy From 45680f9a2d36d3c14ed1daa20565d849a53aa80f Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 25 Jul 2022 18:30:30 +0300 Subject: [PATCH 19/29] Drop CircleCI runs (#2082) --- .circleci/config.yml | 369 ------------------ .../actions/run-python-test-set/action.yml | 2 +- Dockerfile.compute-tools | 2 +- .../batch_others/test_wal_acceptor_async.py | 5 +- 4 files changed, 4 insertions(+), 374 deletions(-) delete mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 00a51eb906..0000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,369 +0,0 @@ -version: 2.1 - -executors: - neon-xlarge-executor: - resource_class: xlarge - docker: - # NB: when changed, do not forget to update rust image tag in all Dockerfiles - - image: neondatabase/rust:1.58 - neon-executor: - docker: - - image: neondatabase/rust:1.58 - -jobs: - # A job to build postgres - build-postgres: - executor: neon-xlarge-executor - parameters: - build_type: - type: enum - enum: 
["debug", "release"] - environment: - BUILD_TYPE: << parameters.build_type >> - steps: - # Checkout the git repo (circleci doesn't have a flag to enable submodules here) - - checkout - - # Grab the postgres git revision to build a cache key. - # Append makefile as it could change the way postgres is built. - # Note this works even though the submodule hasn't been checkout out yet. - - run: - name: Get postgres cache key - command: | - git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres - cat Makefile >> /tmp/cache-key-postgres - - - restore_cache: - name: Restore postgres cache - keys: - # Restore ONLY if the rev key matches exactly - - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - - # Build postgres if the restore_cache didn't find a build. - # `make` can't figure out whether the cache is valid, since - # it only compares file timestamps. - - run: - name: build postgres - command: | - if [ ! -e tmp_install/bin/postgres ]; then - # "depth 1" saves some time by not cloning the whole repo - git submodule update --init --depth 1 - # bail out on any warnings - COPT='-Werror' mold -run make postgres -j$(nproc) - fi - - - save_cache: - name: Save postgres cache - key: v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - paths: - - tmp_install - - # A job to build Neon rust code - build-neon: - executor: neon-xlarge-executor - parameters: - build_type: - type: enum - enum: ["debug", "release"] - environment: - BUILD_TYPE: << parameters.build_type >> - steps: - # Checkout the git repo (without submodules) - - checkout - - # Grab the postgres git revision to build a cache key. - # Append makefile as it could change the way postgres is built. - # Note this works even though the submodule hasn't been checkout out yet. 
- - run: - name: Get postgres cache key - command: | - git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres - cat Makefile >> /tmp/cache-key-postgres - - - - restore_cache: - name: Restore postgres cache - keys: - # Restore ONLY if the rev key matches exactly - - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - - - restore_cache: - name: Restore rust cache - keys: - # Require an exact match. While an out of date cache might speed up the build, - # there's no way to clean out old packages, so the cache grows every time something - # changes. - - v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} - - # Build the rust code, including test binaries - - run: - name: Rust build << parameters.build_type >> - command: | - if [[ $BUILD_TYPE == "debug" ]]; then - CARGO_FLAGS= - elif [[ $BUILD_TYPE == "release" ]]; then - CARGO_FLAGS="--release --features profiling" - fi - - export CARGO_INCREMENTAL=0 - export CACHEPOT_BUCKET=zenith-rust-cachepot - export RUSTC_WRAPPER="" - export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" - export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" - mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests - cachepot -s - - - save_cache: - name: Save rust cache - key: v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} - paths: - - ~/.cargo/registry - - ~/.cargo/git - - target - - # Run rust unit tests - - run: - name: cargo test - command: | - if [[ $BUILD_TYPE == "debug" ]]; then - CARGO_FLAGS= - elif [[ $BUILD_TYPE == "release" ]]; then - CARGO_FLAGS=--release - fi - - cargo test $CARGO_FLAGS - - # Install the rust binaries, for use by test jobs - - run: - name: Install rust binaries - command: | - binaries=$( - cargo metadata --format-version=1 --no-deps | - jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' - ) - - mkdir -p /tmp/zenith/bin - mkdir -p /tmp/zenith/test_bin - mkdir -p 
/tmp/zenith/etc - - # Install target binaries - for bin in $binaries; do - SRC=target/$BUILD_TYPE/$bin - DST=/tmp/zenith/bin/$bin - cp $SRC $DST - done - - # Install the postgres binaries, for use by test jobs - - run: - name: Install postgres binaries - command: | - cp -a tmp_install /tmp/zenith/pg_install - - # Save rust binaries for other jobs in the workflow - - persist_to_workspace: - root: /tmp/zenith - paths: - - "*" - - check-codestyle-python: - executor: neon-executor - steps: - - checkout - - restore_cache: - keys: - - v2-python-deps-{{ checksum "poetry.lock" }} - - run: - name: Install deps - command: ./scripts/pysync - - save_cache: - key: v2-python-deps-{{ checksum "poetry.lock" }} - paths: - - /home/circleci/.cache/pypoetry/virtualenvs - - run: - name: Print versions - when: always - command: | - poetry run python --version - poetry show - - run: - name: Run yapf to ensure code format - when: always - command: poetry run yapf --recursive --diff . - - run: - name: Run mypy to check types - when: always - command: poetry run mypy . - - run-pytest: - executor: neon-executor - parameters: - # pytest args to specify the tests to run. - # - # This can be a test file name, e.g. 'test_pgbench.py, or a subdirectory, - # or '-k foobar' to run tests containing string 'foobar'. See pytest man page - # section SPECIFYING TESTS / SELECTING TESTS for details. - # - # Select the type of Rust build. Must be "release" or "debug". - build_type: - type: string - default: "debug" - # This parameter is required, to prevent the mistake of running all tests in one job. - test_selection: - type: string - default: "" - # Arbitrary parameters to pytest. 
For example "-s" to prevent capturing stdout/stderr - extra_params: - type: string - default: "" - needs_postgres_source: - type: boolean - default: false - run_in_parallel: - type: boolean - default: true - save_perf_report: - type: boolean - default: false - environment: - BUILD_TYPE: << parameters.build_type >> - steps: - - attach_workspace: - at: /tmp/zenith - - checkout - - when: - condition: << parameters.needs_postgres_source >> - steps: - - run: git submodule update --init --depth 1 - - restore_cache: - keys: - - v2-python-deps-{{ checksum "poetry.lock" }} - - run: - name: Install deps - command: ./scripts/pysync - - save_cache: - key: v2-python-deps-{{ checksum "poetry.lock" }} - paths: - - /home/circleci/.cache/pypoetry/virtualenvs - - run: - name: Run pytest - # pytest doesn't output test logs in real time, so CI job may fail with - # `Too long with no output` error, if a test is running for a long time. - # In that case, tests should have internal timeouts that are less than - # no_output_timeout, specified here. - no_output_timeout: 10m - environment: - - NEON_BIN: /tmp/zenith/bin - - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install - - TEST_OUTPUT: /tmp/test_output - # this variable will be embedded in perf test report - # and is needed to distinguish different environments - - PLATFORM: zenith-local-ci - command: | - PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)" - rm -rf $PERF_REPORT_DIR - - TEST_SELECTION="test_runner/<< parameters.test_selection >>" - EXTRA_PARAMS="<< parameters.extra_params >>" - if [ -z "$TEST_SELECTION" ]; then - echo "test_selection must be set" - exit 1 - fi - if << parameters.run_in_parallel >>; then - EXTRA_PARAMS="-n4 $EXTRA_PARAMS" - fi - if << parameters.save_perf_report >>; then - if [[ $CIRCLE_BRANCH == "main" ]]; then - mkdir -p "$PERF_REPORT_DIR" - EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" - fi - fi - - export GITHUB_SHA=$CIRCLE_SHA1 - - # Run the tests. 
- # - # The junit.xml file allows CircleCI to display more fine-grained test information - # in its "Tests" tab in the results page. - # --verbose prints name of each test (helpful when there are - # multiple tests in one file) - # -rA prints summary in the end - # -n4 uses four processes to run tests via pytest-xdist - # -s is not used to prevent pytest from capturing output, because tests are running - # in parallel and logs are mixed between different tests - ./scripts/pytest \ - --junitxml=$TEST_OUTPUT/junit.xml \ - --tb=short \ - --verbose \ - -m "not remote_cluster" \ - -rA $TEST_SELECTION $EXTRA_PARAMS - - if << parameters.save_perf_report >>; then - if [[ $CIRCLE_BRANCH == "main" ]]; then - export REPORT_FROM="$PERF_REPORT_DIR" - export REPORT_TO=local - scripts/generate_and_push_perf_report.sh - fi - fi - - run: - # CircleCI artifacts are preserved one file at a time, so skipping - # this step isn't a good idea. If you want to extract the - # pageserver state, perhaps a tarball would be a better idea. - name: Delete all data but logs - when: always - command: | - du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete - du -sh /tmp/test_output/* - - store_artifacts: - path: /tmp/test_output - # The store_test_results step tells CircleCI where to find the junit.xml file. 
- - store_test_results: - path: /tmp/test_output - # Save data (if any) - - persist_to_workspace: - root: /tmp/zenith - paths: - - "*" - -workflows: - build_and_test: - jobs: - - check-codestyle-python - - build-postgres: - name: build-postgres-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - - build-neon: - name: build-neon-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - requires: - - build-postgres-<< matrix.build_type >> - - run-pytest: - name: pg_regress-tests-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - test_selection: batch_pg_regress - needs_postgres_source: true - requires: - - build-neon-<< matrix.build_type >> - - run-pytest: - name: other-tests-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - test_selection: batch_others - requires: - - build-neon-<< matrix.build_type >> - - run-pytest: - name: benchmarks - context: PERF_TEST_RESULT_CONNSTR - build_type: release - test_selection: performance - run_in_parallel: false - save_perf_report: true - requires: - - build-neon-release diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 0d058d47c1..a956929d92 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -99,7 +99,7 @@ runs: # Run the tests. # - # The junit.xml file allows CircleCI to display more fine-grained test information + # The junit.xml file allows CI tools to display more fine-grained test information # in its "Tests" tab in the results page. 
# --verbose prints name of each test (helpful when there are # multiple tests in one file) diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 87b73e139c..76cbc2ac30 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,5 +1,5 @@ # First transient image to build compute_tools binaries -# NB: keep in sync with rust image version in .circle/config.yml +# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml FROM neondatabase/rust:1.58 AS rust-build # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index 9577c0980e..bf7d8e3645 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -146,9 +146,8 @@ async def run_restarts_under_load(env: NeonEnv, max_transfer=100, period_time=4, iterations=10): - # Set timeout for this test at 5 minutes. It should be enough for test to complete - # and less than CircleCI's no_output_timeout, taking into account that this timeout - # is checked only at the beginning of every iteration. + # Set timeout for this test at 5 minutes. It should be enough for test to complete, + # taking into account that this timeout is checked only at the beginning of every iteration. 
test_timeout_at = time.monotonic() + 5 * 60 pg_conn = await pg.connect_async() From 28243d68e60ffc7e69f158522f589f7d2e09186d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 26 Jul 2022 09:11:10 +0300 Subject: [PATCH 20/29] Yet another approach of copying logical timeline size during branch creation (#2139) * Yet another approach of copying logical timeline size during branch creation * Fix unit tests * Update pageserver/src/layered_repository.rs Co-authored-by: Thang Pham * Update pageserver/src/layered_repository.rs Co-authored-by: Thang Pham * Update pageserver/src/layered_repository.rs Co-authored-by: Thang Pham Co-authored-by: Thang Pham --- pageserver/src/layered_repository.rs | 44 +++++++++++++++++++++++++--- pageserver/src/pgdatadir_mapping.rs | 6 ++++ pageserver/src/tenant_mgr.rs | 8 +++-- 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 3830e4c1bd..5c65b5dc7e 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -259,6 +259,7 @@ impl Repository for LayeredRepository { self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, + None, ); timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); @@ -323,6 +324,20 @@ impl Repository for LayeredRepository { )); } } + // Copy logical size from source timeline if we are branching on the last position. + let init_logical_size = + if let Ok(src_pgdir) = tenant_mgr::get_local_timeline_with_load(self.tenant_id, src) { + let logical_size = src_pgdir.get_current_logical_size(); + // Check LSN after getting logical size to exclude race condition + // when ancestor timeline is concurrently updated + if src_timeline.get_last_record_lsn() == start_lsn { + Some(logical_size) + } else { + None + } + } else { + None + }; // Determine prev-LSN for the new timeline. 
We can only determine it if // the timeline was branched at the current end of the source timeline. @@ -353,7 +368,14 @@ impl Repository for LayeredRepository { ); crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; Self::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; - timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata }); + timelines.insert( + dst, + LayeredTimelineEntry::Unloaded { + id: dst, + metadata, + init_logical_size, + }, + ); info!("branched timeline {} from {} at {}", dst, src, start_lsn); @@ -489,7 +511,7 @@ impl Repository for LayeredRepository { // we need to get metadata of a timeline, another option is to pass it along with Downloaded status let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; // finally we make newly downloaded timeline visible to repository - entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, }) + entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, init_logical_size: None }) }, }; Ok(()) @@ -506,6 +528,7 @@ enum LayeredTimelineEntry { Unloaded { id: ZTimelineId, metadata: TimelineMetadata, + init_logical_size: Option, }, } @@ -673,13 +696,18 @@ impl LayeredRepository { timelineid: ZTimelineId, timelines: &mut HashMap, ) -> anyhow::Result>> { + let logical_size: Option; match timelines.get(&timelineid) { Some(entry) => match entry { LayeredTimelineEntry::Loaded(local_timeline) => { debug!("timeline {} found loaded into memory", &timelineid); return Ok(Some(Arc::clone(local_timeline))); } - LayeredTimelineEntry::Unloaded { .. } => {} + LayeredTimelineEntry::Unloaded { + init_logical_size, .. 
+ } => { + logical_size = *init_logical_size; + } }, None => { debug!("timeline {} not found", &timelineid); @@ -690,7 +718,7 @@ impl LayeredRepository { "timeline {} found on a local disk, but not loaded into the memory, loading", &timelineid ); - let timeline = self.load_local_timeline(timelineid, timelines)?; + let timeline = self.load_local_timeline(timelineid, timelines, logical_size)?; let was_loaded = timelines.insert( timelineid, LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), @@ -707,6 +735,7 @@ impl LayeredRepository { &self, timeline_id: ZTimelineId, timelines: &mut HashMap, + init_logical_size: Option, ) -> anyhow::Result> { let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) .context("failed to load metadata")?; @@ -733,6 +762,7 @@ impl LayeredRepository { self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, + init_logical_size, ); timeline .load_layer_map(disk_consistent_lsn) @@ -1099,6 +1129,10 @@ pub struct LayeredTimeline { // It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn", // though lets keep them both for better error visibility. initdb_lsn: Lsn, + + // Initial logical size of timeline (if known). 
+ // Logical size can be copied from ancestor timeline when new branch is create at last LSN + pub init_logical_size: Option, } /// @@ -1299,6 +1333,7 @@ impl LayeredTimeline { tenant_id: ZTenantId, walredo_mgr: Arc, upload_layers: bool, + init_logical_size: Option, ) -> LayeredTimeline { let reconstruct_time_histo = RECONSTRUCT_TIME .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) @@ -1377,6 +1412,7 @@ impl LayeredTimeline { latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), + init_logical_size, } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 788c9de29e..f703fa16af 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -76,6 +76,12 @@ impl DatadirTimeline { Ok(()) } + /// Set timeline logical size. + pub fn set_logical_size(&self, size: usize) { + self.current_logical_size + .store(size as isize, Ordering::SeqCst); + } + /// Start ingesting a WAL record, or other atomic modification of /// the timeline. 
/// diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 1759d3bbb8..a485e7c2cb 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -494,12 +494,16 @@ fn load_local_timeline( format!("Inmem timeline {timeline_id} not found in tenant's repository") })?; let repartition_distance = repo.get_checkpoint_distance() / 10; + let init_logical_size = inmem_timeline.init_logical_size; let page_tline = Arc::new(DatadirTimelineImpl::new( inmem_timeline, repartition_distance, )); - page_tline.init_logical_size()?; - + if let Some(logical_size) = init_logical_size { + page_tline.set_logical_size(logical_size); + } else { + page_tline.init_logical_size()?; + } tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach { id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id), datadir: Arc::clone(&page_tline), From 172314155e4bedd17904cf9eb3b49598fc3abfd1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 26 Jul 2022 00:59:14 +0300 Subject: [PATCH 21/29] Compact only once on psql checkpoint call --- pageserver/src/page_service.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 078edc5c9f..3dba207ab9 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1159,14 +1159,9 @@ impl postgres_backend::Handler for PageServerHandler { let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Cannot load local timeline")?; + // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). timeline.tline.checkpoint(CheckpointConfig::Forced)?; - // Also compact it. - // - // FIXME: This probably shouldn't be part of a "checkpoint" command, but a - // separate operation. Update the tests if you change this. - timeline.tline.compact()?; - pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? 
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("get_lsn_by_timestamp ") { From d301b8364cef3b2884b78ad6369e7587d9389b5f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 25 Jul 2022 23:23:35 +0300 Subject: [PATCH 22/29] Move LayeredTimeline and related code to separate source file. The layered_repository.rs file had grown to be very large. Split off the LayeredTimeline struct and related code to a separate source file to make it more manageable. There are plans to move much of the code to track timelines from tenant_mgr.rs to LayeredRepository. That will make layered_repository.rs grow again, so now is a good time to split it. There's a lot more cleanup to do, but this commit intentionally only moves existing code and avoids doing anything else, for easier review. --- pageserver/src/layered_repository.rs | 2032 +---------------- .../src/layered_repository/layer_map.rs | 2 +- pageserver/src/layered_repository/timeline.rs | 2021 ++++++++++++++++ pageserver/src/storage_sync.rs | 9 +- 4 files changed, 2057 insertions(+), 2007 deletions(-) create mode 100644 pageserver/src/layered_repository/timeline.rs diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 5c65b5dc7e..ff230ed3c3 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -11,52 +11,36 @@ //! parent timeline, and the last LSN that has been written to disk. //! 
-use anyhow::{anyhow, bail, ensure, Context, Result}; -use bytes::Bytes; -use fail::fail_point; -use itertools::Itertools; -use lazy_static::lazy_static; +use anyhow::{bail, ensure, Context, Result}; use tracing::*; -use std::cmp::{max, min, Ordering}; +use std::cmp::min; use std::collections::hash_map::Entry; +use std::collections::BTreeSet; use std::collections::HashMap; -use std::collections::{BTreeSet, HashSet}; use std::fs; -use std::fs::{File, OpenOptions}; -use std::io::Write; +use std::fs::File; use std::num::NonZeroU64; -use std::ops::{Bound::Included, Deref, Range}; -use std::path::{Path, PathBuf}; -use std::sync::atomic::{self, AtomicBool}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError}; -use std::time::{Duration, Instant, SystemTime}; +use std::ops::Bound::Included; +use std::path::Path; +use std::sync::{Arc, Mutex, RwLock}; +use std::time::{Duration, Instant}; -use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; +use self::metadata::{metadata_path, TimelineMetadata}; use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline, TimelineWriter}; -use crate::repository::{Key, Value}; +use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline}; use crate::tenant_mgr; use crate::thread_mgr; -use crate::virtual_file::VirtualFile; -use crate::walreceiver::IS_WAL_RECEIVER; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; -use crate::{page_cache, storage_sync}; -use metrics::{ - register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, - Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, -}; use toml_edit; use utils::{ crashsafe_dir, - lsn::{AtomicLsn, Lsn, RecordLsn}, - seqwait::SeqWait, + lsn::{Lsn, RecordLsn}, 
zid::{ZTenantId, ZTimelineId}, }; @@ -73,78 +57,16 @@ pub mod metadata; mod par_fsync; mod storage_layer; -use crate::pgdatadir_mapping::LsnForTimestamp; -use delta_layer::{DeltaLayer, DeltaLayerWriter}; -use ephemeral_file::is_ephemeral_file; -use filename::{DeltaFileName, ImageFileName}; -use image_layer::{ImageLayer, ImageLayerWriter}; -use inmemory_layer::InMemoryLayer; -use layer_map::LayerMap; -use layer_map::SearchResult; -use postgres_ffi::xlog_utils::to_pg_timestamp; -use storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +mod timeline; + +use storage_layer::Layer; +use timeline::{LayeredTimeline, LayeredTimelineEntry}; // re-export this function so that page_cache.rs can use it. pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; -// Metrics collected on operations on the storage repository. -lazy_static! { - static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( - "pageserver_storage_operations_seconds", - "Time spent on storage operations", - &["operation", "tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - -// Metrics collected on operations on the storage repository. -lazy_static! { - static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( - "pageserver_getpage_reconstruct_seconds", - "Time spent in reconstruct_value", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - -lazy_static! { - static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!( - "pageserver_materialized_cache_hits_total", - "Number of cache hits from materialized page cache", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); - static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!( - "pageserver_wait_lsn_seconds", - "Time spent waiting for WAL to arrive", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - -lazy_static! 
{ - static ref LAST_RECORD_LSN: IntGaugeVec = register_int_gauge_vec!( - "pageserver_last_record_lsn", - "Last record LSN grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - -// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, -// or in testing they estimate how much we would upload if we did. -lazy_static! { - static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!( - "pageserver_created_persistent_files_total", - "Number of files created that are meant to be uploaded to cloud storage", - ) - .expect("failed to define a metric"); - static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!( - "pageserver_written_persistent_bytes_total", - "Total bytes written that are meant to be uploaded to cloud storage", - ) - .expect("failed to define a metric"); -} +// re-export for use in storage_sync.rs +pub use crate::layered_repository::timeline::save_metadata; /// Parts of the `.neon/tenants//timelines/` directory prefix. 
pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; @@ -248,7 +170,7 @@ impl Repository for LayeredRepository { crashsafe_dir::create_dir_all(timeline_path)?; let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - Self::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; + timeline::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; let timeline = LayeredTimeline::new( self.conf, @@ -367,7 +289,7 @@ impl Repository for LayeredRepository { src_timeline.initdb_lsn, ); crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; - Self::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; + timeline::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; timelines.insert( dst, LayeredTimelineEntry::Unloaded { @@ -396,7 +318,7 @@ impl Repository for LayeredRepository { .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); - STORAGE_TIME + timeline::STORAGE_TIME .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) .observe_closure_duration(|| { self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc) @@ -522,73 +444,6 @@ impl Repository for LayeredRepository { } } -#[derive(Clone)] -enum LayeredTimelineEntry { - Loaded(Arc), - Unloaded { - id: ZTimelineId, - metadata: TimelineMetadata, - init_logical_size: Option, - }, -} - -impl LayeredTimelineEntry { - fn timeline_id(&self) -> ZTimelineId { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline.timeline_id, - LayeredTimelineEntry::Unloaded { id, .. } => *id, - } - } - - fn ancestor_timeline_id(&self) -> Option { - match self { - LayeredTimelineEntry::Loaded(timeline) => { - timeline.ancestor_timeline.as_ref().map(|t| t.timeline_id()) - } - LayeredTimelineEntry::Unloaded { metadata, .. 
} => metadata.ancestor_timeline(), - } - } - - fn ancestor_lsn(&self) -> Lsn { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline.ancestor_lsn, - LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_lsn(), - } - } - - fn ensure_loaded(&self) -> anyhow::Result<&Arc> { - match self { - LayeredTimelineEntry::Loaded(timeline) => Ok(timeline), - LayeredTimelineEntry::Unloaded { .. } => { - anyhow::bail!("timeline is unloaded") - } - } - } - - fn layer_removal_guard(&self) -> Result>, anyhow::Error> { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline - .layer_removal_cs - .try_lock() - .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) - .map(Some), - - LayeredTimelineEntry::Unloaded { .. } => Ok(None), - } - } -} - -impl From for RepositoryTimeline { - fn from(entry: LayeredTimelineEntry) -> Self { - match entry { - LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _), - LayeredTimelineEntry::Unloaded { metadata, .. } => { - RepositoryTimeline::Unloaded { metadata } - } - } - } -} - /// Private functions impl LayeredRepository { pub fn get_checkpoint_distance(&self) -> u64 { @@ -857,42 +712,6 @@ impl LayeredRepository { }) } - /// Save timeline metadata to file - pub fn save_metadata( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - data: &TimelineMetadata, - first_save: bool, - ) -> Result<()> { - let _enter = info_span!("saving metadata").entered(); - let path = metadata_path(conf, timelineid, tenantid); - // use OpenOptions to ensure file presence is consistent with first_save - let mut file = VirtualFile::open_with_options( - &path, - OpenOptions::new().write(true).create_new(first_save), - )?; - - let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; - - if file.write(&metadata_bytes)? 
!= metadata_bytes.len() { - bail!("Could not write all the metadata bytes in a single call"); - } - file.sync_all()?; - - // fsync the parent directory to ensure the directory entry is durable - if first_save { - let timeline_dir = File::open( - &path - .parent() - .expect("Metadata should always have a parent dir"), - )?; - timeline_dir.sync_all()?; - } - - Ok(()) - } - // // How garbage collection works: // @@ -1044,1787 +863,6 @@ impl LayeredRepository { } } -pub struct LayeredTimeline { - conf: &'static PageServerConf, - tenant_conf: Arc>, - - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - - layers: RwLock, - - last_freeze_at: AtomicLsn, - - // WAL redo manager - walredo_mgr: Arc, - - // What page versions do we hold in the repository? If we get a - // request > last_record_lsn, we need to wait until we receive all - // the WAL up to the request. The SeqWait provides functions for - // that. TODO: If we get a request for an old LSN, such that the - // versions have already been garbage collected away, we should - // throw an error, but we don't track that currently. - // - // last_record_lsn.load().last points to the end of last processed WAL record. - // - // We also remember the starting point of the previous record in - // 'last_record_lsn.load().prev'. It's used to set the xl_prev pointer of the - // first WAL record when the node is started up. But here, we just - // keep track of it. - last_record_lsn: SeqWait, - - // All WAL records have been processed and stored durably on files on - // local disk, up to this LSN. On crash and restart, we need to re-process - // the WAL starting from this point. - // - // Some later WAL records might have been processed and also flushed to disk - // already, so don't be surprised to see some, but there's no guarantee on - // them yet. - disk_consistent_lsn: AtomicLsn, - - // Parent timeline that this timeline was branched from, and the LSN - // of the branch point. 
- ancestor_timeline: Option, - ancestor_lsn: Lsn, - - // Metrics - reconstruct_time_histo: Histogram, - materialized_page_cache_hit_counter: IntCounter, - flush_time_histo: Histogram, - compact_time_histo: Histogram, - create_images_time_histo: Histogram, - last_record_gauge: IntGauge, - wait_lsn_time_histo: Histogram, - - /// If `true`, will backup its files that appear after each checkpointing to the remote storage. - upload_layers: AtomicBool, - - /// Ensures layers aren't frozen by checkpointer between - /// [`LayeredTimeline::get_layer_for_write`] and layer reads. - /// Locked automatically by [`LayeredTimelineWriter`] and checkpointer. - /// Must always be acquired before the layer map/individual layer lock - /// to avoid deadlock. - write_lock: Mutex<()>, - - /// Used to ensure that there is only one thread - layer_flush_lock: Mutex<()>, - - /// Layer removal lock. - /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. - /// This lock is acquired in [`LayeredTimeline::gc`], [`LayeredTimeline::compact`], - /// and [`LayeredRepository::delete_timeline`]. - layer_removal_cs: Mutex<()>, - - // Needed to ensure that we can't create a branch at a point that was already garbage collected - latest_gc_cutoff_lsn: RwLock, - - // List of child timelines and their branch points. This is needed to avoid - // garbage collecting data that is still needed by the child timelines. - gc_info: RwLock, - - // It may change across major versions so for simplicity - // keep it after running initdb for a timeline. - // It is needed in checks when we want to error on some operations - // when they are requested for pre-initdb lsn. - // It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn", - // though lets keep them both for better error visibility. - initdb_lsn: Lsn, - - // Initial logical size of timeline (if known). 
- // Logical size can be copied from ancestor timeline when new branch is create at last LSN - pub init_logical_size: Option, -} - -/// -/// Information about how much history needs to be retained, needed by -/// Garbage Collection. -/// -struct GcInfo { - /// Specific LSNs that are needed. - /// - /// Currently, this includes all points where child branches have - /// been forked off from. In the future, could also include - /// explicit user-defined snapshot points. - retain_lsns: Vec, - - /// In addition to 'retain_lsns', keep everything newer than this - /// point. - /// - /// This is calculated by subtracting 'gc_horizon' setting from - /// last-record LSN - /// - /// FIXME: is this inclusive or exclusive? - horizon_cutoff: Lsn, - - /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this - /// point. - /// - /// This is calculated by finding a number such that a record is needed for PITR - /// if only if its LSN is larger than 'pitr_cutoff'. - pitr_cutoff: Lsn, -} - -/// Public interface functions -impl Timeline for LayeredTimeline { - fn get_ancestor_lsn(&self) -> Lsn { - self.ancestor_lsn - } - - fn get_ancestor_timeline_id(&self) -> Option { - self.ancestor_timeline - .as_ref() - .map(LayeredTimelineEntry::timeline_id) - } - - /// Wait until WAL has been received up to the given LSN. - fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { - // This should never be called from the WAL receiver thread, because that could lead - // to a deadlock. 
- ensure!( - !IS_WAL_RECEIVER.with(|c| c.get()), - "wait_lsn called by WAL receiver thread" - ); - - self.wait_lsn_time_histo.observe_closure_duration( - || self.last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .with_context(|| { - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - }))?; - - Ok(()) - } - - fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard { - self.latest_gc_cutoff_lsn.read().unwrap() - } - - /// Look up the value with the given a key - fn get(&self, key: Key, lsn: Lsn) -> Result { - debug_assert!(lsn <= self.get_last_record_lsn()); - - // Check the page cache. We will get back the most recent page with lsn <= `lsn`. - // The cached image can be returned directly if there is no WAL between the cached image - // and requested LSN. The cached image can also be used to reduce the amount of WAL needed - // for redo. - let cached_page_img = match self.lookup_cached_page(&key, lsn) { - Some((cached_lsn, cached_img)) => { - match cached_lsn.cmp(&lsn) { - Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image - Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn - } - Some((cached_lsn, cached_img)) - } - None => None, - }; - - let mut reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: cached_page_img, - }; - - self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; - - self.reconstruct_time_histo - .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) - } - - /// Public entry point for checkpoint(). All the logic is in the private - /// checkpoint_internal function, this public facade just wraps it for - /// metrics collection. 
- fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { - match cconf { - CheckpointConfig::Flush => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers(true) - } - CheckpointConfig::Forced => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers(true)?; - self.compact() - } - } - } - - /// - /// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn. - /// - fn check_lsn_is_in_scope( - &self, - lsn: Lsn, - latest_gc_cutoff_lsn: &RwLockReadGuard, - ) -> Result<()> { - ensure!( - lsn >= **latest_gc_cutoff_lsn, - "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", - lsn, - **latest_gc_cutoff_lsn, - ); - Ok(()) - } - - fn get_last_record_lsn(&self) -> Lsn { - self.last_record_lsn.load().last - } - - fn get_prev_record_lsn(&self) -> Lsn { - self.last_record_lsn.load().prev - } - - fn get_last_record_rlsn(&self) -> RecordLsn { - self.last_record_lsn.load() - } - - fn get_disk_consistent_lsn(&self) -> Lsn { - self.disk_consistent_lsn.load() - } - - fn writer<'a>(&'a self) -> Box { - Box::new(LayeredTimelineWriter { - tl: self, - _write_guard: self.write_lock.lock().unwrap(), - }) - } -} - -impl LayeredTimeline { - fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .checkpoint_distance - .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) - } - - fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .compaction_target_size - .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) - } - - fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .compaction_threshold - .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) - } - - fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .image_creation_threshold - 
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) - } - - /// Open a Timeline handle. - /// - /// Loads the metadata for the timeline into memory, but not the layer map. - #[allow(clippy::too_many_arguments)] - fn new( - conf: &'static PageServerConf, - tenant_conf: Arc>, - metadata: TimelineMetadata, - ancestor: Option, - timeline_id: ZTimelineId, - tenant_id: ZTenantId, - walredo_mgr: Arc, - upload_layers: bool, - init_logical_size: Option, - ) -> LayeredTimeline { - let reconstruct_time_histo = RECONSTRUCT_TIME - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let flush_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "layer flush", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let compact_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "compact", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let create_images_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "create images", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let last_record_gauge = LAST_RECORD_LSN - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let wait_lsn_time_histo = WAIT_LSN_TIME - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - - LayeredTimeline { - conf, - tenant_conf, - timeline_id, - tenant_id, - layers: RwLock::new(LayerMap::default()), - - walredo_mgr, - - // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. 
- last_record_lsn: SeqWait::new(RecordLsn { - last: metadata.disk_consistent_lsn(), - prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), - }), - disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), - - last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0), - - ancestor_timeline: ancestor, - ancestor_lsn: metadata.ancestor_lsn(), - - reconstruct_time_histo, - materialized_page_cache_hit_counter, - flush_time_histo, - compact_time_histo, - create_images_time_histo, - last_record_gauge, - wait_lsn_time_histo, - - upload_layers: AtomicBool::new(upload_layers), - - write_lock: Mutex::new(()), - layer_flush_lock: Mutex::new(()), - layer_removal_cs: Mutex::new(()), - - gc_info: RwLock::new(GcInfo { - retain_lsns: Vec::new(), - horizon_cutoff: Lsn(0), - pitr_cutoff: Lsn(0), - }), - - latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), - initdb_lsn: metadata.initdb_lsn(), - init_logical_size, - } - } - - /// - /// Scan the timeline directory to populate the layer map. - /// Returns all timeline-related files that were found and loaded. - /// - fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { - let mut layers = self.layers.write().unwrap(); - let mut num_layers = 0; - - // Scan timeline directory and create ImageFileName and DeltaFilename - // structs representing all files on disk - let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); - - for direntry in fs::read_dir(timeline_path)? { - let direntry = direntry?; - let fname = direntry.file_name(); - let fname = fname.to_string_lossy(); - - if let Some(imgfilename) = ImageFileName::parse_str(&fname) { - // create an ImageLayer struct for each image file. 
- if imgfilename.lsn > disk_consistent_lsn { - warn!( - "found future image layer {} on timeline {} disk_consistent_lsn is {}", - imgfilename, self.timeline_id, disk_consistent_lsn - ); - - rename_to_backup(direntry.path())?; - continue; - } - - let layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); - - trace!("found layer {}", layer.filename().display()); - layers.insert_historic(Arc::new(layer)); - num_layers += 1; - } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { - // Create a DeltaLayer struct for each delta file. - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. - if deltafilename.lsn_range.end > disk_consistent_lsn + 1 { - warn!( - "found future delta layer {} on timeline {} disk_consistent_lsn is {}", - deltafilename, self.timeline_id, disk_consistent_lsn - ); - - rename_to_backup(direntry.path())?; - continue; - } - - let layer = - DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); - - trace!("found layer {}", layer.filename().display()); - layers.insert_historic(Arc::new(layer)); - num_layers += 1; - } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { - // ignore these - } else if is_ephemeral_file(&fname) { - // Delete any old ephemeral files - trace!("deleting old ephemeral file in timeline dir: {}", fname); - fs::remove_file(direntry.path())?; - } else { - warn!("unrecognized filename in timeline dir: {}", fname); - } - } - - layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); - - info!( - "loaded layer map with {} layers at {}", - num_layers, disk_consistent_lsn - ); - - Ok(()) - } - - /// - /// Get a handle to a Layer for reading. 
- /// - /// The returned Layer might be from an ancestor timeline, if the - /// segment hasn't been updated on this timeline yet. - /// - /// This function takes the current timeline's locked LayerMap as an argument, - /// so callers can avoid potential race conditions. - fn get_reconstruct_data( - &self, - key: Key, - request_lsn: Lsn, - reconstruct_state: &mut ValueReconstructState, - ) -> anyhow::Result<()> { - // Start from the current timeline. - let mut timeline_owned; - let mut timeline = self; - - // For debugging purposes, collect the path of layers that we traversed - // through. It's included in the error message if we fail to find the key. - let mut traversal_path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); - - let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { - *cached_lsn - } else { - Lsn(0) - }; - - // 'prev_lsn' tracks the last LSN that we were at in our search. It's used - // to check that each iteration make some progress, to break infinite - // looping if something goes wrong. - let mut prev_lsn = Lsn(u64::MAX); - - let mut result = ValueReconstructResult::Continue; - let mut cont_lsn = Lsn(request_lsn.0 + 1); - - 'outer: loop { - // The function should have updated 'state' - //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); - match result { - ValueReconstructResult::Complete => return Ok(()), - ValueReconstructResult::Continue => { - // If we reached an earlier cached page image, we're done. - if cont_lsn == cached_lsn + 1 { - self.materialized_page_cache_hit_counter.inc_by(1); - return Ok(()); - } - if prev_lsn <= cont_lsn { - // Didn't make any progress in last iteration. Error out to avoid - // getting stuck in the loop. 
- return layer_traversal_error(format!( - "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", - key, - Lsn(cont_lsn.0 - 1), - request_lsn, - timeline.ancestor_lsn - ), traversal_path); - } - prev_lsn = cont_lsn; - } - ValueReconstructResult::Missing => { - return layer_traversal_error( - format!( - "could not find data for key {} at LSN {}, for request at LSN {}", - key, cont_lsn, request_lsn - ), - traversal_path, - ); - } - } - - // Recurse into ancestor if needed - if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( - "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn - ); - let ancestor = timeline.get_ancestor_timeline()?; - timeline_owned = ancestor; - timeline = &*timeline_owned; - prev_lsn = Lsn(u64::MAX); - continue; - } - - let layers = timeline.layers.read().unwrap(); - - // Check the open and frozen in-memory layers first, in order from newest - // to oldest. - if let Some(open_layer) = &layers.open_layer { - let start_lsn = open_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. 
- let lsn_floor = max(cached_lsn + 1, start_lsn); - result = open_layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - )?; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, open_layer.clone())); - continue; - } - } - for frozen_layer in layers.frozen_layers.iter().rev() { - let start_lsn = frozen_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); - let lsn_floor = max(cached_lsn + 1, start_lsn); - result = frozen_layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - )?; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, frozen_layer.clone())); - continue 'outer; - } - } - - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? { - //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); - - let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - )?; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, layer)); - } else if timeline.ancestor_timeline.is_some() { - // Nothing on this timeline. Traverse to parent - result = ValueReconstructResult::Continue; - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); - } else { - // Nothing found - result = ValueReconstructResult::Missing; - } - } - } - - fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> { - let cache = page_cache::get(); - - // FIXME: It's pointless to check the cache for things that are not 8kB pages. 
- // We should look at the key to determine if it's a cacheable object - let (lsn, read_guard) = - cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?; - let img = Bytes::from(read_guard.to_vec()); - Some((lsn, img)) - } - - fn get_ancestor_timeline(&self) -> Result> { - let ancestor = self - .ancestor_timeline - .as_ref() - .with_context(|| { - format!( - "Ancestor is missing. Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })? - .ensure_loaded() - .with_context(|| { - format!( - "Ancestor timeline is not loaded. Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })?; - Ok(Arc::clone(ancestor)) - } - - /// - /// Get a handle to the latest layer for appending. - /// - fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { - let mut layers = self.layers.write().unwrap(); - - ensure!(lsn.is_aligned()); - - let last_record_lsn = self.get_last_record_lsn(); - ensure!( - lsn > last_record_lsn, - "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", - lsn, - last_record_lsn, - ); - - // Do we have a layer open for writing already? - let layer; - if let Some(open_layer) = &layers.open_layer { - if open_layer.get_lsn_range().start > lsn { - bail!("unexpected open layer in the future"); - } - - layer = Arc::clone(open_layer); - } else { - // No writeable layer yet. Create one. 
- let start_lsn = layers.next_open_layer_at.unwrap(); - - trace!( - "creating layer for write at {}/{} for record at {}", - self.timeline_id, - start_lsn, - lsn - ); - let new_layer = - InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?; - let layer_rc = Arc::new(new_layer); - - layers.open_layer = Some(Arc::clone(&layer_rc)); - layers.next_open_layer_at = None; - - layer = layer_rc; - } - Ok(layer) - } - - fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { - //info!("PUT: key {} at {}", key, lsn); - let layer = self.get_layer_for_write(lsn)?; - layer.put_value(key, lsn, val)?; - Ok(()) - } - - fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> Result<()> { - let layer = self.get_layer_for_write(lsn)?; - layer.put_tombstone(key_range, lsn)?; - - Ok(()) - } - - fn finish_write(&self, new_lsn: Lsn) { - assert!(new_lsn.is_aligned()); - - self.last_record_gauge.set(new_lsn.0 as i64); - self.last_record_lsn.advance(new_lsn); - } - - fn freeze_inmem_layer(&self, write_lock_held: bool) { - // Freeze the current open in-memory layer. It will be written to disk on next - // iteration. - let _write_guard = if write_lock_held { - None - } else { - Some(self.write_lock.lock().unwrap()) - }; - let mut layers = self.layers.write().unwrap(); - if let Some(open_layer) = &layers.open_layer { - let open_layer_rc = Arc::clone(open_layer); - // Does this layer need freezing? - let end_lsn = Lsn(self.get_last_record_lsn().0 + 1); - open_layer.freeze(end_lsn); - - // The layer is no longer open, update the layer map to reflect this. - // We will replace it with on-disk historics below. - layers.frozen_layers.push_back(open_layer_rc); - layers.open_layer = None; - layers.next_open_layer_at = Some(end_lsn); - self.last_freeze_at.store(end_lsn); - } - drop(layers); - } - - /// - /// Check if more than 'checkpoint_distance' of WAL has been accumulated - /// in the in-memory layer, and initiate flushing it if so. 
- /// - pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { - let last_lsn = self.get_last_record_lsn(); - let layers = self.layers.read().unwrap(); - if let Some(open_layer) = &layers.open_layer { - let open_layer_size = open_layer.size()?; - drop(layers); - let distance = last_lsn.widening_sub(self.last_freeze_at.load()); - // Checkpointing the open layer can be triggered by layer size or LSN range. - // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and - // we want to stay below that with a big margin. The LSN distance determines how - // much WAL the safekeepers need to store. - if distance >= self.get_checkpoint_distance().into() - || open_layer_size > self.get_checkpoint_distance() - { - info!( - "check_checkpoint_distance {}, layer size {}", - distance, open_layer_size - ); - - self.freeze_inmem_layer(true); - self.last_freeze_at.store(last_lsn); - - // Launch a thread to flush the frozen layer to disk, unless - // a thread was already running. (If the thread was running - // at the time that we froze the layer, it must've seen the - // the layer we just froze before it exited; see comments - // in flush_frozen_layers()) - if let Ok(guard) = self.layer_flush_lock.try_lock() { - drop(guard); - let self_clone = Arc::clone(self); - thread_mgr::spawn( - thread_mgr::ThreadKind::LayerFlushThread, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush thread", - false, - move || self_clone.flush_frozen_layers(false), - )?; - } - } - } - Ok(()) - } - - /// Flush all frozen layers to disk. - /// - /// Only one thread at a time can be doing layer-flushing for a - /// given timeline. If 'wait' is true, and another thread is - /// currently doing the flushing, this function will wait for it - /// to finish. If 'wait' is false, this function will return - /// immediately instead. 
- fn flush_frozen_layers(&self, wait: bool) -> Result<()> { - let flush_lock_guard = if wait { - self.layer_flush_lock.lock().unwrap() - } else { - match self.layer_flush_lock.try_lock() { - Ok(guard) => guard, - Err(TryLockError::WouldBlock) => return Ok(()), - Err(TryLockError::Poisoned(err)) => panic!("{:?}", err), - } - }; - - let timer = self.flush_time_histo.start_timer(); - - loop { - let layers = self.layers.read().unwrap(); - if let Some(frozen_layer) = layers.frozen_layers.front() { - let frozen_layer = Arc::clone(frozen_layer); - drop(layers); // to allow concurrent reads and writes - self.flush_frozen_layer(frozen_layer)?; - } else { - // Drop the 'layer_flush_lock' *before* 'layers'. That - // way, if you freeze a layer, and then call - // flush_frozen_layers(false), it is guaranteed that - // if another thread was busy flushing layers and the - // call therefore returns immediately, the other - // thread will have seen the newly-frozen layer and - // will flush that too (assuming no errors). - drop(flush_lock_guard); - drop(layers); - break; - } - } - - timer.stop_and_record(); - - Ok(()) - } - - /// Flush one frozen in-memory layer to disk, as a new delta layer. - fn flush_frozen_layer(&self, frozen_layer: Arc) -> Result<()> { - // As a special case, when we have just imported an image into the repository, - // instead of writing out a L0 delta layer, we directly write out image layer - // files instead. This is possible as long as *all* the data imported into the - // repository have the same LSN. - let lsn_range = frozen_layer.get_lsn_range(); - let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn - && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) - { - let pgdir = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)?; - let (partitioning, _lsn) = - pgdir.repartition(self.initdb_lsn, self.get_compaction_target_size())?; - self.create_image_layers(&partitioning, self.initdb_lsn, true)? 
- } else { - // normal case, write out a L0 delta layer file. - let delta_path = self.create_delta_layer(&frozen_layer)?; - HashSet::from([delta_path]) - }; - - fail_point!("flush-frozen-before-sync"); - - // The new on-disk layers are now in the layer map. We can remove the - // in-memory layer from the map now. - { - let mut layers = self.layers.write().unwrap(); - let l = layers.frozen_layers.pop_front(); - - // Only one thread may call this function at a time (for this - // timeline). If two threads tried to flush the same frozen - // layer to disk at the same time, that would not work. - assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); - - // release lock on 'layers' - } - - fail_point!("checkpoint-after-sync"); - - // Update the metadata file, with new 'disk_consistent_lsn' - // - // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing - // *all* the layers, to avoid fsyncing the file multiple times. - let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); - self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?; - - Ok(()) - } - - /// Update metadata file - fn update_disk_consistent_lsn( - &self, - disk_consistent_lsn: Lsn, - layer_paths_to_upload: HashSet, - ) -> Result<()> { - // If we were able to advance 'disk_consistent_lsn', save it the metadata file. - // After crash, we will restart WAL streaming and processing from that point. - let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); - if disk_consistent_lsn != old_disk_consistent_lsn { - assert!(disk_consistent_lsn > old_disk_consistent_lsn); - - // We can only save a valid 'prev_record_lsn' value on disk if we - // flushed *all* in-memory changes to disk. We only track - // 'prev_record_lsn' in memory for the latest processed record, so we - // don't remember what the correct value that corresponds to some old - // LSN is. But if we flush everything, then the value corresponding - // current 'last_record_lsn' is correct and we can store it on disk. 
- let RecordLsn { - last: last_record_lsn, - prev: prev_record_lsn, - } = self.last_record_lsn.load(); - let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { - Some(prev_record_lsn) - } else { - None - }; - - let ancestor_timelineid = self - .ancestor_timeline - .as_ref() - .map(LayeredTimelineEntry::timeline_id); - - let metadata = TimelineMetadata::new( - disk_consistent_lsn, - ondisk_prev_record_lsn, - ancestor_timelineid, - self.ancestor_lsn, - *self.latest_gc_cutoff_lsn.read().unwrap(), - self.initdb_lsn, - ); - - fail_point!("checkpoint-before-saving-metadata", |x| bail!( - "{}", - x.unwrap() - )); - - LayeredRepository::save_metadata( - self.conf, - self.timeline_id, - self.tenant_id, - &metadata, - false, - )?; - - if self.upload_layers.load(atomic::Ordering::Relaxed) { - storage_sync::schedule_layer_upload( - self.tenant_id, - self.timeline_id, - layer_paths_to_upload, - Some(metadata), - ); - } - - // Also update the in-memory copy - self.disk_consistent_lsn.store(disk_consistent_lsn); - } - - Ok(()) - } - - // Write out the given frozen in-memory layer as a new L0 delta file - fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result { - // Write it out - let new_delta = frozen_layer.write_to_disk()?; - let new_delta_path = new_delta.path(); - - // Sync it to disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // TODO: If we're running inside 'flush_frozen_layers' and there are multiple - // files to flush, it might be better to first write them all, and then fsync - // them all in parallel. 
- par_fsync::par_fsync(&[ - new_delta_path.clone(), - self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - ])?; - - // Add it to the layer map - { - let mut layers = self.layers.write().unwrap(); - layers.insert_historic(Arc::new(new_delta)); - } - - NUM_PERSISTENT_FILES_CREATED.inc_by(1); - PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); - - Ok(new_delta_path) - } - - pub fn compact(&self) -> Result<()> { - // - // High level strategy for compaction / image creation: - // - // 1. First, calculate the desired "partitioning" of the - // currently in-use key space. The goal is to partition the - // key space into roughly fixed-size chunks, but also take into - // account any existing image layers, and try to align the - // chunk boundaries with the existing image layers to avoid - // too much churn. Also try to align chunk boundaries with - // relation boundaries. In principle, we don't know about - // relation boundaries here, we just deal with key-value - // pairs, and the code in pgdatadir_mapping.rs knows how to - // map relations into key-value pairs. But in practice we know - // that 'field6' is the block number, and the fields 1-5 - // identify a relation. This is just an optimization, - // though. - // - // 2. Once we know the partitioning, for each partition, - // decide if it's time to create a new image layer. The - // criteria is: there has been too much "churn" since the last - // image layer? The "churn" is fuzzy concept, it's a - // combination of too many delta files, or too much WAL in - // total in the delta file. Or perhaps: if creating an image - // file would allow to delete some older files. - // - // 3. After that, we compact all level0 delta files if there - // are too many of them. While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. - // - // TODO: This high level strategy hasn't been implemented yet. 
- // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. Rewrite it. - let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); - - let target_file_size = self.get_checkpoint_distance(); - - // Define partitioning schema if needed - if let Ok(pgdir) = - tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) - { - // 2. Create new image layers for partitions that have been modified - // "enough". - let (partitioning, lsn) = pgdir.repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - )?; - let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; - if !layer_paths_to_upload.is_empty() - && self.upload_layers.load(atomic::Ordering::Relaxed) - { - storage_sync::schedule_layer_upload( - self.tenant_id, - self.timeline_id, - HashSet::from_iter(layer_paths_to_upload), - None, - ); - } - - // 3. Compact - let timer = self.compact_time_histo.start_timer(); - self.compact_level0(target_file_size)?; - timer.stop_and_record(); - } else { - debug!("Could not compact because no partitioning specified yet"); - } - - Ok(()) - } - - // Is it time to create a new image layer for the given partition? - fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result { - let layers = self.layers.read().unwrap(); - - for part_range in &partition.ranges { - let image_coverage = layers.image_coverage(part_range, lsn)?; - for (img_range, last_img) in image_coverage { - let img_lsn = if let Some(last_img) = last_img { - last_img.get_lsn_range().end - } else { - Lsn(0) - }; - // Let's consider an example: - // - // delta layer with LSN range 71-81 - // delta layer with LSN range 81-91 - // delta layer with LSN range 91-101 - // image layer at LSN 100 - // - // If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer, - // there's no need to create a new one. 
We check this case explicitly, to avoid passing - // a bogus range to count_deltas below, with start > end. It's even possible that there - // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed - // after we read last_record_lsn, which is passed here in the 'lsn' argument. - if img_lsn < lsn { - let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; - - debug!( - "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", - img_range.start, img_range.end, num_deltas, img_lsn, lsn - ); - if num_deltas >= self.get_image_creation_threshold() { - return Ok(true); - } - } - } - } - - Ok(false) - } - - fn create_image_layers( - &self, - partitioning: &KeyPartitioning, - lsn: Lsn, - force: bool, - ) -> Result> { - let timer = self.create_images_time_histo.start_timer(); - let mut image_layers: Vec = Vec::new(); - let mut layer_paths_to_upload = HashSet::new(); - for partition in partitioning.parts.iter() { - if force || self.time_for_new_image_layer(partition, lsn)? { - let img_range = - partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_id, - &img_range, - lsn, - )?; - - for range in &partition.ranges { - let mut key = range.start; - while key < range.end { - let img = self.get(key, lsn)?; - image_layer_writer.put_image(key, &img)?; - key = key.next(); - } - } - let image_layer = image_layer_writer.finish()?; - layer_paths_to_upload.insert(image_layer.path()); - image_layers.push(image_layer); - } - } - - // Sync the new layer to disk before adding it to the layer map, to make sure - // we don't garbage collect something based on the new layer, before it has - // reached the disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // Compaction creates multiple image layers. 
It would be better to create them all - // and fsync them all in parallel. - let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone()); - all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); - par_fsync::par_fsync(&all_paths)?; - - let mut layers = self.layers.write().unwrap(); - for l in image_layers { - layers.insert_historic(Arc::new(l)); - } - drop(layers); - timer.stop_and_record(); - - Ok(layer_paths_to_upload) - } - - /// - /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as - /// as Level 1 files. - /// - fn compact_level0(&self, target_file_size: u64) -> Result<()> { - let layers = self.layers.read().unwrap(); - let mut level0_deltas = layers.get_level0_deltas()?; - drop(layers); - - // Only compact if enough layers have accumulated. - if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() { - return Ok(()); - } - - // Gather the files to compact in this iteration. - // - // Start with the oldest Level 0 delta file, and collect any other - // level 0 files that form a contiguous sequence, such that the end - // LSN of previous file matches the start LSN of the next file. - // - // Note that if the files don't form such a sequence, we might - // "compact" just a single file. That's a bit pointless, but it allows - // us to get rid of the level 0 file, and compact the other files on - // the next iteration. This could probably made smarter, but such - // "gaps" in the sequence of level 0 files should only happen in case - // of a crash, partial download from cloud storage, or something like - // that, so it's not a big deal in practice. 
- level0_deltas.sort_by_key(|l| l.get_lsn_range().start); - let mut level0_deltas_iter = level0_deltas.iter(); - - let first_level0_delta = level0_deltas_iter.next().unwrap(); - let mut prev_lsn_end = first_level0_delta.get_lsn_range().end; - let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)]; - for l in level0_deltas_iter { - let lsn_range = l.get_lsn_range(); - - if lsn_range.start != prev_lsn_end { - break; - } - deltas_to_compact.push(Arc::clone(l)); - prev_lsn_end = lsn_range.end; - } - let lsn_range = Range { - start: deltas_to_compact.first().unwrap().get_lsn_range().start, - end: deltas_to_compact.last().unwrap().get_lsn_range().end, - }; - - info!( - "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", - lsn_range.start, - lsn_range.end, - deltas_to_compact.len(), - level0_deltas.len() - ); - for l in deltas_to_compact.iter() { - info!("compact includes {}", l.filename().display()); - } - // We don't need the original list of layers anymore. Drop it so that - // we don't accidentally use it later in the function. - drop(level0_deltas); - - // This iterator walks through all key-value pairs from all the layers - // we're compacting, in key, LSN order. 
- let all_values_iter = deltas_to_compact - .iter() - .map(|l| l.iter()) - .kmerge_by(|a, b| { - if let Ok((a_key, a_lsn, _)) = a { - if let Ok((b_key, b_lsn, _)) = b { - match a_key.cmp(b_key) { - Ordering::Less => true, - Ordering::Equal => a_lsn <= b_lsn, - Ordering::Greater => false, - } - } else { - false - } - } else { - true - } - }); - - // This iterator walks through all keys and is needed to calculate size used by each key - let mut all_keys_iter = deltas_to_compact - .iter() - .map(|l| l.key_iter()) - .kmerge_by(|a, b| { - let (a_key, a_lsn, _) = a; - let (b_key, b_lsn, _) = b; - match a_key.cmp(b_key) { - Ordering::Less => true, - Ordering::Equal => a_lsn <= b_lsn, - Ordering::Greater => false, - } - }); - - // Merge the contents of all the input delta layers into a new set - // of delta layers, based on the current partitioning. - // - // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. - // It's possible that there is a single key with so many page versions that storing all of them in a single layer file - // would be too large. In that case, we also split on the LSN dimension. 
- // - // LSN - // ^ - // | - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // - // - // If one key (X) has a lot of page versions: - // - // LSN - // ^ - // | (X) - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | +--+ | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | +--+ | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // TODO: this actually divides the layers into fixed-size chunks, not - // based on the partitioning. - // - // TODO: we should also opportunistically materialize and - // garbage collect what we can. - let mut new_layers = Vec::new(); - let mut prev_key: Option = None; - let mut writer: Option = None; - let mut key_values_total_size = 0u64; - let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key - let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key - for x in all_values_iter { - let (key, lsn, value) = x?; - let same_key = prev_key.map_or(false, |prev_key| prev_key == key); - // We need to check key boundaries once we reach next key or end of layer with the same key - if !same_key || lsn == dup_end_lsn { - let mut next_key_size = 0u64; - let is_dup_layer = dup_end_lsn.is_valid(); - dup_start_lsn = Lsn::INVALID; - if !same_key { - dup_end_lsn = Lsn::INVALID; - } - // Determine size occupied by this key. 
We stop at next key, or when size becomes larger than target_file_size - for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { - next_key_size = next_size; - if key != next_key { - if dup_end_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; - } - break; - } - key_values_total_size += next_size; - if key_values_total_size > target_file_size { - // split key between multiple layers: such layer can contain only single key - dup_start_lsn = if dup_end_lsn.is_valid() { - dup_end_lsn - } else { - lsn - }; - dup_end_lsn = next_lsn; - break; - } - } - // handle case when loop reaches last key - if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; - } - if writer.is_some() { - let written_size = writer.as_mut().unwrap().size(); - // check if key cause layer overflow - if is_dup_layer - || dup_end_lsn.is_valid() - || written_size + key_values_total_size > target_file_size - { - new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?); - writer = None; - } - } - key_values_total_size = next_key_size; - } - if writer.is_none() { - writer = Some(DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_id, - key, - if dup_end_lsn.is_valid() { - // this is a layer containing slice of values of the same key - debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); - dup_start_lsn..dup_end_lsn - } else { - debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); - lsn_range.clone() - }, - )?); - } - writer.as_mut().unwrap().put_value(key, lsn, value)?; - prev_key = Some(key); - } - if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next())?); - } - - // Sync layers - if !new_layers.is_empty() { - let mut layer_paths: Vec = new_layers.iter().map(|l| l.path()).collect(); - - // also sync the directory - layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); - - // Fsync all 
the layer files and directory using multiple threads to - // minimize latency. - par_fsync::par_fsync(&layer_paths)?; - - layer_paths.pop().unwrap(); - } - - let mut layers = self.layers.write().unwrap(); - let mut new_layer_paths = HashSet::with_capacity(new_layers.len()); - for l in new_layers { - new_layer_paths.insert(l.path()); - layers.insert_historic(Arc::new(l)); - } - - // Now that we have reshuffled the data to set of new delta layers, we can - // delete the old ones - let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len()); - for l in &deltas_to_compact { - l.delete()?; - if let Some(path) = l.local_path() { - layer_paths_do_delete.insert(path); - } - layers.remove_historic(l.clone()); - } - drop(layers); - - if self.upload_layers.load(atomic::Ordering::Relaxed) { - storage_sync::schedule_layer_upload( - self.tenant_id, - self.timeline_id, - new_layer_paths, - None, - ); - storage_sync::schedule_layer_delete( - self.tenant_id, - self.timeline_id, - layer_paths_do_delete, - ); - } - - Ok(()) - } - - /// Update information about which layer files need to be retained on - /// garbage collection. This is separate from actually performing the GC, - /// and is updated more frequently, so that compaction can remove obsolete - /// page versions more aggressively. - /// - /// TODO: that's wishful thinking, compaction doesn't actually do that - /// currently. - /// - /// The caller specifies how much history is needed with the 3 arguments: - /// - /// retain_lsns: keep a version of each page at these LSNs - /// cutoff_horizon: also keep everything newer than this LSN - /// pitr: the time duration required to keep data for PITR - /// - /// The 'retain_lsns' list is currently used to prevent removing files that - /// are needed by child timelines. In the future, the user might be able to - /// name additional points in time to retain. The caller is responsible for - /// collecting that information. 
- /// - /// The 'cutoff_horizon' point is used to retain recent versions that might still be - /// needed by read-only nodes. (As of this writing, the caller just passes - /// the latest LSN subtracted by a constant, and doesn't do anything smart - /// to figure out what read-only nodes might actually need.) - /// - /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine - /// whether a record is needed for PITR. - fn update_gc_info( - &self, - retain_lsns: Vec, - cutoff_horizon: Lsn, - pitr: Duration, - ) -> Result<()> { - let mut gc_info = self.gc_info.write().unwrap(); - - gc_info.horizon_cutoff = cutoff_horizon; - gc_info.retain_lsns = retain_lsns; - - // Calculate pitr cutoff point. - // If we cannot determine a cutoff LSN, be conservative and don't GC anything. - let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn(); - - if let Ok(timeline) = - tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) - { - let now = SystemTime::now(); - // First, calculate pitr_cutoff_timestamp and then convert it to LSN. - // If we don't have enough data to convert to LSN, - // play safe and don't remove any layers. - if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { - let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - - match timeline.find_lsn_for_timestamp(pitr_timestamp)? { - LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, - LsnForTimestamp::Future(lsn) => { - debug!("future({})", lsn); - pitr_cutoff_lsn = gc_info.horizon_cutoff; - } - LsnForTimestamp::Past(lsn) => { - debug!("past({})", lsn); - } - LsnForTimestamp::NoData(lsn) => { - debug!("nodata({})", lsn); - } - } - debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) - } - } else if cfg!(test) { - // We don't have local timeline in mocked cargo tests. - // So, just ignore pitr_interval setting in this case. 
- pitr_cutoff_lsn = gc_info.horizon_cutoff; - } - gc_info.pitr_cutoff = pitr_cutoff_lsn; - - Ok(()) - } - - /// - /// Garbage collect layer files on a timeline that are no longer needed. - /// - /// Currently, we don't make any attempt at removing unneeded page versions - /// within a layer file. We can only remove the whole file if it's fully - /// obsolete. - /// - fn gc(&self) -> Result { - let mut result: GcResult = Default::default(); - let now = SystemTime::now(); - - fail_point!("before-timeline-gc"); - - let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); - - let gc_info = self.gc_info.read().unwrap(); - - let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.pitr_cutoff; - let retain_lsns = &gc_info.retain_lsns; - - let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); - - // Nothing to GC. Return early. - let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); - if latest_gc_cutoff >= new_gc_cutoff { - info!( - "Nothing to GC for timeline {}: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", - self.timeline_id - ); - return Ok(result); - } - - let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered(); - - // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. - // See branch_timeline() for details. - *self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff; - - info!("GC starting"); - - debug!("retain_lsns: {:?}", retain_lsns); - - let mut layers_to_remove = Vec::new(); - - // Scan all on-disk layers in the timeline. - // - // Garbage collect the layer if all conditions are satisfied: - // 1. it is older than cutoff LSN; - // 2. it is older than PITR interval; - // 3. it doesn't need to be retained for 'retain_lsns'; - // 4. 
newer on-disk image layers cover the layer's whole key range - // - let mut layers = self.layers.write().unwrap(); - 'outer: for l in layers.iter_historic_layers() { - // This layer is in the process of being flushed to disk. - // It will be swapped out of the layer map, replaced with - // on-disk layers containing the same data. - // We can't GC it, as it's not on disk. We can't remove it - // from the layer map yet, as it would make its data - // inaccessible. - if l.is_in_memory() { - continue; - } - - result.layers_total += 1; - - // 1. Is it newer than GC horizon cutoff point? - if l.get_lsn_range().end > horizon_cutoff { - debug!( - "keeping {} because it's newer than horizon_cutoff {}", - l.filename().display(), - horizon_cutoff - ); - result.layers_needed_by_cutoff += 1; - continue 'outer; - } - - // 2. It is newer than PiTR cutoff point? - if l.get_lsn_range().end > pitr_cutoff { - debug!( - "keeping {} because it's newer than pitr_cutoff {}", - l.filename().display(), - pitr_cutoff - ); - result.layers_needed_by_pitr += 1; - continue 'outer; - } - - // 3. Is it needed by a child branch? - // NOTE With that we would keep data that - // might be referenced by child branches forever. - // We can track this in child timeline GC and delete parent layers when - // they are no longer needed. This might be complicated with long inheritance chains. - for retain_lsn in retain_lsns { - // start_lsn is inclusive - if &l.get_lsn_range().start <= retain_lsn { - debug!( - "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", - l.filename().display(), - retain_lsn, - l.is_incremental(), - ); - result.layers_needed_by_branches += 1; - continue 'outer; - } - } - - // 4. Is there a later on-disk layer for this relation? - // - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. 
For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. - // - // For example, imagine that the following layers exist: - // - // 1000 - image (A) - // 1000-2000 - delta (B) - // 2000 - image (C) - // 2000-3000 - delta (D) - // 3000 - image (E) - // - // If GC horizon is at 2500, we can remove layers A and B, but - // we cannot remove C, even though it's older than 2500, because - // the delta layer 2000-3000 depends on it. - if !layers - .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? - { - debug!( - "keeping {} because it is the latest layer", - l.filename().display() - ); - result.layers_not_updated += 1; - continue 'outer; - } - - // We didn't find any reason to keep this file, so remove it. - debug!( - "garbage collecting {} is_dropped: xx is_incremental: {}", - l.filename().display(), - l.is_incremental(), - ); - layers_to_remove.push(Arc::clone(l)); - } - - // Actually delete the layers from disk and remove them from the map. - // (couldn't do this in the loop above, because you cannot modify a collection - // while iterating it. BTreeMap::retain() would be another option) - let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); - for doomed_layer in layers_to_remove { - doomed_layer.delete()?; - if let Some(path) = doomed_layer.local_path() { - layer_paths_to_delete.insert(path); - } - layers.remove_historic(doomed_layer); - result.layers_removed += 1; - } - - if self.upload_layers.load(atomic::Ordering::Relaxed) { - storage_sync::schedule_layer_delete( - self.tenant_id, - self.timeline_id, - layer_paths_to_delete, - ); - } - - result.elapsed = now.elapsed()?; - Ok(result) - } - - /// - /// Reconstruct a value, using the given base image and WAL records in 'data'. 
- /// - fn reconstruct_value( - &self, - key: Key, - request_lsn: Lsn, - mut data: ValueReconstructState, - ) -> Result { - // Perform WAL redo if needed - data.records.reverse(); - - // If we have a page image, and no WAL, we're all set - if data.records.is_empty() { - if let Some((img_lsn, img)) = &data.img { - trace!( - "found page image for key {} at {}, no WAL redo required", - key, - img_lsn - ); - Ok(img.clone()) - } else { - bail!("base image for {} at {} not found", key, request_lsn); - } - } else { - // We need to do WAL redo. - // - // If we don't have a base image, then the oldest WAL record better initialize - // the page - if data.img.is_none() && !data.records.first().unwrap().1.will_init() { - bail!( - "Base image for {} at {} not found, but got {} WAL records", - key, - request_lsn, - data.records.len() - ); - } else { - let base_img = if let Some((_lsn, img)) = data.img { - trace!( - "found {} WAL records and a base image for {} at {}, performing WAL redo", - data.records.len(), - key, - request_lsn - ); - Some(img) - } else { - trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); - None - }; - - let last_rec_lsn = data.records.last().unwrap().0; - - let img = - self.walredo_mgr - .request_redo(key, request_lsn, base_img, data.records)?; - - if img.len() == page_cache::PAGE_SZ { - let cache = page_cache::get(); - cache.memorize_materialized_page( - self.tenant_id, - self.timeline_id, - key, - last_rec_lsn, - &img, - ); - } - - Ok(img) - } - } - } -} - -/// Helper function for get_reconstruct_data() to add the path of layers traversed -/// to an error, as anyhow context information. -fn layer_traversal_error( - msg: String, - path: Vec<(ValueReconstructResult, Lsn, Arc)>, -) -> anyhow::Result<()> { - // We want the original 'msg' to be the outermost context. The outermost context - // is the most high-level information, which also gets propagated to the client. 
- let mut msg_iter = path - .iter() - .map(|(r, c, l)| { - format!( - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, - c, - l.filename().display() - ) - }) - .chain(std::iter::once(msg)); - // Construct initial message from the first traversed layer - let err = anyhow!(msg_iter.next().unwrap()); - - // Append all subsequent traversals, and the error message 'msg', as contexts. - Err(msg_iter.fold(err, |err, msg| err.context(msg))) -} - -struct LayeredTimelineWriter<'a> { - tl: &'a LayeredTimeline, - _write_guard: MutexGuard<'a, ()>, -} - -impl Deref for LayeredTimelineWriter<'_> { - type Target = dyn Timeline; - - fn deref(&self) -> &Self::Target { - self.tl - } -} - -impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { - fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> { - self.tl.put_value(key, lsn, value) - } - - fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()> { - self.tl.put_tombstone(key_range, lsn) - } - - /// - /// Remember the (end of) last valid WAL record remembered in the timeline. - /// - fn finish_write(&self, new_lsn: Lsn) { - self.tl.finish_write(new_lsn); - } -} - /// Dump contents of a layer file to stdout. pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { use std::os::unix::fs::FileExt; @@ -2836,34 +874,18 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { file.read_exact_at(&mut header_buf, 0)?; match u16::from_be_bytes(header_buf) { - crate::IMAGE_FILE_MAGIC => ImageLayer::new_for_path(path, file)?.dump(verbose)?, - crate::DELTA_FILE_MAGIC => DeltaLayer::new_for_path(path, file)?.dump(verbose)?, + crate::IMAGE_FILE_MAGIC => { + image_layer::ImageLayer::new_for_path(path, file)?.dump(verbose)? + } + crate::DELTA_FILE_MAGIC => { + delta_layer::DeltaLayer::new_for_path(path, file)?.dump(verbose)? 
+ } magic => bail!("unrecognized magic identifier: {:?}", magic), } Ok(()) } -/// Add a suffix to a layer file's name: .{num}.old -/// Uses the first available num (starts at 0) -fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { - let filename = path - .file_name() - .ok_or_else(|| anyhow!("Path {} don't have a file name", path.display()))? - .to_string_lossy(); - let mut new_path = path.clone(); - - for i in 0u32.. { - new_path.set_file_name(format!("{}.{}.old", filename, i)); - if !new_path.exists() { - std::fs::rename(&path, &new_path)?; - return Ok(()); - } - } - - bail!("couldn't find an unused backup number for {:?}", path) -} - pub fn load_metadata( conf: &'static PageServerConf, timeline_id: ZTimelineId, @@ -2893,9 +915,11 @@ pub fn load_metadata( /// #[cfg(test)] pub mod tests { + use super::metadata::METADATA_FILE_NAME; use super::*; use crate::keyspace::KeySpaceAccum; use crate::repository::repo_harness::*; + use crate::repository::{Key, Value}; use rand::{thread_rng, Rng}; #[test] diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index f7f51bf21f..be590c88c2 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -10,9 +10,9 @@ //! corresponding files are written to disk. //! +use crate::layered_repository::inmemory_layer::InMemoryLayer; use crate::layered_repository::storage_layer::Layer; use crate::layered_repository::storage_layer::{range_eq, range_overlaps}; -use crate::layered_repository::InMemoryLayer; use crate::repository::Key; use anyhow::Result; use lazy_static::lazy_static; diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs new file mode 100644 index 0000000000..e862b7def7 --- /dev/null +++ b/pageserver/src/layered_repository/timeline.rs @@ -0,0 +1,2021 @@ +//! 
+ +use anyhow::{anyhow, bail, ensure, Context, Result}; +use bytes::Bytes; +use fail::fail_point; +use itertools::Itertools; +use lazy_static::lazy_static; +use tracing::*; + +use std::cmp::{max, min, Ordering}; +use std::collections::HashSet; +use std::fs; +use std::fs::{File, OpenOptions}; +use std::io::Write; +use std::ops::{Deref, Range}; +use std::path::PathBuf; +use std::sync::atomic::{self, AtomicBool}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError}; +use std::time::{Duration, SystemTime}; + +use metrics::{ + register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, + Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, +}; + +use crate::layered_repository::{ + delta_layer::{DeltaLayer, DeltaLayerWriter}, + ephemeral_file::is_ephemeral_file, + filename::{DeltaFileName, ImageFileName}, + image_layer::{ImageLayer, ImageLayerWriter}, + inmemory_layer::InMemoryLayer, + layer_map::{LayerMap, SearchResult}, + metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, + par_fsync, + storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}, +}; + +use crate::config::PageServerConf; +use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::tenant_config::TenantConfOpt; + +use postgres_ffi::xlog_utils::to_pg_timestamp; +use utils::{ + lsn::{AtomicLsn, Lsn, RecordLsn}, + seqwait::SeqWait, + zid::{ZTenantId, ZTimelineId}, +}; + +use crate::repository::{GcResult, RepositoryTimeline, Timeline, TimelineWriter}; +use crate::repository::{Key, Value}; +use crate::tenant_mgr; +use crate::thread_mgr; +use crate::virtual_file::VirtualFile; +use crate::walreceiver::IS_WAL_RECEIVER; +use crate::walredo::WalRedoManager; +use crate::CheckpointConfig; +use crate::{page_cache, storage_sync}; + +// Metrics collected on operations on the storage repository. +lazy_static! 
{ + pub static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( + "pageserver_storage_operations_seconds", + "Time spent on storage operations", + &["operation", "tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + +// Metrics collected on operations on the storage repository. +lazy_static! { + static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( + "pageserver_getpage_reconstruct_seconds", + "Time spent in reconstruct_value", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + +lazy_static! { + static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!( + "pageserver_materialized_cache_hits_total", + "Number of cache hits from materialized page cache", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); + static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!( + "pageserver_wait_lsn_seconds", + "Time spent waiting for WAL to arrive", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + +lazy_static! { + static ref LAST_RECORD_LSN: IntGaugeVec = register_int_gauge_vec!( + "pageserver_last_record_lsn", + "Last record LSN grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + +// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, +// or in testing they estimate how much we would upload if we did. +lazy_static! 
{ + static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!( + "pageserver_created_persistent_files_total", + "Number of files created that are meant to be uploaded to cloud storage", + ) + .expect("failed to define a metric"); + static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!( + "pageserver_written_persistent_bytes_total", + "Total bytes written that are meant to be uploaded to cloud storage", + ) + .expect("failed to define a metric"); +} + +#[derive(Clone)] +pub enum LayeredTimelineEntry { + Loaded(Arc), + Unloaded { + id: ZTimelineId, + metadata: TimelineMetadata, + init_logical_size: Option, + }, +} + +impl LayeredTimelineEntry { + fn timeline_id(&self) -> ZTimelineId { + match self { + LayeredTimelineEntry::Loaded(timeline) => timeline.timeline_id, + LayeredTimelineEntry::Unloaded { id, .. } => *id, + } + } + + pub fn ancestor_timeline_id(&self) -> Option { + match self { + LayeredTimelineEntry::Loaded(timeline) => { + timeline.ancestor_timeline.as_ref().map(|t| t.timeline_id()) + } + LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_timeline(), + } + } + + pub fn ancestor_lsn(&self) -> Lsn { + match self { + LayeredTimelineEntry::Loaded(timeline) => timeline.ancestor_lsn, + LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_lsn(), + } + } + + fn ensure_loaded(&self) -> anyhow::Result<&Arc> { + match self { + LayeredTimelineEntry::Loaded(timeline) => Ok(timeline), + LayeredTimelineEntry::Unloaded { .. } => { + anyhow::bail!("timeline is unloaded") + } + } + } + + pub fn layer_removal_guard(&self) -> Result>, anyhow::Error> { + match self { + LayeredTimelineEntry::Loaded(timeline) => timeline + .layer_removal_cs + .try_lock() + .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) + .map(Some), + + LayeredTimelineEntry::Unloaded { .. 
} => Ok(None), + } + } +} + +impl From for RepositoryTimeline { + fn from(entry: LayeredTimelineEntry) -> Self { + match entry { + LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _), + LayeredTimelineEntry::Unloaded { metadata, .. } => { + RepositoryTimeline::Unloaded { metadata } + } + } + } +} + +pub struct LayeredTimeline { + conf: &'static PageServerConf, + tenant_conf: Arc>, + + tenant_id: ZTenantId, + pub timeline_id: ZTimelineId, + + pub layers: RwLock, + + last_freeze_at: AtomicLsn, + + // WAL redo manager + walredo_mgr: Arc, + + // What page versions do we hold in the repository? If we get a + // request > last_record_lsn, we need to wait until we receive all + // the WAL up to the request. The SeqWait provides functions for + // that. TODO: If we get a request for an old LSN, such that the + // versions have already been garbage collected away, we should + // throw an error, but we don't track that currently. + // + // last_record_lsn.load().last points to the end of last processed WAL record. + // + // We also remember the starting point of the previous record in + // 'last_record_lsn.load().prev'. It's used to set the xl_prev pointer of the + // first WAL record when the node is started up. But here, we just + // keep track of it. + last_record_lsn: SeqWait, + + // All WAL records have been processed and stored durably on files on + // local disk, up to this LSN. On crash and restart, we need to re-process + // the WAL starting from this point. + // + // Some later WAL records might have been processed and also flushed to disk + // already, so don't be surprised to see some, but there's no guarantee on + // them yet. + disk_consistent_lsn: AtomicLsn, + + // Parent timeline that this timeline was branched from, and the LSN + // of the branch point. 
+ ancestor_timeline: Option, + ancestor_lsn: Lsn, + + // Metrics + reconstruct_time_histo: Histogram, + materialized_page_cache_hit_counter: IntCounter, + flush_time_histo: Histogram, + compact_time_histo: Histogram, + create_images_time_histo: Histogram, + last_record_gauge: IntGauge, + wait_lsn_time_histo: Histogram, + + /// If `true`, will backup its files that appear after each checkpointing to the remote storage. + upload_layers: AtomicBool, + + /// Ensures layers aren't frozen by checkpointer between + /// [`LayeredTimeline::get_layer_for_write`] and layer reads. + /// Locked automatically by [`LayeredTimelineWriter`] and checkpointer. + /// Must always be acquired before the layer map/individual layer lock + /// to avoid deadlock. + write_lock: Mutex<()>, + + /// Used to ensure that there is only one thread + layer_flush_lock: Mutex<()>, + + /// Layer removal lock. + /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. + /// This lock is acquired in [`LayeredTimeline::gc`], [`LayeredTimeline::compact`], + /// and [`LayeredRepository::delete_timeline`]. + layer_removal_cs: Mutex<()>, + + // Needed to ensure that we can't create a branch at a point that was already garbage collected + pub latest_gc_cutoff_lsn: RwLock, + + // List of child timelines and their branch points. This is needed to avoid + // garbage collecting data that is still needed by the child timelines. + pub gc_info: RwLock, + + // It may change across major versions so for simplicity + // keep it after running initdb for a timeline. + // It is needed in checks when we want to error on some operations + // when they are requested for pre-initdb lsn. + // It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn", + // though lets keep them both for better error visibility. + pub initdb_lsn: Lsn, + + // Initial logical size of timeline (if known). 
+ // Logical size can be copied from ancestor timeline when new branch is create at last LSN + pub init_logical_size: Option, +} + +/// +/// Information about how much history needs to be retained, needed by +/// Garbage Collection. +/// +pub struct GcInfo { + /// Specific LSNs that are needed. + /// + /// Currently, this includes all points where child branches have + /// been forked off from. In the future, could also include + /// explicit user-defined snapshot points. + pub retain_lsns: Vec, + + /// In addition to 'retain_lsns', keep everything newer than this + /// point. + /// + /// This is calculated by subtracting 'gc_horizon' setting from + /// last-record LSN + /// + /// FIXME: is this inclusive or exclusive? + pub horizon_cutoff: Lsn, + + /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this + /// point. + /// + /// This is calculated by finding a number such that a record is needed for PITR + /// if only if its LSN is larger than 'pitr_cutoff'. + pub pitr_cutoff: Lsn, +} + +/// Public interface functions +impl Timeline for LayeredTimeline { + fn get_ancestor_lsn(&self) -> Lsn { + self.ancestor_lsn + } + + fn get_ancestor_timeline_id(&self) -> Option { + self.ancestor_timeline + .as_ref() + .map(LayeredTimelineEntry::timeline_id) + } + + /// Wait until WAL has been received up to the given LSN. + fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + // This should never be called from the WAL receiver thread, because that could lead + // to a deadlock. 
+ ensure!( + !IS_WAL_RECEIVER.with(|c| c.get()), + "wait_lsn called by WAL receiver thread" + ); + + self.wait_lsn_time_histo.observe_closure_duration( + || self.last_record_lsn + .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) + .with_context(|| { + format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", + lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() + ) + }))?; + + Ok(()) + } + + fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard { + self.latest_gc_cutoff_lsn.read().unwrap() + } + + /// Look up the value with the given a key + fn get(&self, key: Key, lsn: Lsn) -> Result { + debug_assert!(lsn <= self.get_last_record_lsn()); + + // Check the page cache. We will get back the most recent page with lsn <= `lsn`. + // The cached image can be returned directly if there is no WAL between the cached image + // and requested LSN. The cached image can also be used to reduce the amount of WAL needed + // for redo. + let cached_page_img = match self.lookup_cached_page(&key, lsn) { + Some((cached_lsn, cached_img)) => { + match cached_lsn.cmp(&lsn) { + Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check + Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn + } + Some((cached_lsn, cached_img)) + } + None => None, + }; + + let mut reconstruct_state = ValueReconstructState { + records: Vec::new(), + img: cached_page_img, + }; + + self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; + + self.reconstruct_time_histo + .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) + } + + /// Public entry point for checkpoint(). All the logic is in the private + /// checkpoint_internal function, this public facade just wraps it for + /// metrics collection. 
+ fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { + match cconf { + CheckpointConfig::Flush => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true) + } + CheckpointConfig::Forced => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true)?; + self.compact() + } + } + } + + /// + /// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn. + /// + fn check_lsn_is_in_scope( + &self, + lsn: Lsn, + latest_gc_cutoff_lsn: &RwLockReadGuard, + ) -> Result<()> { + ensure!( + lsn >= **latest_gc_cutoff_lsn, + "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", + lsn, + **latest_gc_cutoff_lsn, + ); + Ok(()) + } + + fn get_last_record_lsn(&self) -> Lsn { + self.last_record_lsn.load().last + } + + fn get_prev_record_lsn(&self) -> Lsn { + self.last_record_lsn.load().prev + } + + fn get_last_record_rlsn(&self) -> RecordLsn { + self.last_record_lsn.load() + } + + fn get_disk_consistent_lsn(&self) -> Lsn { + self.disk_consistent_lsn.load() + } + + fn writer<'a>(&'a self) -> Box { + Box::new(LayeredTimelineWriter { + tl: self, + _write_guard: self.write_lock.lock().unwrap(), + }) + } +} + +impl LayeredTimeline { + fn get_checkpoint_distance(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .checkpoint_distance + .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) + } + + fn get_compaction_target_size(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_target_size + .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) + } + + fn get_compaction_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_threshold + .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) + } + + fn get_image_creation_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .image_creation_threshold + 
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) + } + + /// Open a Timeline handle. + /// + /// Loads the metadata for the timeline into memory, but not the layer map. + #[allow(clippy::too_many_arguments)] + pub fn new( + conf: &'static PageServerConf, + tenant_conf: Arc>, + metadata: TimelineMetadata, + ancestor: Option, + timeline_id: ZTimelineId, + tenant_id: ZTenantId, + walredo_mgr: Arc, + upload_layers: bool, + init_logical_size: Option, + ) -> LayeredTimeline { + let reconstruct_time_histo = RECONSTRUCT_TIME + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) + .unwrap(); + let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) + .unwrap(); + let flush_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "layer flush", + &tenant_id.to_string(), + &timeline_id.to_string(), + ]) + .unwrap(); + let compact_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "compact", + &tenant_id.to_string(), + &timeline_id.to_string(), + ]) + .unwrap(); + let create_images_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "create images", + &tenant_id.to_string(), + &timeline_id.to_string(), + ]) + .unwrap(); + let last_record_gauge = LAST_RECORD_LSN + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) + .unwrap(); + let wait_lsn_time_histo = WAIT_LSN_TIME + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) + .unwrap(); + + LayeredTimeline { + conf, + tenant_conf, + timeline_id, + tenant_id, + layers: RwLock::new(LayerMap::default()), + + walredo_mgr, + + // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. 
+ last_record_lsn: SeqWait::new(RecordLsn { + last: metadata.disk_consistent_lsn(), + prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), + }), + disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), + + last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0), + + ancestor_timeline: ancestor, + ancestor_lsn: metadata.ancestor_lsn(), + + reconstruct_time_histo, + materialized_page_cache_hit_counter, + flush_time_histo, + compact_time_histo, + create_images_time_histo, + last_record_gauge, + wait_lsn_time_histo, + + upload_layers: AtomicBool::new(upload_layers), + + write_lock: Mutex::new(()), + layer_flush_lock: Mutex::new(()), + layer_removal_cs: Mutex::new(()), + + gc_info: RwLock::new(GcInfo { + retain_lsns: Vec::new(), + horizon_cutoff: Lsn(0), + pitr_cutoff: Lsn(0), + }), + + latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), + initdb_lsn: metadata.initdb_lsn(), + init_logical_size, + } + } + + /// + /// Scan the timeline directory to populate the layer map. + /// Returns all timeline-related files that were found and loaded. + /// + pub fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { + let mut layers = self.layers.write().unwrap(); + let mut num_layers = 0; + + // Scan timeline directory and create ImageFileName and DeltaFilename + // structs representing all files on disk + let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + + for direntry in fs::read_dir(timeline_path)? { + let direntry = direntry?; + let fname = direntry.file_name(); + let fname = fname.to_string_lossy(); + + if let Some(imgfilename) = ImageFileName::parse_str(&fname) { + // create an ImageLayer struct for each image file. 
+ if imgfilename.lsn > disk_consistent_lsn { + warn!( + "found future image layer {} on timeline {} disk_consistent_lsn is {}", + imgfilename, self.timeline_id, disk_consistent_lsn + ); + + rename_to_backup(direntry.path())?; + continue; + } + + let layer = + ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); + + trace!("found layer {}", layer.filename().display()); + layers.insert_historic(Arc::new(layer)); + num_layers += 1; + } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { + // Create a DeltaLayer struct for each delta file. + // The end-LSN is exclusive, while disk_consistent_lsn is + // inclusive. For example, if disk_consistent_lsn is 100, it is + // OK for a delta layer to have end LSN 101, but if the end LSN + // is 102, then it might not have been fully flushed to disk + // before crash. + if deltafilename.lsn_range.end > disk_consistent_lsn + 1 { + warn!( + "found future delta layer {} on timeline {} disk_consistent_lsn is {}", + deltafilename, self.timeline_id, disk_consistent_lsn + ); + + rename_to_backup(direntry.path())?; + continue; + } + + let layer = + DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); + + trace!("found layer {}", layer.filename().display()); + layers.insert_historic(Arc::new(layer)); + num_layers += 1; + } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { + // ignore these + } else if is_ephemeral_file(&fname) { + // Delete any old ephemeral files + trace!("deleting old ephemeral file in timeline dir: {}", fname); + fs::remove_file(direntry.path())?; + } else { + warn!("unrecognized filename in timeline dir: {}", fname); + } + } + + layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); + + info!( + "loaded layer map with {} layers at {}", + num_layers, disk_consistent_lsn + ); + + Ok(()) + } + + /// + /// Get a handle to a Layer for reading. 
+ /// + /// The returned Layer might be from an ancestor timeline, if the + /// segment hasn't been updated on this timeline yet. + /// + /// This function takes the current timeline's locked LayerMap as an argument, + /// so callers can avoid potential race conditions. + fn get_reconstruct_data( + &self, + key: Key, + request_lsn: Lsn, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result<()> { + // Start from the current timeline. + let mut timeline_owned; + let mut timeline = self; + + // For debugging purposes, collect the path of layers that we traversed + // through. It's included in the error message if we fail to find the key. + let mut traversal_path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); + + let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { + *cached_lsn + } else { + Lsn(0) + }; + + // 'prev_lsn' tracks the last LSN that we were at in our search. It's used + // to check that each iteration make some progress, to break infinite + // looping if something goes wrong. + let mut prev_lsn = Lsn(u64::MAX); + + let mut result = ValueReconstructResult::Continue; + let mut cont_lsn = Lsn(request_lsn.0 + 1); + + 'outer: loop { + // The function should have updated 'state' + //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); + match result { + ValueReconstructResult::Complete => return Ok(()), + ValueReconstructResult::Continue => { + // If we reached an earlier cached page image, we're done. + if cont_lsn == cached_lsn + 1 { + self.materialized_page_cache_hit_counter.inc_by(1); + return Ok(()); + } + if prev_lsn <= cont_lsn { + // Didn't make any progress in last iteration. Error out to avoid + // getting stuck in the loop. 
+ return layer_traversal_error(format!( + "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", + key, + Lsn(cont_lsn.0 - 1), + request_lsn, + timeline.ancestor_lsn + ), traversal_path); + } + prev_lsn = cont_lsn; + } + ValueReconstructResult::Missing => { + return layer_traversal_error( + format!( + "could not find data for key {} at LSN {}, for request at LSN {}", + key, cont_lsn, request_lsn + ), + traversal_path, + ); + } + } + + // Recurse into ancestor if needed + if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { + trace!( + "going into ancestor {}, cont_lsn is {}", + timeline.ancestor_lsn, + cont_lsn + ); + let ancestor = timeline.get_ancestor_timeline()?; + timeline_owned = ancestor; + timeline = &*timeline_owned; + prev_lsn = Lsn(u64::MAX); + continue; + } + + let layers = timeline.layers.read().unwrap(); + + // Check the open and frozen in-memory layers first, in order from newest + // to oldest. + if let Some(open_layer) = &layers.open_layer { + let start_lsn = open_layer.get_lsn_range().start; + if cont_lsn > start_lsn { + //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); + // Get all the data needed to reconstruct the page version from this layer. + // But if we have an older cached page image, no need to go past that. 
+ let lsn_floor = max(cached_lsn + 1, start_lsn); + result = open_layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + )?; + cont_lsn = lsn_floor; + traversal_path.push((result, cont_lsn, open_layer.clone())); + continue; + } + } + for frozen_layer in layers.frozen_layers.iter().rev() { + let start_lsn = frozen_layer.get_lsn_range().start; + if cont_lsn > start_lsn { + //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); + let lsn_floor = max(cached_lsn + 1, start_lsn); + result = frozen_layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + )?; + cont_lsn = lsn_floor; + traversal_path.push((result, cont_lsn, frozen_layer.clone())); + continue 'outer; + } + } + + if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? { + //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); + + let lsn_floor = max(cached_lsn + 1, lsn_floor); + result = layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + )?; + cont_lsn = lsn_floor; + traversal_path.push((result, cont_lsn, layer)); + } else if timeline.ancestor_timeline.is_some() { + // Nothing on this timeline. Traverse to parent + result = ValueReconstructResult::Continue; + cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); + } else { + // Nothing found + result = ValueReconstructResult::Missing; + } + } + } + + fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> { + let cache = page_cache::get(); + + // FIXME: It's pointless to check the cache for things that are not 8kB pages. 
+ // We should look at the key to determine if it's a cacheable object + let (lsn, read_guard) = + cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?; + let img = Bytes::from(read_guard.to_vec()); + Some((lsn, img)) + } + + fn get_ancestor_timeline(&self) -> Result> { + let ancestor = self + .ancestor_timeline + .as_ref() + .with_context(|| { + format!( + "Ancestor is missing. Timeline id: {} Ancestor id {:?}", + self.timeline_id, + self.get_ancestor_timeline_id(), + ) + })? + .ensure_loaded() + .with_context(|| { + format!( + "Ancestor timeline is not loaded. Timeline id: {} Ancestor id {:?}", + self.timeline_id, + self.get_ancestor_timeline_id(), + ) + })?; + Ok(Arc::clone(ancestor)) + } + + /// + /// Get a handle to the latest layer for appending. + /// + fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { + let mut layers = self.layers.write().unwrap(); + + ensure!(lsn.is_aligned()); + + let last_record_lsn = self.get_last_record_lsn(); + ensure!( + lsn > last_record_lsn, + "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", + lsn, + last_record_lsn, + ); + + // Do we have a layer open for writing already? + let layer; + if let Some(open_layer) = &layers.open_layer { + if open_layer.get_lsn_range().start > lsn { + bail!("unexpected open layer in the future"); + } + + layer = Arc::clone(open_layer); + } else { + // No writeable layer yet. Create one. 
+ let start_lsn = layers.next_open_layer_at.unwrap(); + + trace!( + "creating layer for write at {}/{} for record at {}", + self.timeline_id, + start_lsn, + lsn + ); + let new_layer = + InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?; + let layer_rc = Arc::new(new_layer); + + layers.open_layer = Some(Arc::clone(&layer_rc)); + layers.next_open_layer_at = None; + + layer = layer_rc; + } + Ok(layer) + } + + fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { + //info!("PUT: key {} at {}", key, lsn); + let layer = self.get_layer_for_write(lsn)?; + layer.put_value(key, lsn, val)?; + Ok(()) + } + + fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> Result<()> { + let layer = self.get_layer_for_write(lsn)?; + layer.put_tombstone(key_range, lsn)?; + + Ok(()) + } + + fn finish_write(&self, new_lsn: Lsn) { + assert!(new_lsn.is_aligned()); + + self.last_record_gauge.set(new_lsn.0 as i64); + self.last_record_lsn.advance(new_lsn); + } + + fn freeze_inmem_layer(&self, write_lock_held: bool) { + // Freeze the current open in-memory layer. It will be written to disk on next + // iteration. + let _write_guard = if write_lock_held { + None + } else { + Some(self.write_lock.lock().unwrap()) + }; + let mut layers = self.layers.write().unwrap(); + if let Some(open_layer) = &layers.open_layer { + let open_layer_rc = Arc::clone(open_layer); + // Does this layer need freezing? + let end_lsn = Lsn(self.get_last_record_lsn().0 + 1); + open_layer.freeze(end_lsn); + + // The layer is no longer open, update the layer map to reflect this. + // We will replace it with on-disk historics below. + layers.frozen_layers.push_back(open_layer_rc); + layers.open_layer = None; + layers.next_open_layer_at = Some(end_lsn); + self.last_freeze_at.store(end_lsn); + } + drop(layers); + } + + /// + /// Check if more than 'checkpoint_distance' of WAL has been accumulated + /// in the in-memory layer, and initiate flushing it if so. 
+ /// + pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { + let last_lsn = self.get_last_record_lsn(); + let layers = self.layers.read().unwrap(); + if let Some(open_layer) = &layers.open_layer { + let open_layer_size = open_layer.size()?; + drop(layers); + let distance = last_lsn.widening_sub(self.last_freeze_at.load()); + // Checkpointing the open layer can be triggered by layer size or LSN range. + // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and + // we want to stay below that with a big margin. The LSN distance determines how + // much WAL the safekeepers need to store. + if distance >= self.get_checkpoint_distance().into() + || open_layer_size > self.get_checkpoint_distance() + { + info!( + "check_checkpoint_distance {}, layer size {}", + distance, open_layer_size + ); + + self.freeze_inmem_layer(true); + self.last_freeze_at.store(last_lsn); + + // Launch a thread to flush the frozen layer to disk, unless + // a thread was already running. (If the thread was running + // at the time that we froze the layer, it must've seen the + // the layer we just froze before it exited; see comments + // in flush_frozen_layers()) + if let Ok(guard) = self.layer_flush_lock.try_lock() { + drop(guard); + let self_clone = Arc::clone(self); + thread_mgr::spawn( + thread_mgr::ThreadKind::LayerFlushThread, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush thread", + false, + move || self_clone.flush_frozen_layers(false), + )?; + } + } + } + Ok(()) + } + + /// Flush all frozen layers to disk. + /// + /// Only one thread at a time can be doing layer-flushing for a + /// given timeline. If 'wait' is true, and another thread is + /// currently doing the flushing, this function will wait for it + /// to finish. If 'wait' is false, this function will return + /// immediately instead. 
+ fn flush_frozen_layers(&self, wait: bool) -> Result<()> { + let flush_lock_guard = if wait { + self.layer_flush_lock.lock().unwrap() + } else { + match self.layer_flush_lock.try_lock() { + Ok(guard) => guard, + Err(TryLockError::WouldBlock) => return Ok(()), + Err(TryLockError::Poisoned(err)) => panic!("{:?}", err), + } + }; + + let timer = self.flush_time_histo.start_timer(); + + loop { + let layers = self.layers.read().unwrap(); + if let Some(frozen_layer) = layers.frozen_layers.front() { + let frozen_layer = Arc::clone(frozen_layer); + drop(layers); // to allow concurrent reads and writes + self.flush_frozen_layer(frozen_layer)?; + } else { + // Drop the 'layer_flush_lock' *before* 'layers'. That + // way, if you freeze a layer, and then call + // flush_frozen_layers(false), it is guaranteed that + // if another thread was busy flushing layers and the + // call therefore returns immediately, the other + // thread will have seen the newly-frozen layer and + // will flush that too (assuming no errors). + drop(flush_lock_guard); + drop(layers); + break; + } + } + + timer.stop_and_record(); + + Ok(()) + } + + /// Flush one frozen in-memory layer to disk, as a new delta layer. + fn flush_frozen_layer(&self, frozen_layer: Arc) -> Result<()> { + // As a special case, when we have just imported an image into the repository, + // instead of writing out a L0 delta layer, we directly write out image layer + // files instead. This is possible as long as *all* the data imported into the + // repository have the same LSN. + let lsn_range = frozen_layer.get_lsn_range(); + let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn + && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) + { + let pgdir = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)?; + let (partitioning, _lsn) = + pgdir.repartition(self.initdb_lsn, self.get_compaction_target_size())?; + self.create_image_layers(&partitioning, self.initdb_lsn, true)? 
+ } else { + // normal case, write out a L0 delta layer file. + let delta_path = self.create_delta_layer(&frozen_layer)?; + HashSet::from([delta_path]) + }; + + fail_point!("flush-frozen-before-sync"); + + // The new on-disk layers are now in the layer map. We can remove the + // in-memory layer from the map now. + { + let mut layers = self.layers.write().unwrap(); + let l = layers.frozen_layers.pop_front(); + + // Only one thread may call this function at a time (for this + // timeline). If two threads tried to flush the same frozen + // layer to disk at the same time, that would not work. + assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); + + // release lock on 'layers' + } + + fail_point!("checkpoint-after-sync"); + + // Update the metadata file, with new 'disk_consistent_lsn' + // + // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing + // *all* the layers, to avoid fsyncing the file multiple times. + let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); + self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?; + + Ok(()) + } + + /// Update metadata file + fn update_disk_consistent_lsn( + &self, + disk_consistent_lsn: Lsn, + layer_paths_to_upload: HashSet, + ) -> Result<()> { + // If we were able to advance 'disk_consistent_lsn', save it the metadata file. + // After crash, we will restart WAL streaming and processing from that point. + let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); + if disk_consistent_lsn != old_disk_consistent_lsn { + assert!(disk_consistent_lsn > old_disk_consistent_lsn); + + // We can only save a valid 'prev_record_lsn' value on disk if we + // flushed *all* in-memory changes to disk. We only track + // 'prev_record_lsn' in memory for the latest processed record, so we + // don't remember what the correct value that corresponds to some old + // LSN is. But if we flush everything, then the value corresponding + // current 'last_record_lsn' is correct and we can store it on disk. 
+ let RecordLsn { + last: last_record_lsn, + prev: prev_record_lsn, + } = self.last_record_lsn.load(); + let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { + Some(prev_record_lsn) + } else { + None + }; + + let ancestor_timelineid = self + .ancestor_timeline + .as_ref() + .map(LayeredTimelineEntry::timeline_id); + + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + ondisk_prev_record_lsn, + ancestor_timelineid, + self.ancestor_lsn, + *self.latest_gc_cutoff_lsn.read().unwrap(), + self.initdb_lsn, + ); + + fail_point!("checkpoint-before-saving-metadata", |x| bail!( + "{}", + x.unwrap() + )); + + save_metadata( + self.conf, + self.timeline_id, + self.tenant_id, + &metadata, + false, + )?; + + if self.upload_layers.load(atomic::Ordering::Relaxed) { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + layer_paths_to_upload, + Some(metadata), + ); + } + + // Also update the in-memory copy + self.disk_consistent_lsn.store(disk_consistent_lsn); + } + + Ok(()) + } + + // Write out the given frozen in-memory layer as a new L0 delta file + fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result { + // Write it out + let new_delta = frozen_layer.write_to_disk()?; + let new_delta_path = new_delta.path(); + + // Sync it to disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // TODO: If we're running inside 'flush_frozen_layers' and there are multiple + // files to flush, it might be better to first write them all, and then fsync + // them all in parallel. 
+ par_fsync::par_fsync(&[ + new_delta_path.clone(), + self.conf.timeline_path(&self.timeline_id, &self.tenant_id), + ])?; + + // Add it to the layer map + { + let mut layers = self.layers.write().unwrap(); + layers.insert_historic(Arc::new(new_delta)); + } + + NUM_PERSISTENT_FILES_CREATED.inc_by(1); + PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); + + Ok(new_delta_path) + } + + pub fn compact(&self) -> Result<()> { + // + // High level strategy for compaction / image creation: + // + // 1. First, calculate the desired "partitioning" of the + // currently in-use key space. The goal is to partition the + // key space into roughly fixed-size chunks, but also take into + // account any existing image layers, and try to align the + // chunk boundaries with the existing image layers to avoid + // too much churn. Also try to align chunk boundaries with + // relation boundaries. In principle, we don't know about + // relation boundaries here, we just deal with key-value + // pairs, and the code in pgdatadir_mapping.rs knows how to + // map relations into key-value pairs. But in practice we know + // that 'field6' is the block number, and the fields 1-5 + // identify a relation. This is just an optimization, + // though. + // + // 2. Once we know the partitioning, for each partition, + // decide if it's time to create a new image layer. The + // criteria is: there has been too much "churn" since the last + // image layer? The "churn" is fuzzy concept, it's a + // combination of too many delta files, or too much WAL in + // total in the delta file. Or perhaps: if creating an image + // file would allow to delete some older files. + // + // 3. After that, we compact all level0 delta files if there + // are too many of them. While compacting, we also garbage + // collect any page versions that are no longer needed because + // of the new image layers we created in step 2. + // + // TODO: This high level strategy hasn't been implemented yet. 
+ // Below are functions compact_level0() and create_image_layers() + // but they are a bit ad hoc and don't quite work like it's explained + // above. Rewrite it. + let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); + + let target_file_size = self.get_checkpoint_distance(); + + // Define partitioning schema if needed + if let Ok(pgdir) = + tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) + { + // 2. Create new image layers for partitions that have been modified + // "enough". + let (partitioning, lsn) = pgdir.repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + )?; + let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + if !layer_paths_to_upload.is_empty() + && self.upload_layers.load(atomic::Ordering::Relaxed) + { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + HashSet::from_iter(layer_paths_to_upload), + None, + ); + } + + // 3. Compact + let timer = self.compact_time_histo.start_timer(); + self.compact_level0(target_file_size)?; + timer.stop_and_record(); + } else { + debug!("Could not compact because no partitioning specified yet"); + } + + Ok(()) + } + + // Is it time to create a new image layer for the given partition? + fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result { + let layers = self.layers.read().unwrap(); + + for part_range in &partition.ranges { + let image_coverage = layers.image_coverage(part_range, lsn)?; + for (img_range, last_img) in image_coverage { + let img_lsn = if let Some(last_img) = last_img { + last_img.get_lsn_range().end + } else { + Lsn(0) + }; + // Let's consider an example: + // + // delta layer with LSN range 71-81 + // delta layer with LSN range 81-91 + // delta layer with LSN range 91-101 + // image layer at LSN 100 + // + // If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer, + // there's no need to create a new one. 
We check this case explicitly, to avoid passing + // a bogus range to count_deltas below, with start > end. It's even possible that there + // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed + // after we read last_record_lsn, which is passed here in the 'lsn' argument. + if img_lsn < lsn { + let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; + + debug!( + "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", + img_range.start, img_range.end, num_deltas, img_lsn, lsn + ); + if num_deltas >= self.get_image_creation_threshold() { + return Ok(true); + } + } + } + } + + Ok(false) + } + + fn create_image_layers( + &self, + partitioning: &KeyPartitioning, + lsn: Lsn, + force: bool, + ) -> Result> { + let timer = self.create_images_time_histo.start_timer(); + let mut image_layers: Vec = Vec::new(); + let mut layer_paths_to_upload = HashSet::new(); + for partition in partitioning.parts.iter() { + if force || self.time_for_new_image_layer(partition, lsn)? { + let img_range = + partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_id, + &img_range, + lsn, + )?; + + for range in &partition.ranges { + let mut key = range.start; + while key < range.end { + let img = self.get(key, lsn)?; + image_layer_writer.put_image(key, &img)?; + key = key.next(); + } + } + let image_layer = image_layer_writer.finish()?; + layer_paths_to_upload.insert(image_layer.path()); + image_layers.push(image_layer); + } + } + + // Sync the new layer to disk before adding it to the layer map, to make sure + // we don't garbage collect something based on the new layer, before it has + // reached the disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // Compaction creates multiple image layers. 
It would be better to create them all + // and fsync them all in parallel. + let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone()); + all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); + par_fsync::par_fsync(&all_paths)?; + + let mut layers = self.layers.write().unwrap(); + for l in image_layers { + layers.insert_historic(Arc::new(l)); + } + drop(layers); + timer.stop_and_record(); + + Ok(layer_paths_to_upload) + } + + /// + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as + /// as Level 1 files. + /// + fn compact_level0(&self, target_file_size: u64) -> Result<()> { + let layers = self.layers.read().unwrap(); + let mut level0_deltas = layers.get_level0_deltas()?; + drop(layers); + + // Only compact if enough layers have accumulated. + if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() { + return Ok(()); + } + + // Gather the files to compact in this iteration. + // + // Start with the oldest Level 0 delta file, and collect any other + // level 0 files that form a contiguous sequence, such that the end + // LSN of previous file matches the start LSN of the next file. + // + // Note that if the files don't form such a sequence, we might + // "compact" just a single file. That's a bit pointless, but it allows + // us to get rid of the level 0 file, and compact the other files on + // the next iteration. This could probably made smarter, but such + // "gaps" in the sequence of level 0 files should only happen in case + // of a crash, partial download from cloud storage, or something like + // that, so it's not a big deal in practice. 
+ level0_deltas.sort_by_key(|l| l.get_lsn_range().start); + let mut level0_deltas_iter = level0_deltas.iter(); + + let first_level0_delta = level0_deltas_iter.next().unwrap(); + let mut prev_lsn_end = first_level0_delta.get_lsn_range().end; + let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)]; + for l in level0_deltas_iter { + let lsn_range = l.get_lsn_range(); + + if lsn_range.start != prev_lsn_end { + break; + } + deltas_to_compact.push(Arc::clone(l)); + prev_lsn_end = lsn_range.end; + } + let lsn_range = Range { + start: deltas_to_compact.first().unwrap().get_lsn_range().start, + end: deltas_to_compact.last().unwrap().get_lsn_range().end, + }; + + info!( + "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", + lsn_range.start, + lsn_range.end, + deltas_to_compact.len(), + level0_deltas.len() + ); + for l in deltas_to_compact.iter() { + info!("compact includes {}", l.filename().display()); + } + // We don't need the original list of layers anymore. Drop it so that + // we don't accidentally use it later in the function. + drop(level0_deltas); + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. 
+ let all_values_iter = deltas_to_compact + .iter() + .map(|l| l.iter()) + .kmerge_by(|a, b| { + if let Ok((a_key, a_lsn, _)) = a { + if let Ok((b_key, b_lsn, _)) = b { + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + } else { + false + } + } else { + true + } + }); + + // This iterator walks through all keys and is needed to calculate size used by each key + let mut all_keys_iter = deltas_to_compact + .iter() + .map(|l| l.key_iter()) + .kmerge_by(|a, b| { + let (a_key, a_lsn, _) = a; + let (b_key, b_lsn, _) = b; + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + }); + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. + // It's possible that there is a single key with so many page versions that storing all of them in a single layer file + // would be too large. In that case, we also split on the LSN dimension. 
+ // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. + let mut new_layers = Vec::new(); + let mut prev_key: Option = None; + let mut writer: Option = None; + let mut key_values_total_size = 0u64; + let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key + let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key + for x in all_values_iter { + let (key, lsn, value) = x?; + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + // We need to check key boundaries once we reach next key or end of layer with the same key + if !same_key || lsn == dup_end_lsn { + let mut next_key_size = 0u64; + let is_dup_layer = dup_end_lsn.is_valid(); + dup_start_lsn = Lsn::INVALID; + if !same_key { + dup_end_lsn = Lsn::INVALID; + } + // Determine size occupied by this key. 
We stop at next key, or when size becomes larger than target_file_size + for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { + next_key_size = next_size; + if key != next_key { + if dup_end_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + break; + } + key_values_total_size += next_size; + if key_values_total_size > target_file_size { + // split key between multiple layers: such layer can contain only single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn + } else { + lsn + }; + dup_end_lsn = next_lsn; + break; + } + } + // handle case when loop reaches last key + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + // check if key cause layer overflow + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + { + new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?); + writer = None; + } + } + key_values_total_size = next_key_size; + } + if writer.is_none() { + writer = Some(DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_id, + key, + if dup_end_lsn.is_valid() { + // this is a layer containing slice of values of the same key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, + )?); + } + writer.as_mut().unwrap().put_value(key, lsn, value)?; + prev_key = Some(key); + } + if let Some(writer) = writer { + new_layers.push(writer.finish(prev_key.unwrap().next())?); + } + + // Sync layers + if !new_layers.is_empty() { + let mut layer_paths: Vec = new_layers.iter().map(|l| l.path()).collect(); + + // also sync the directory + layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); + + // Fsync all 
the layer files and directory using multiple threads to + // minimize latency. + par_fsync::par_fsync(&layer_paths)?; + + layer_paths.pop().unwrap(); + } + + let mut layers = self.layers.write().unwrap(); + let mut new_layer_paths = HashSet::with_capacity(new_layers.len()); + for l in new_layers { + new_layer_paths.insert(l.path()); + layers.insert_historic(Arc::new(l)); + } + + // Now that we have reshuffled the data to set of new delta layers, we can + // delete the old ones + let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len()); + for l in &deltas_to_compact { + l.delete()?; + if let Some(path) = l.local_path() { + layer_paths_do_delete.insert(path); + } + layers.remove_historic(l.clone()); + } + drop(layers); + + if self.upload_layers.load(atomic::Ordering::Relaxed) { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + new_layer_paths, + None, + ); + storage_sync::schedule_layer_delete( + self.tenant_id, + self.timeline_id, + layer_paths_do_delete, + ); + } + + Ok(()) + } + + /// Update information about which layer files need to be retained on + /// garbage collection. This is separate from actually performing the GC, + /// and is updated more frequently, so that compaction can remove obsolete + /// page versions more aggressively. + /// + /// TODO: that's wishful thinking, compaction doesn't actually do that + /// currently. + /// + /// The caller specifies how much history is needed with the 3 arguments: + /// + /// retain_lsns: keep a version of each page at these LSNs + /// cutoff_horizon: also keep everything newer than this LSN + /// pitr: the time duration required to keep data for PITR + /// + /// The 'retain_lsns' list is currently used to prevent removing files that + /// are needed by child timelines. In the future, the user might be able to + /// name additional points in time to retain. The caller is responsible for + /// collecting that information. 
+ /// + /// The 'cutoff_horizon' point is used to retain recent versions that might still be + /// needed by read-only nodes. (As of this writing, the caller just passes + /// the latest LSN subtracted by a constant, and doesn't do anything smart + /// to figure out what read-only nodes might actually need.) + /// + /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine + /// whether a record is needed for PITR. + pub fn update_gc_info( + &self, + retain_lsns: Vec, + cutoff_horizon: Lsn, + pitr: Duration, + ) -> Result<()> { + let mut gc_info = self.gc_info.write().unwrap(); + + gc_info.horizon_cutoff = cutoff_horizon; + gc_info.retain_lsns = retain_lsns; + + // Calculate pitr cutoff point. + // If we cannot determine a cutoff LSN, be conservative and don't GC anything. + let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn(); + + if let Ok(timeline) = + tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) + { + let now = SystemTime::now(); + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. + // If we don't have enough data to convert to LSN, + // play safe and don't remove any layers. + if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { + let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); + + match timeline.find_lsn_for_timestamp(pitr_timestamp)? { + LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, + LsnForTimestamp::Future(lsn) => { + debug!("future({})", lsn); + pitr_cutoff_lsn = gc_info.horizon_cutoff; + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + } + } + debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) + } + } else if cfg!(test) { + // We don't have local timeline in mocked cargo tests. + // So, just ignore pitr_interval setting in this case. 
+ pitr_cutoff_lsn = gc_info.horizon_cutoff; + } + gc_info.pitr_cutoff = pitr_cutoff_lsn; + + Ok(()) + } + + /// + /// Garbage collect layer files on a timeline that are no longer needed. + /// + /// Currently, we don't make any attempt at removing unneeded page versions + /// within a layer file. We can only remove the whole file if it's fully + /// obsolete. + /// + pub fn gc(&self) -> Result { + let mut result: GcResult = Default::default(); + let now = SystemTime::now(); + + fail_point!("before-timeline-gc"); + + let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); + + let gc_info = self.gc_info.read().unwrap(); + + let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); + let pitr_cutoff = gc_info.pitr_cutoff; + let retain_lsns = &gc_info.retain_lsns; + + let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + + // Nothing to GC. Return early. + let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); + if latest_gc_cutoff >= new_gc_cutoff { + info!( + "Nothing to GC for timeline {}: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", + self.timeline_id + ); + return Ok(result); + } + + let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered(); + + // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. + // See branch_timeline() for details. + *self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff; + + info!("GC starting"); + + debug!("retain_lsns: {:?}", retain_lsns); + + let mut layers_to_remove = Vec::new(); + + // Scan all on-disk layers in the timeline. + // + // Garbage collect the layer if all conditions are satisfied: + // 1. it is older than cutoff LSN; + // 2. it is older than PITR interval; + // 3. it doesn't need to be retained for 'retain_lsns'; + // 4. 
newer on-disk image layers cover the layer's whole key range + // + let mut layers = self.layers.write().unwrap(); + 'outer: for l in layers.iter_historic_layers() { + // This layer is in the process of being flushed to disk. + // It will be swapped out of the layer map, replaced with + // on-disk layers containing the same data. + // We can't GC it, as it's not on disk. We can't remove it + // from the layer map yet, as it would make its data + // inaccessible. + if l.is_in_memory() { + continue; + } + + result.layers_total += 1; + + // 1. Is it newer than GC horizon cutoff point? + if l.get_lsn_range().end > horizon_cutoff { + debug!( + "keeping {} because it's newer than horizon_cutoff {}", + l.filename().display(), + horizon_cutoff + ); + result.layers_needed_by_cutoff += 1; + continue 'outer; + } + + // 2. It is newer than PiTR cutoff point? + if l.get_lsn_range().end > pitr_cutoff { + debug!( + "keeping {} because it's newer than pitr_cutoff {}", + l.filename().display(), + pitr_cutoff + ); + result.layers_needed_by_pitr += 1; + continue 'outer; + } + + // 3. Is it needed by a child branch? + // NOTE With that we would keep data that + // might be referenced by child branches forever. + // We can track this in child timeline GC and delete parent layers when + // they are no longer needed. This might be complicated with long inheritance chains. + for retain_lsn in retain_lsns { + // start_lsn is inclusive + if &l.get_lsn_range().start <= retain_lsn { + debug!( + "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", + l.filename().display(), + retain_lsn, + l.is_incremental(), + ); + result.layers_needed_by_branches += 1; + continue 'outer; + } + } + + // 4. Is there a later on-disk layer for this relation? + // + // The end-LSN is exclusive, while disk_consistent_lsn is + // inclusive. 
For example, if disk_consistent_lsn is 100, it is + // OK for a delta layer to have end LSN 101, but if the end LSN + // is 102, then it might not have been fully flushed to disk + // before crash. + // + // For example, imagine that the following layers exist: + // + // 1000 - image (A) + // 1000-2000 - delta (B) + // 2000 - image (C) + // 2000-3000 - delta (D) + // 3000 - image (E) + // + // If GC horizon is at 2500, we can remove layers A and B, but + // we cannot remove C, even though it's older than 2500, because + // the delta layer 2000-3000 depends on it. + if !layers + .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? + { + debug!( + "keeping {} because it is the latest layer", + l.filename().display() + ); + result.layers_not_updated += 1; + continue 'outer; + } + + // We didn't find any reason to keep this file, so remove it. + debug!( + "garbage collecting {} is_dropped: xx is_incremental: {}", + l.filename().display(), + l.is_incremental(), + ); + layers_to_remove.push(Arc::clone(l)); + } + + // Actually delete the layers from disk and remove them from the map. + // (couldn't do this in the loop above, because you cannot modify a collection + // while iterating it. BTreeMap::retain() would be another option) + let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); + for doomed_layer in layers_to_remove { + doomed_layer.delete()?; + if let Some(path) = doomed_layer.local_path() { + layer_paths_to_delete.insert(path); + } + layers.remove_historic(doomed_layer); + result.layers_removed += 1; + } + + if self.upload_layers.load(atomic::Ordering::Relaxed) { + storage_sync::schedule_layer_delete( + self.tenant_id, + self.timeline_id, + layer_paths_to_delete, + ); + } + + result.elapsed = now.elapsed()?; + Ok(result) + } + + /// + /// Reconstruct a value, using the given base image and WAL records in 'data'. 
+ /// + fn reconstruct_value( + &self, + key: Key, + request_lsn: Lsn, + mut data: ValueReconstructState, + ) -> Result { + // Perform WAL redo if needed + data.records.reverse(); + + // If we have a page image, and no WAL, we're all set + if data.records.is_empty() { + if let Some((img_lsn, img)) = &data.img { + trace!( + "found page image for key {} at {}, no WAL redo required", + key, + img_lsn + ); + Ok(img.clone()) + } else { + bail!("base image for {} at {} not found", key, request_lsn); + } + } else { + // We need to do WAL redo. + // + // If we don't have a base image, then the oldest WAL record better initialize + // the page + if data.img.is_none() && !data.records.first().unwrap().1.will_init() { + bail!( + "Base image for {} at {} not found, but got {} WAL records", + key, + request_lsn, + data.records.len() + ); + } else { + let base_img = if let Some((_lsn, img)) = data.img { + trace!( + "found {} WAL records and a base image for {} at {}, performing WAL redo", + data.records.len(), + key, + request_lsn + ); + Some(img) + } else { + trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); + None + }; + + let last_rec_lsn = data.records.last().unwrap().0; + + let img = + self.walredo_mgr + .request_redo(key, request_lsn, base_img, data.records)?; + + if img.len() == page_cache::PAGE_SZ { + let cache = page_cache::get(); + cache.memorize_materialized_page( + self.tenant_id, + self.timeline_id, + key, + last_rec_lsn, + &img, + ); + } + + Ok(img) + } + } + } +} + +/// Helper function for get_reconstruct_data() to add the path of layers traversed +/// to an error, as anyhow context information. +fn layer_traversal_error( + msg: String, + path: Vec<(ValueReconstructResult, Lsn, Arc)>, +) -> anyhow::Result<()> { + // We want the original 'msg' to be the outermost context. The outermost context + // is the most high-level information, which also gets propagated to the client. 
+ let mut msg_iter = path + .iter() + .map(|(r, c, l)| { + format!( + "layer traversal: result {:?}, cont_lsn {}, layer: {}", + r, + c, + l.filename().display() + ) + }) + .chain(std::iter::once(msg)); + // Construct initial message from the first traversed layer + let err = anyhow!(msg_iter.next().unwrap()); + + // Append all subsequent traversals, and the error message 'msg', as contexts. + Err(msg_iter.fold(err, |err, msg| err.context(msg))) +} + +struct LayeredTimelineWriter<'a> { + tl: &'a LayeredTimeline, + _write_guard: MutexGuard<'a, ()>, +} + +impl Deref for LayeredTimelineWriter<'_> { + type Target = dyn Timeline; + + fn deref(&self) -> &Self::Target { + self.tl + } +} + +impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { + fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> { + self.tl.put_value(key, lsn, value) + } + + fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()> { + self.tl.put_tombstone(key_range, lsn) + } + + /// + /// Remember the (end of) last valid WAL record remembered in the timeline. + /// + fn finish_write(&self, new_lsn: Lsn) { + self.tl.finish_write(new_lsn); + } +} + +/// Add a suffix to a layer file's name: .{num}.old +/// Uses the first available num (starts at 0) +fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { + let filename = path + .file_name() + .ok_or_else(|| anyhow!("Path {} don't have a file name", path.display()))? + .to_string_lossy(); + let mut new_path = path.clone(); + + for i in 0u32.. 
{ + new_path.set_file_name(format!("{}.{}.old", filename, i)); + if !new_path.exists() { + std::fs::rename(&path, &new_path)?; + return Ok(()); + } + } + + bail!("couldn't find an unused backup number for {:?}", path) +} + +/// Save timeline metadata to file +pub fn save_metadata( + conf: &'static PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, + data: &TimelineMetadata, + first_save: bool, +) -> Result<()> { + let _enter = info_span!("saving metadata").entered(); + let path = metadata_path(conf, timelineid, tenantid); + // use OpenOptions to ensure file presence is consistent with first_save + let mut file = VirtualFile::open_with_options( + &path, + OpenOptions::new().write(true).create_new(first_save), + )?; + + let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; + + if file.write(&metadata_bytes)? != metadata_bytes.len() { + bail!("Could not write all the metadata bytes in a single call"); + } + file.sync_all()?; + + // fsync the parent directory to ensure the directory entry is durable + if first_save { + let timeline_dir = File::open( + &path + .parent() + .expect("Metadata should always have a parent dir"), + )?; + timeline_dir.sync_all()?; + } + + Ok(()) +} diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index ac5fb0bc8c..fe1ba4b5bb 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -176,7 +176,6 @@ use crate::{ layered_repository::{ ephemeral_file::is_ephemeral_file, metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, - LayeredRepository, }, storage_sync::{self, index::RemoteIndex}, tenant_mgr::attach_downloaded_tenants, @@ -1257,7 +1256,13 @@ async fn update_local_metadata( timeline_id, } = sync_id; tokio::task::spawn_blocking(move || { - LayeredRepository::save_metadata(conf, timeline_id, tenant_id, &cloned_metadata, true) + crate::layered_repository::save_metadata( + conf, + timeline_id, + tenant_id, + &cloned_metadata, + true, + ) }) 
.await .with_context(|| { From 5a4394a8df1a573a3e3fa27bcd7a792f4f00139d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 26 Jul 2022 22:21:05 +0300 Subject: [PATCH 23/29] Do not hold timelines lock while calling update_gc_info to avoid recursive mutex lock and so deadlock (#2163) --- pageserver/src/layered_repository.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index ff230ed3c3..d770e736e9 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -752,7 +752,7 @@ impl LayeredRepository { // grab mutex to prevent new timelines from being created here. let gc_cs = self.gc_cs.lock().unwrap(); - let mut timelines = self.timelines.lock().unwrap(); + let timelines = self.timelines.lock().unwrap(); // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. @@ -789,15 +789,14 @@ impl LayeredRepository { }) .collect::>() }; + drop(timelines); // Ok, we now know all the branch points. // Update the GC information for each timeline. let mut gc_timelines = Vec::with_capacity(timeline_ids.len()); for timeline_id in timeline_ids { // Timeline is known to be local and loaded. - let timeline = self - .get_timeline_load_internal(timeline_id, &mut *timelines)? - .expect("checked above that timeline is local and loaded"); + let timeline = self.get_timeline_load(timeline_id)?; // If target_timeline is specified, ignore all other timelines if let Some(target_timelineid) = target_timeline_id { @@ -819,7 +818,6 @@ impl LayeredRepository { gc_timelines.push(timeline); } } - drop(timelines); drop(gc_cs); // Perform GC for each timeline. From d6f12cff8e1dd858a961369d6caf1e8ea8345759 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 27 Jul 2022 10:26:21 +0300 Subject: [PATCH 24/29] Make DatadirTimeline a trait, implemented by LayeredTimeline.
Previously DatadirTimeline was a separate struct, and there was a 1:1 relationship between each DatadirTimeline and LayeredTimeline. That was a bit awkward; whenever you created a timeline, you also needed to create the DatadirTimeline wrapper around it, and if you only had a reference to the LayeredTimeline, you would need to look up the corresponding DatadirTimeline struct through tenant_mgr::get_local_timeline_with_load(). There were a couple of calls like that from LayeredTimeline itself. Refactor DatadirTimeline, so that it's a trait, and mark LayeredTimeline as implementing that trait. That way, there's only one object, LayeredTimeline, and you can call both Timeline and DatadirTimeline functions on that. You can now also call DatadirTimeline functions from LayeredTimeline itself. I considered just moving all the functions from DatadirTimeline directly to Timeline/LayeredTimeline, but I still like to have some separation. Timeline provides a simple key-value API, and handles durably storing key/value pairs, and branching, whereas DatadirTimeline is stateless, and provides an abstraction over the key-value store, to present an interface with relations, databases, and other Postgres concepts. This simplified the logical size calculation fast-path for branch creation, introduced in commit 28243d68e6. LayeredTimeline can now access the ancestor's logical size directly, so it doesn't need the caller to pass it to it. I moved the fast-path to the init_logical_size() function itself. It now checks if the ancestor's last LSN is the same as the branch point, i.e. if there haven't been any changes on the ancestor after the branch, and copies the size from there. An additional bonus is that the optimization will now work any time you have a branch of another branch, with no changes on the ancestor, not only at a create-branch command.
--- pageserver/src/basebackup.rs | 26 +- pageserver/src/import_datadir.rs | 32 ++- pageserver/src/layered_repository.rs | 38 +-- pageserver/src/layered_repository/timeline.rs | 206 ++++++++++----- pageserver/src/lib.rs | 3 +- pageserver/src/page_service.rs | 58 ++--- pageserver/src/pgdatadir_mapping.rs | 243 ++++++------------ pageserver/src/repository.rs | 4 +- pageserver/src/tenant_mgr.rs | 28 +- pageserver/src/timelines.rs | 38 ++- pageserver/src/walingest.rs | 72 +++--- .../src/walreceiver/connection_manager.rs | 20 +- .../src/walreceiver/walreceiver_connection.rs | 5 +- 13 files changed, 361 insertions(+), 412 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 3ec1ec9243..5837447ce8 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -23,8 +23,7 @@ use tar::{Builder, EntryType, Header}; use tracing::*; use crate::reltag::{RelTag, SlruKind}; -use crate::repository::Timeline; -use crate::DatadirTimelineImpl; +use crate::DatadirTimeline; use postgres_ffi::xlog_utils::*; use postgres_ffi::*; use utils::lsn::Lsn; @@ -32,12 +31,13 @@ use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. -pub struct Basebackup<'a, W> +pub struct Basebackup<'a, W, T> where W: Write, + T: DatadirTimeline, { ar: Builder>, - timeline: &'a Arc, + timeline: &'a Arc, pub lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, @@ -52,17 +52,18 @@ where // * When working without safekeepers. In this situation it is important to match the lsn // we are taking basebackup on with the lsn that is used in pageserver's walreceiver // to start the replication. 
-impl<'a, W> Basebackup<'a, W> +impl<'a, W, T> Basebackup<'a, W, T> where W: Write, + T: DatadirTimeline, { pub fn new( write: W, - timeline: &'a Arc, + timeline: &'a Arc, req_lsn: Option, prev_lsn: Option, full_backup: bool, - ) -> Result> { + ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). We include prev_record_lsn in the @@ -79,13 +80,13 @@ where let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", req_lsn); - timeline.tline.wait_lsn(req_lsn)?; + timeline.wait_lsn(req_lsn)?; // If the requested point is the end of the timeline, we can // provide prev_lsn. (get_last_record_rlsn() might return it as // zero, though, if no WAL has been generated on this timeline // yet.) - let end_of_timeline = timeline.tline.get_last_record_rlsn(); + let end_of_timeline = timeline.get_last_record_rlsn(); if req_lsn == end_of_timeline.last { (end_of_timeline.prev, req_lsn) } else { @@ -93,7 +94,7 @@ where } } else { // Backup was requested at end of the timeline. - let end_of_timeline = timeline.tline.get_last_record_rlsn(); + let end_of_timeline = timeline.get_last_record_rlsn(); (end_of_timeline.prev, end_of_timeline.last) }; @@ -371,7 +372,7 @@ where // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.tline.get_ancestor_lsn() { + if self.lsn == self.timeline.get_ancestor_lsn() { write!(zenith_signal, "PREV LSN: none")?; } else { write!(zenith_signal, "PREV LSN: invalid")?; @@ -402,9 +403,10 @@ where } } -impl<'a, W> Drop for Basebackup<'a, W> +impl<'a, W, T> Drop for Basebackup<'a, W, T> where W: Write, + T: DatadirTimeline, { /// If the basebackup was not finished, prevent the Archive::drop() from /// writing the end-of-archive marker. 
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 6402657e05..ccfd83400a 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -13,8 +13,6 @@ use walkdir::WalkDir; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; -use crate::repository::Repository; -use crate::repository::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; use postgres_ffi::relfile_utils::*; @@ -30,9 +28,9 @@ use utils::lsn::Lsn; /// This is currently only used to import a cluster freshly created by initdb. /// The code that deals with the checkpoint would not work right if the /// cluster was not shut down cleanly. -pub fn import_timeline_from_postgres_datadir( +pub fn import_timeline_from_postgres_datadir( path: &Path, - tline: &mut DatadirTimeline, + tline: &T, lsn: Lsn, ) -> Result<()> { let mut pg_control: Option = None; @@ -90,8 +88,8 @@ pub fn import_timeline_from_postgres_datadir( } // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. -fn import_rel( - modification: &mut DatadirModification, +fn import_rel( + modification: &mut DatadirModification, path: &Path, spcoid: Oid, dboid: Oid, @@ -170,8 +168,8 @@ fn import_rel( /// Import an SLRU segment file /// -fn import_slru( - modification: &mut DatadirModification, +fn import_slru( + modification: &mut DatadirModification, slru: SlruKind, path: &Path, mut reader: Reader, @@ -226,9 +224,9 @@ fn import_slru( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. 
-fn import_wal( +fn import_wal( walpath: &Path, - tline: &mut DatadirTimeline, + tline: &T, startpoint: Lsn, endpoint: Lsn, ) -> Result<()> { @@ -297,8 +295,8 @@ fn import_wal( Ok(()) } -pub fn import_basebackup_from_tar( - tline: &mut DatadirTimeline, +pub fn import_basebackup_from_tar( + tline: &T, reader: Reader, base_lsn: Lsn, ) -> Result<()> { @@ -339,8 +337,8 @@ pub fn import_basebackup_from_tar( Ok(()) } -pub fn import_wal_from_tar( - tline: &mut DatadirTimeline, +pub fn import_wal_from_tar( + tline: &T, reader: Reader, start_lsn: Lsn, end_lsn: Lsn, @@ -420,8 +418,8 @@ pub fn import_wal_from_tar( Ok(()) } -pub fn import_file( - modification: &mut DatadirModification, +pub fn import_file( + modification: &mut DatadirModification, file_path: &Path, reader: Reader, len: usize, @@ -540,7 +538,7 @@ pub fn import_file( // zenith.signal is not necessarily the last file, that we handle // but it is ok to call `finish_write()`, because final `modification.commit()` // will update lsn once more to the final one. 
- let writer = modification.tline.tline.writer(); + let writer = modification.tline.writer(); writer.finish_write(prev_lsn); debug!("imported zenith signal {}", prev_lsn); diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index d770e736e9..c500b05e66 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -32,7 +32,6 @@ use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline}; -use crate::tenant_mgr; use crate::thread_mgr; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; @@ -181,7 +180,6 @@ impl Repository for LayeredRepository { self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, - None, ); timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); @@ -246,20 +244,6 @@ impl Repository for LayeredRepository { )); } } - // Copy logical size from source timeline if we are branching on the last position. - let init_logical_size = - if let Ok(src_pgdir) = tenant_mgr::get_local_timeline_with_load(self.tenant_id, src) { - let logical_size = src_pgdir.get_current_logical_size(); - // Check LSN after getting logical size to exclude race condition - // when ancestor timeline is concurrently updated - if src_timeline.get_last_record_lsn() == start_lsn { - Some(logical_size) - } else { - None - } - } else { - None - }; // Determine prev-LSN for the new timeline. We can only determine it if // the timeline was branched at the current end of the source timeline. 
@@ -290,14 +274,7 @@ impl Repository for LayeredRepository { ); crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; timeline::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; - timelines.insert( - dst, - LayeredTimelineEntry::Unloaded { - id: dst, - metadata, - init_logical_size, - }, - ); + timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata }); info!("branched timeline {} from {} at {}", dst, src, start_lsn); @@ -433,7 +410,7 @@ impl Repository for LayeredRepository { // we need to get metadata of a timeline, another option is to pass it along with Downloaded status let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; // finally we make newly downloaded timeline visible to repository - entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, init_logical_size: None }) + entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata }) }, }; Ok(()) @@ -551,18 +528,13 @@ impl LayeredRepository { timelineid: ZTimelineId, timelines: &mut HashMap, ) -> anyhow::Result>> { - let logical_size: Option; match timelines.get(&timelineid) { Some(entry) => match entry { LayeredTimelineEntry::Loaded(local_timeline) => { debug!("timeline {} found loaded into memory", &timelineid); return Ok(Some(Arc::clone(local_timeline))); } - LayeredTimelineEntry::Unloaded { - init_logical_size, .. - } => { - logical_size = *init_logical_size; - } + LayeredTimelineEntry::Unloaded { .. 
} => {} }, None => { debug!("timeline {} not found", &timelineid); @@ -573,7 +545,7 @@ impl LayeredRepository { "timeline {} found on a local disk, but not loaded into the memory, loading", &timelineid ); - let timeline = self.load_local_timeline(timelineid, timelines, logical_size)?; + let timeline = self.load_local_timeline(timelineid, timelines)?; let was_loaded = timelines.insert( timelineid, LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), @@ -590,7 +562,6 @@ impl LayeredRepository { &self, timeline_id: ZTimelineId, timelines: &mut HashMap, - init_logical_size: Option, ) -> anyhow::Result> { let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) .context("failed to load metadata")?; @@ -617,7 +588,6 @@ impl LayeredRepository { self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, - init_logical_size, ); timeline .load_layer_map(disk_consistent_lsn) diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index e862b7def7..bdc74160aa 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -14,7 +14,7 @@ use std::fs::{File, OpenOptions}; use std::io::Write; use std::ops::{Deref, Range}; use std::path::PathBuf; -use std::sync::atomic::{self, AtomicBool}; +use std::sync::atomic::{self, AtomicBool, AtomicIsize, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError}; use std::time::{Duration, SystemTime}; @@ -39,6 +39,7 @@ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::tenant_config::TenantConfOpt; +use crate::DatadirTimeline; use postgres_ffi::xlog_utils::to_pg_timestamp; use utils::{ @@ -49,7 +50,6 @@ use utils::{ use crate::repository::{GcResult, RepositoryTimeline, Timeline, TimelineWriter}; use crate::repository::{Key, Value}; -use crate::tenant_mgr; use crate::thread_mgr; use 
crate::virtual_file::VirtualFile; use crate::walreceiver::IS_WAL_RECEIVER; @@ -122,7 +122,6 @@ pub enum LayeredTimelineEntry { Unloaded { id: ZTimelineId, metadata: TimelineMetadata, - init_logical_size: Option, }, } @@ -269,11 +268,21 @@ pub struct LayeredTimeline { // though lets keep them both for better error visibility. pub initdb_lsn: Lsn, - // Initial logical size of timeline (if known). - // Logical size can be copied from ancestor timeline when new branch is create at last LSN - pub init_logical_size: Option, + /// When did we last calculate the partitioning? + partitioning: Mutex<(KeyPartitioning, Lsn)>, + + /// Configuration: how often should the partitioning be recalculated. + repartition_threshold: u64, + + /// Current logical size of the "datadir", at the last LSN. + current_logical_size: AtomicIsize, } +/// Inherit all the functions from DatadirTimeline, to provide the +/// functionality to store PostgreSQL relations, SLRUs, etc. in a +/// LayeredTimeline. +impl DatadirTimeline for LayeredTimeline {} + /// /// Information about how much history needs to be retained, needed by /// Garbage Collection. 
@@ -472,7 +481,6 @@ impl LayeredTimeline { tenant_id: ZTenantId, walredo_mgr: Arc, upload_layers: bool, - init_logical_size: Option, ) -> LayeredTimeline { let reconstruct_time_histo = RECONSTRUCT_TIME .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) @@ -508,7 +516,7 @@ impl LayeredTimeline { .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) .unwrap(); - LayeredTimeline { + let mut result = LayeredTimeline { conf, tenant_conf, timeline_id, @@ -551,8 +559,13 @@ impl LayeredTimeline { latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), - init_logical_size, - } + + current_logical_size: AtomicIsize::new(0), + partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), + repartition_threshold: 0, + }; + result.repartition_threshold = result.get_checkpoint_distance() / 10; + result } /// @@ -634,6 +647,58 @@ impl LayeredTimeline { Ok(()) } + /// (Re-)calculate the logical size of the database at the latest LSN. + /// + /// This can be a slow operation. + pub fn init_logical_size(&self) -> Result<()> { + // Try a fast-path first: + // Copy logical size from ancestor timeline if there has been no changes on this + // branch, and no changes on the ancestor branch since the branch point. + if self.get_ancestor_lsn() == self.get_last_record_lsn() && self.ancestor_timeline.is_some() + { + let ancestor = self.get_ancestor_timeline()?; + let ancestor_logical_size = ancestor.get_current_logical_size(); + // Check LSN after getting logical size to exclude race condition + // when ancestor timeline is concurrently updated. + // + // Logical size 0 means that it was not initialized, so don't believe that. 
+ if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn { + self.current_logical_size + .store(ancestor_logical_size as isize, AtomicOrdering::SeqCst); + debug!( + "logical size copied from ancestor: {}", + ancestor_logical_size + ); + return Ok(()); + } + } + + // Have to calculate it the hard way + let last_lsn = self.get_last_record_lsn(); + let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?; + self.current_logical_size + .store(logical_size as isize, AtomicOrdering::SeqCst); + debug!("calculated logical size the hard way: {}", logical_size); + Ok(()) + } + + /// Retrieve current logical size of the timeline + /// + /// NOTE: counted incrementally, includes ancestors, + pub fn get_current_logical_size(&self) -> usize { + let current_logical_size = self.current_logical_size.load(AtomicOrdering::Acquire); + match usize::try_from(current_logical_size) { + Ok(sz) => sz, + Err(_) => { + error!( + "current_logical_size is out of range: {}", + current_logical_size + ); + 0 + } + } + } + /// /// Get a handle to a Layer for reading. /// @@ -1003,18 +1068,16 @@ impl LayeredTimeline { // files instead. This is possible as long as *all* the data imported into the // repository have the same LSN. let lsn_range = frozen_layer.get_lsn_range(); - let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn - && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) - { - let pgdir = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)?; - let (partitioning, _lsn) = - pgdir.repartition(self.initdb_lsn, self.get_compaction_target_size())?; - self.create_image_layers(&partitioning, self.initdb_lsn, true)? - } else { - // normal case, write out a L0 delta layer file. 
- let delta_path = self.create_delta_layer(&frozen_layer)?; - HashSet::from([delta_path]) - }; + let layer_paths_to_upload = + if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { + let (partitioning, _lsn) = + self.repartition(self.initdb_lsn, self.get_compaction_target_size())?; + self.create_image_layers(&partitioning, self.initdb_lsn, true)? + } else { + // normal case, write out a L0 delta layer file. + let delta_path = self.create_delta_layer(&frozen_layer)?; + HashSet::from([delta_path]) + }; fail_point!("flush-frozen-before-sync"); @@ -1186,38 +1249,56 @@ impl LayeredTimeline { let target_file_size = self.get_checkpoint_distance(); // Define partitioning schema if needed - if let Ok(pgdir) = - tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) - { - // 2. Create new image layers for partitions that have been modified - // "enough". - let (partitioning, lsn) = pgdir.repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - )?; - let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; - if !layer_paths_to_upload.is_empty() - && self.upload_layers.load(atomic::Ordering::Relaxed) - { - storage_sync::schedule_layer_upload( - self.tenant_id, - self.timeline_id, - HashSet::from_iter(layer_paths_to_upload), - None, - ); - } - // 3. Compact - let timer = self.compact_time_histo.start_timer(); - self.compact_level0(target_file_size)?; - timer.stop_and_record(); - } else { - debug!("Could not compact because no partitioning specified yet"); - } + match self.repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + ) { + Ok((partitioning, lsn)) => { + // 2. Create new image layers for partitions that have been modified + // "enough". 
+ let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + if !layer_paths_to_upload.is_empty() + && self.upload_layers.load(atomic::Ordering::Relaxed) + { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + HashSet::from_iter(layer_paths_to_upload), + None, + ); + } + + // 3. Compact + let timer = self.compact_time_histo.start_timer(); + self.compact_level0(target_file_size)?; + timer.stop_and_record(); + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. + error!("could not compact, repartitioning keyspace failed: {err:?}"); + } + }; Ok(()) } + fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> { + let mut partitioning_guard = self.partitioning.lock().unwrap(); + if partitioning_guard.1 == Lsn(0) + || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold + { + let keyspace = self.collect_keyspace(lsn)?; + let partitioning = keyspace.partition(partition_size); + *partitioning_guard = (partitioning, lsn); + return Ok((partitioning_guard.0.clone(), lsn)); + } + Ok((partitioning_guard.0.clone(), partitioning_guard.1)) + } + // Is it time to create a new image layer for the given partition? fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result { let layers = self.layers.read().unwrap(); @@ -1626,19 +1707,21 @@ impl LayeredTimeline { // Calculate pitr cutoff point. // If we cannot determine a cutoff LSN, be conservative and don't GC anything. 
- let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn(); + let mut pitr_cutoff_lsn: Lsn; + + if pitr != Duration::ZERO { + // conservative, safe default is to remove nothing, when we have no + // commit timestamp data available + pitr_cutoff_lsn = *self.get_latest_gc_cutoff_lsn(); - if let Ok(timeline) = - tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) - { - let now = SystemTime::now(); // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // If we don't have enough data to convert to LSN, // play safe and don't remove any layers. + let now = SystemTime::now(); if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match timeline.find_lsn_for_timestamp(pitr_timestamp)? { + match self.find_lsn_for_timestamp(pitr_timestamp)? { LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, LsnForTimestamp::Future(lsn) => { debug!("future({})", lsn); @@ -1653,9 +1736,10 @@ impl LayeredTimeline { } debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) } - } else if cfg!(test) { - // We don't have local timeline in mocked cargo tests. - // So, just ignore pitr_interval setting in this case. + } else { + // No time-based retention. (Some unit tests depend on garbage-collection + // working even when CLOG data is missing, so that find_lsn_for_timestamp() + // above doesn't work.) 
pitr_cutoff_lsn = gc_info.horizon_cutoff; } gc_info.pitr_cutoff = pitr_cutoff_lsn; @@ -1962,6 +2046,12 @@ impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { fn finish_write(&self, new_lsn: Lsn) { self.tl.finish_write(new_lsn); } + + fn update_current_logical_size(&self, delta: isize) { + self.tl + .current_logical_size + .fetch_add(delta, AtomicOrdering::SeqCst); + } } /// Add a suffix to a layer file's name: .{num}.old diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c9c00d75e2..4ecb181553 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -63,8 +63,7 @@ pub enum CheckpointConfig { } pub type RepositoryImpl = LayeredRepository; - -pub type DatadirTimelineImpl = DatadirTimeline; +pub type TimelineImpl = ::Timeline; pub fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint thread. This prevents new connections from diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 3dba207ab9..c8aa4b35e8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -30,7 +30,6 @@ use utils::{ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; -use crate::layered_repository::LayeredRepository; use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp}; use crate::profiling::profpoint_start; use crate::reltag::RelTag; @@ -555,9 +554,6 @@ impl PageServerHandler { info!("creating new timeline"); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?; - let repartition_distance = repo.get_checkpoint_distance(); - let mut datadir_timeline = - DatadirTimeline::::new(timeline, repartition_distance); // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -573,7 +569,7 @@ impl PageServerHandler { info!("importing basebackup"); pgb.write_message(&BeMessage::CopyInResponse)?; let reader = CopyInReader::new(pgb); - import_basebackup_from_tar(&mut datadir_timeline, reader, base_lsn)?; + import_basebackup_from_tar(&*timeline, reader, base_lsn)?; // TODO check checksum // Meanwhile you can verify client-side by taking fullbackup @@ -583,7 +579,7 @@ impl PageServerHandler { // Flush data to disk, then upload to s3 info!("flushing layers"); - datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?; + timeline.checkpoint(CheckpointConfig::Flush)?; info!("done"); Ok(()) @@ -605,10 +601,6 @@ impl PageServerHandler { let timeline = repo.get_timeline_load(timeline_id)?; ensure!(timeline.get_last_record_lsn() == start_lsn); - let repartition_distance = repo.get_checkpoint_distance(); - let mut datadir_timeline = - DatadirTimeline::::new(timeline, repartition_distance); - // TODO leave clean state on error. For now you can use detach to clean // up broken state from a failed import. @@ -616,16 +608,16 @@ impl PageServerHandler { info!("importing wal"); pgb.write_message(&BeMessage::CopyInResponse)?; let reader = CopyInReader::new(pgb); - import_wal_from_tar(&mut datadir_timeline, reader, start_lsn, end_lsn)?; + import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)?; // TODO Does it make sense to overshoot? - ensure!(datadir_timeline.tline.get_last_record_lsn() >= end_lsn); + ensure!(timeline.get_last_record_lsn() >= end_lsn); // Flush data to disk, then upload to s3. No need for a forced checkpoint. // We only want to persist the data, and it doesn't matter if it's in the // shape of deltas or images. 
info!("flushing layers"); - datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?; + timeline.checkpoint(CheckpointConfig::Flush)?; info!("done"); Ok(()) @@ -643,8 +635,8 @@ impl PageServerHandler { /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. - fn wait_or_get_last_lsn( - timeline: &DatadirTimeline, + fn wait_or_get_last_lsn( + timeline: &T, mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RwLockReadGuard, @@ -671,7 +663,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.tline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn)?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. (Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -681,7 +673,7 @@ impl PageServerHandler { if lsn == Lsn(0) { bail!("invalid LSN(0) in request"); } - timeline.tline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn)?; } ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -691,14 +683,14 @@ impl PageServerHandler { Ok(lsn) } - fn handle_get_rel_exists_request( + fn handle_get_rel_exists_request( &self, - timeline: &DatadirTimeline, + timeline: &T, req: &PagestreamExistsRequest, ) -> Result { let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; let exists = timeline.get_rel_exists(req.rel, lsn)?; @@ -708,13 +700,13 @@ impl PageServerHandler { })) } - fn handle_get_nblocks_request( + fn handle_get_nblocks_request( &self, - timeline: &DatadirTimeline, + timeline: &T, req: &PagestreamNblocksRequest, ) -> Result { let _enter = info_span!("get_nblocks", rel = %req.rel, 
req_lsn = %req.lsn).entered(); - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; let n_blocks = timeline.get_rel_size(req.rel, lsn)?; @@ -724,13 +716,13 @@ impl PageServerHandler { })) } - fn handle_db_size_request( + fn handle_db_size_request( &self, - timeline: &DatadirTimeline, + timeline: &T, req: &PagestreamDbSizeRequest, ) -> Result { let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered(); - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; let total_blocks = @@ -743,14 +735,14 @@ impl PageServerHandler { })) } - fn handle_get_page_at_lsn_request( + fn handle_get_page_at_lsn_request( &self, - timeline: &DatadirTimeline, + timeline: &T, req: &PagestreamGetPageRequest, ) -> Result { let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) .entered(); - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; /* // Add a 1s delay to some requests. 
The delayed causes the requests to @@ -783,7 +775,7 @@ impl PageServerHandler { // check that the timeline exists let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Cannot load local timeline")?; - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) @@ -921,7 +913,7 @@ impl postgres_backend::Handler for PageServerHandler { let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Cannot load local timeline")?; - let end_of_timeline = timeline.tline.get_last_record_rlsn(); + let end_of_timeline = timeline.get_last_record_rlsn(); pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::text_col(b"prev_lsn"), @@ -1139,7 +1131,7 @@ impl postgres_backend::Handler for PageServerHandler { let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Couldn't load timeline")?; - timeline.tline.compact()?; + timeline.compact()?; pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; @@ -1160,7 +1152,7 @@ impl postgres_backend::Handler for PageServerHandler { .context("Cannot load local timeline")?; // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). - timeline.tline.checkpoint(CheckpointConfig::Forced)?; + timeline.checkpoint(CheckpointConfig::Forced)?; pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f703fa16af..61aca8d4ba 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,10 +6,10 @@ //! 
walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! -use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceAccum}; +use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::reltag::{RelTag, SlruKind}; +use crate::repository::Timeline; use crate::repository::*; -use crate::repository::{Repository, Timeline}; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; @@ -18,34 +18,12 @@ use postgres_ffi::{pg_constants, Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::ops::Range; -use std::sync::atomic::{AtomicIsize, Ordering}; -use std::sync::{Arc, Mutex, RwLockReadGuard}; -use tracing::{debug, error, trace, warn}; +use tracing::{debug, trace, warn}; use utils::{bin_ser::BeSer, lsn::Lsn}; /// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type. pub type BlockNumber = u32; -pub struct DatadirTimeline -where - R: Repository, -{ - /// The underlying key-value store. Callers should not read or modify the - /// data in the underlying store directly. However, it is exposed to have - /// access to information like last-LSN, ancestor, and operations like - /// compaction. - pub tline: Arc, - - /// When did we last calculate the partitioning? - partitioning: Mutex<(KeyPartitioning, Lsn)>, - - /// Configuration: how often should the partitioning be recalculated. - repartition_threshold: u64, - - /// Current logical size of the "datadir", at the last LSN. 
- current_logical_size: AtomicIsize, -} - #[derive(Debug)] pub enum LsnForTimestamp { Present(Lsn), @@ -54,34 +32,24 @@ pub enum LsnForTimestamp { NoData(Lsn), } -impl DatadirTimeline { - pub fn new(tline: Arc, repartition_threshold: u64) -> Self { - DatadirTimeline { - tline, - partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), - current_logical_size: AtomicIsize::new(0), - repartition_threshold, - } - } - - /// (Re-)calculate the logical size of the database at the latest LSN. - /// - /// This can be a slow operation. - pub fn init_logical_size(&self) -> Result<()> { - let last_lsn = self.tline.get_last_record_lsn(); - self.current_logical_size.store( - self.get_current_logical_size_non_incremental(last_lsn)? as isize, - Ordering::SeqCst, - ); - Ok(()) - } - - /// Set timeline logical size. - pub fn set_logical_size(&self, size: usize) { - self.current_logical_size - .store(size as isize, Ordering::SeqCst); - } - +/// +/// This trait provides all the functionality to store PostgreSQL relations, SLRUs, +/// and other special kinds of files, in a versioned key-value store. The +/// Timeline trait provides the key-value store. +/// +/// This is a trait, so that we can easily include all these functions in a Timeline +/// implementation. You're not expected to have different implementations of this trait, +/// rather, this provides an interface and implementation, over Timeline. +/// +/// If you wanted to store other kinds of data in the Neon repository, e.g. +/// flat files or MySQL, you would create a new trait like this, with all the +/// functions that make sense for the kind of data you're storing. For flat files, +/// for example, you might have a function like "fn read(path, offset, size)". +/// We might also have that situation in the future, to support multiple PostgreSQL +/// versions, if there are big changes in how the data is organized in the data +/// directory, or if new special files are introduced. 
+/// +pub trait DatadirTimeline: Timeline { /// Start ingesting a WAL record, or other atomic modification of /// the timeline. /// @@ -102,7 +70,10 @@ impl DatadirTimeline { /// functions of the timeline until you finish! And if you update the /// same page twice, the last update wins. /// - pub fn begin_modification(&self) -> DatadirModification { + fn begin_modification(&self) -> DatadirModification + where + Self: Sized, + { DatadirModification { tline: self, pending_updates: HashMap::new(), @@ -116,7 +87,7 @@ impl DatadirTimeline { //------------------------------------------------------------------------------ /// Look up given page version. - pub fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { + fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); let nblocks = self.get_rel_size(tag, lsn)?; @@ -129,11 +100,11 @@ impl DatadirTimeline { } let key = rel_block_to_key(tag, blknum); - self.tline.get(key, lsn) + self.get(key, lsn) } // Get size of a database in blocks - pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { let mut total_blocks = 0; let rels = self.list_rels(spcnode, dbnode, lsn)?; @@ -146,7 +117,7 @@ impl DatadirTimeline { } /// Get size of a relation file - pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { + fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); if (tag.forknum == pg_constants::FSM_FORKNUM @@ -161,17 +132,17 @@ impl DatadirTimeline { } let key = rel_size_to_key(tag); - let mut buf = self.tline.get(key, lsn)?; + let mut buf = self.get(key, lsn)?; Ok(buf.get_u32_le()) } /// Does relation exist? 
- pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { + fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; let dir = RelDirectory::des(&buf)?; let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); @@ -180,10 +151,10 @@ impl DatadirTimeline { } /// Get a list of all existing relations in given tablespace and database. - pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { + fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; let dir = RelDirectory::des(&buf)?; let rels: HashSet = @@ -198,7 +169,7 @@ impl DatadirTimeline { } /// Look up given SLRU page version. - pub fn get_slru_page_at_lsn( + fn get_slru_page_at_lsn( &self, kind: SlruKind, segno: u32, @@ -206,26 +177,21 @@ impl DatadirTimeline { lsn: Lsn, ) -> Result { let key = slru_block_to_key(kind, segno, blknum); - self.tline.get(key, lsn) + self.get(key, lsn) } /// Get size of an SLRU segment - pub fn get_slru_segment_size( - &self, - kind: SlruKind, - segno: u32, - lsn: Lsn, - ) -> Result { + fn get_slru_segment_size(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.tline.get(key, lsn)?; + let mut buf = self.get(key, lsn)?; Ok(buf.get_u32_le()) } /// Get size of an SLRU segment - pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { + fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; let dir = SlruSegmentDirectory::des(&buf)?; let 
exists = dir.segments.get(&segno).is_some(); @@ -239,10 +205,10 @@ impl DatadirTimeline { /// so it's not well defined which LSN you get if there were multiple commits /// "in flight" at that point in time. /// - pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { - let gc_cutoff_lsn_guard = self.tline.get_latest_gc_cutoff_lsn(); + fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { + let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; - let max_lsn = self.tline.get_last_record_lsn(); + let max_lsn = self.get_last_record_lsn(); // LSNs are always 8-byte aligned. low/mid/high represent the // LSN divided by 8. @@ -333,88 +299,51 @@ impl DatadirTimeline { } /// Get a list of SLRU segments - pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { + fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; let dir = SlruSegmentDirectory::des(&buf)?; Ok(dir.segments) } - pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { let key = relmap_file_key(spcnode, dbnode); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; Ok(buf) } - pub fn list_dbdirs(&self, lsn: Lsn) -> Result> { + fn list_dbdirs(&self, lsn: Lsn) -> Result> { // fetch directory entry - let buf = self.tline.get(DBDIR_KEY, lsn)?; + let buf = self.get(DBDIR_KEY, lsn)?; let dir = DbDirectory::des(&buf)?; Ok(dir.dbdirs) } - pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { + fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { let key = twophase_file_key(xid); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; Ok(buf) } - pub fn list_twophase_files(&self, lsn: Lsn) -> Result> { + fn 
list_twophase_files(&self, lsn: Lsn) -> Result> { // fetch directory entry - let buf = self.tline.get(TWOPHASEDIR_KEY, lsn)?; + let buf = self.get(TWOPHASEDIR_KEY, lsn)?; let dir = TwoPhaseDirectory::des(&buf)?; Ok(dir.xids) } - pub fn get_control_file(&self, lsn: Lsn) -> Result { - self.tline.get(CONTROLFILE_KEY, lsn) + fn get_control_file(&self, lsn: Lsn) -> Result { + self.get(CONTROLFILE_KEY, lsn) } - pub fn get_checkpoint(&self, lsn: Lsn) -> Result { - self.tline.get(CHECKPOINT_KEY, lsn) - } - - /// Get the LSN of the last ingested WAL record. - /// - /// This is just a convenience wrapper that calls through to the underlying - /// repository. - pub fn get_last_record_lsn(&self) -> Lsn { - self.tline.get_last_record_lsn() - } - - /// Check that it is valid to request operations with that lsn. - /// - /// This is just a convenience wrapper that calls through to the underlying - /// repository. - pub fn check_lsn_is_in_scope( - &self, - lsn: Lsn, - latest_gc_cutoff_lsn: &RwLockReadGuard, - ) -> Result<()> { - self.tline.check_lsn_is_in_scope(lsn, latest_gc_cutoff_lsn) - } - - /// Retrieve current logical size of the timeline - /// - /// NOTE: counted incrementally, includes ancestors, - pub fn get_current_logical_size(&self) -> usize { - let current_logical_size = self.current_logical_size.load(Ordering::Acquire); - match usize::try_from(current_logical_size) { - Ok(sz) => sz, - Err(_) => { - error!( - "current_logical_size is out of range: {}", - current_logical_size - ); - 0 - } - } + fn get_checkpoint(&self, lsn: Lsn) -> Result { + self.get(CHECKPOINT_KEY, lsn) } /// Does the same as get_current_logical_size but counted on demand. @@ -422,16 +351,16 @@ impl DatadirTimeline { /// /// Only relation blocks are counted currently. That excludes metadata, /// SLRUs, twophase files etc. 
- pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { + fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { // Fetch list of database dirs and iterate them - let buf = self.tline.get(DBDIR_KEY, lsn)?; + let buf = self.get(DBDIR_KEY, lsn)?; let dbdir = DbDirectory::des(&buf)?; let mut total_size: usize = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self.list_rels(*spcnode, *dbnode, lsn)? { let relsize_key = rel_size_to_key(rel); - let mut buf = self.tline.get(relsize_key, lsn)?; + let mut buf = self.get(relsize_key, lsn)?; let relsize = buf.get_u32_le(); total_size += relsize as usize; @@ -452,7 +381,7 @@ impl DatadirTimeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.tline.get(DBDIR_KEY, lsn)?; + let buf = self.get(DBDIR_KEY, lsn)?; let dbdir = DbDirectory::des(&buf)?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); @@ -469,7 +398,7 @@ impl DatadirTimeline { rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); - let mut buf = self.tline.get(relsize_key, lsn)?; + let mut buf = self.get(relsize_key, lsn)?; let relsize = buf.get_u32_le(); result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); @@ -485,13 +414,13 @@ impl DatadirTimeline { ] { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); - let buf = self.tline.get(slrudir_key, lsn)?; + let buf = self.get(slrudir_key, lsn)?; let dir = SlruSegmentDirectory::des(&buf)?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.tline.get(segsize_key, lsn)?; + let mut buf = self.get(segsize_key, lsn)?; let segsize = buf.get_u32_le(); result.add_range( @@ -503,7 +432,7 @@ impl DatadirTimeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.tline.get(TWOPHASEDIR_KEY, 
lsn)?; + let buf = self.get(TWOPHASEDIR_KEY, lsn)?; let twophase_dir = TwoPhaseDirectory::des(&buf)?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); @@ -516,30 +445,17 @@ impl DatadirTimeline { Ok(result.to_keyspace()) } - - pub fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> { - let mut partitioning_guard = self.partitioning.lock().unwrap(); - if partitioning_guard.1 == Lsn(0) - || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold - { - let keyspace = self.collect_keyspace(lsn)?; - let partitioning = keyspace.partition(partition_size); - *partitioning_guard = (partitioning, lsn); - return Ok((partitioning_guard.0.clone(), lsn)); - } - Ok((partitioning_guard.0.clone(), partitioning_guard.1)) - } } /// DatadirModification represents an operation to ingest an atomic set of /// updates to the repository. It is created by the 'begin_record' /// function. It is called for each WAL record, so that all the modifications /// by a one WAL record appear atomic. -pub struct DatadirModification<'a, R: Repository> { +pub struct DatadirModification<'a, T: DatadirTimeline> { /// The timeline this modification applies to. You can access this to /// read the state, but note that any pending updates are *not* reflected /// in the state in 'tline' yet. - pub tline: &'a DatadirTimeline, + pub tline: &'a T, // The modifications are not applied directly to the underlying key-value store. // The put-functions add the modifications here, and they are flushed to the @@ -549,7 +465,7 @@ pub struct DatadirModification<'a, R: Repository> { pending_nblocks: isize, } -impl<'a, R: Repository> DatadirModification<'a, R> { +impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { /// Initialize a completely new repository. 
/// /// This inserts the directory metadata entries that are assumed to @@ -934,7 +850,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { return Ok(()); } - let writer = self.tline.tline.writer(); + let writer = self.tline.writer(); // Flush relation and SLRU data blocks, keep metadata. let mut result: Result<()> = Ok(()); @@ -949,10 +865,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { result?; if pending_nblocks != 0 { - self.tline.current_logical_size.fetch_add( - pending_nblocks * pg_constants::BLCKSZ as isize, - Ordering::SeqCst, - ); + writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize); self.pending_nblocks = 0; } @@ -965,7 +878,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { /// All the modifications in this atomic update are stamped by the specified LSN. /// pub fn commit(&mut self, lsn: Lsn) -> Result<()> { - let writer = self.tline.tline.writer(); + let writer = self.tline.writer(); let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; @@ -980,10 +893,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { writer.finish_write(lsn); if pending_nblocks != 0 { - self.tline.current_logical_size.fetch_add( - pending_nblocks * pg_constants::BLCKSZ as isize, - Ordering::SeqCst, - ); + writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize); } Ok(()) @@ -1010,7 +920,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { } } else { let last_lsn = self.tline.get_last_record_lsn(); - self.tline.tline.get(key, last_lsn) + self.tline.get(key, last_lsn) } } @@ -1412,13 +1322,12 @@ fn is_slru_block_key(key: Key) -> bool { pub fn create_test_timeline( repo: R, timeline_id: utils::zid::ZTimelineId, -) -> Result>> { +) -> Result> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; - let tline = DatadirTimeline::new(tline, 256 * 1024); let mut m = tline.begin_modification(); m.init_empty()?; m.commit(Lsn(8))?; - Ok(Arc::new(tline)) + Ok(tline) } 
#[allow(clippy::bool_assert_comparison)] @@ -1491,7 +1400,7 @@ mod tests { .contains(&TESTREL_A)); // Run checkpoint and garbage collection and check that it's still not visible - newtline.tline.checkpoint(CheckpointConfig::Forced)?; + newtline.checkpoint(CheckpointConfig::Forced)?; repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?; assert!(!newtline diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 359c704e81..61058a7806 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -185,7 +185,7 @@ impl Value { /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. pub trait Repository: Send + Sync { - type Timeline: Timeline; + type Timeline: crate::DatadirTimeline; /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. /// See [`crate::remote_storage`] for more details about the synchronization. @@ -405,6 +405,8 @@ pub trait TimelineWriter<'a> { /// the 'lsn' or anything older. The previous last record LSN is stored alongside /// the latest and can be read. 
fn finish_write(&self, lsn: Lsn); + + fn update_current_logical_size(&self, delta: isize); } #[cfg(test)] diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index a485e7c2cb..640dfa623a 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,7 +3,6 @@ use crate::config::PageServerConf; use crate::layered_repository::{load_metadata, LayeredRepository}; -use crate::pgdatadir_mapping::DatadirTimeline; use crate::repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; @@ -12,7 +11,7 @@ use crate::thread_mgr::ThreadKind; use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; use crate::{thread_mgr, timelines, walreceiver}; -use crate::{DatadirTimelineImpl, RepositoryImpl}; +use crate::{RepositoryImpl, TimelineImpl}; use anyhow::Context; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; @@ -101,7 +100,7 @@ struct Tenant { /// /// Local timelines have more metadata that's loaded into memory, /// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`]. 
- local_timelines: HashMap>, + local_timelines: HashMap::Timeline>>, } #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -178,7 +177,7 @@ pub enum LocalTimelineUpdate { }, Attach { id: ZTenantTimelineId, - datadir: Arc, + datadir: Arc<::Timeline>, }, } @@ -382,7 +381,7 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result> { +) -> anyhow::Result> { let mut m = tenants_state::write_tenants(); let tenant = m .get_mut(&tenant_id) @@ -489,27 +488,18 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any fn load_local_timeline( repo: &RepositoryImpl, timeline_id: ZTimelineId, -) -> anyhow::Result>> { +) -> anyhow::Result> { let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| { format!("Inmem timeline {timeline_id} not found in tenant's repository") })?; - let repartition_distance = repo.get_checkpoint_distance() / 10; - let init_logical_size = inmem_timeline.init_logical_size; - let page_tline = Arc::new(DatadirTimelineImpl::new( - inmem_timeline, - repartition_distance, - )); - if let Some(logical_size) = init_logical_size { - page_tline.set_logical_size(logical_size); - } else { - page_tline.init_logical_size()?; - } + inmem_timeline.init_logical_size()?; + tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach { id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id), - datadir: Arc::clone(&page_tline), + datadir: Arc::clone(&inmem_timeline), }); - Ok(page_tline) + Ok(inmem_timeline) } #[serde_as] diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index a40e705cb9..984276bad2 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -26,7 +26,7 @@ use crate::{ repository::{LocalTimelineState, Repository}, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, - DatadirTimeline, RepositoryImpl, + DatadirTimeline, RepositoryImpl, TimelineImpl, }; use crate::{import_datadir, LOG_FILE_NAME}; use 
crate::{layered_repository::LayeredRepository, walredo::WalRedoManager}; @@ -54,27 +54,27 @@ pub struct LocalTimelineInfo { } impl LocalTimelineInfo { - pub fn from_loaded_timeline( - datadir_tline: &DatadirTimeline, + pub fn from_loaded_timeline( + timeline: &TimelineImpl, include_non_incremental_logical_size: bool, ) -> anyhow::Result { - let last_record_lsn = datadir_tline.tline.get_last_record_lsn(); + let last_record_lsn = timeline.get_last_record_lsn(); let info = LocalTimelineInfo { - ancestor_timeline_id: datadir_tline.tline.get_ancestor_timeline_id(), + ancestor_timeline_id: timeline.get_ancestor_timeline_id(), ancestor_lsn: { - match datadir_tline.tline.get_ancestor_lsn() { + match timeline.get_ancestor_lsn() { Lsn(0) => None, lsn @ Lsn(_) => Some(lsn), } }, - disk_consistent_lsn: datadir_tline.tline.get_disk_consistent_lsn(), + disk_consistent_lsn: timeline.get_disk_consistent_lsn(), last_record_lsn, - prev_record_lsn: Some(datadir_tline.tline.get_prev_record_lsn()), - latest_gc_cutoff_lsn: *datadir_tline.tline.get_latest_gc_cutoff_lsn(), + prev_record_lsn: Some(timeline.get_prev_record_lsn()), + latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Loaded, - current_logical_size: Some(datadir_tline.get_current_logical_size()), + current_logical_size: Some(timeline.get_current_logical_size()), current_logical_size_non_incremental: if include_non_incremental_logical_size { - Some(datadir_tline.get_current_logical_size_non_incremental(last_record_lsn)?) + Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?) 
} else { None }, @@ -109,9 +109,8 @@ impl LocalTimelineInfo { ) -> anyhow::Result { match repo_timeline { RepositoryTimeline::Loaded(_) => { - let datadir_tline = - tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)?; - Self::from_loaded_timeline(&datadir_tline, include_non_incremental_logical_size) + let timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)?; + Self::from_loaded_timeline(&*timeline, include_non_incremental_logical_size) } RepositoryTimeline::Unloaded { metadata } => Ok(Self::from_unloaded_timeline(metadata)), } @@ -298,19 +297,18 @@ fn bootstrap_timeline( // Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. let timeline = repo.create_empty_timeline(tli, lsn)?; - let mut page_tline: DatadirTimeline = DatadirTimeline::new(timeline, u64::MAX); - import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?; + import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; fail::fail_point!("before-checkpoint-new-timeline", |_| { bail!("failpoint before-checkpoint-new-timeline"); }); - page_tline.tline.checkpoint(CheckpointConfig::Forced)?; + timeline.checkpoint(CheckpointConfig::Forced)?; info!( "created root timeline {} timeline.lsn {}", tli, - page_tline.tline.get_last_record_lsn() + timeline.get_last_record_lsn() ); // Remove temp dir. We don't need it anymore @@ -389,7 +387,7 @@ pub(crate) fn create_timeline( // load the timeline into memory let loaded_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - LocalTimelineInfo::from_loaded_timeline(&loaded_timeline, false) + LocalTimelineInfo::from_loaded_timeline(&*loaded_timeline, false) .context("cannot fill timeline info")? 
} None => { @@ -397,7 +395,7 @@ pub(crate) fn create_timeline( // load the timeline into memory let new_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - LocalTimelineInfo::from_loaded_timeline(&new_timeline, false) + LocalTimelineInfo::from_loaded_timeline(&*new_timeline, false) .context("cannot fill timeline info")? } }; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index adc24328ae..8dd14ec177 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -34,7 +34,6 @@ use std::collections::HashMap; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; -use crate::repository::Repository; use crate::walrecord::*; use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::xlog_utils::*; @@ -44,8 +43,8 @@ use utils::lsn::Lsn; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); -pub struct WalIngest<'a, R: Repository> { - timeline: &'a DatadirTimeline, +pub struct WalIngest<'a, T: DatadirTimeline> { + timeline: &'a T, checkpoint: CheckPoint, checkpoint_modified: bool, @@ -53,8 +52,8 @@ pub struct WalIngest<'a, R: Repository> { relsize_cache: HashMap, } -impl<'a, R: Repository> WalIngest<'a, R> { - pub fn new(timeline: &DatadirTimeline, startpoint: Lsn) -> Result> { +impl<'a, T: DatadirTimeline> WalIngest<'a, T> { + pub fn new(timeline: &T, startpoint: Lsn) -> Result> { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
let checkpoint_bytes = timeline.get_checkpoint(startpoint)?; @@ -80,7 +79,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { &mut self, recdata: Bytes, lsn: Lsn, - modification: &mut DatadirModification, + modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, ) -> Result<()> { decode_wal_record(recdata, decoded).context("failed decoding wal record")?; @@ -268,7 +267,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_decoded_block( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, @@ -328,7 +327,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_heapam_record( &mut self, buf: &mut Bytes, - modification: &mut DatadirModification, + modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, ) -> Result<()> { // Handle VM bit updates that are implicitly part of heap records. @@ -472,7 +471,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. fn ingest_xlog_dbase_create( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlCreateDatabase, ) -> Result<()> { let db_id = rec.db_id; @@ -539,7 +538,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_xlog_smgr_create( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlSmgrCreate, ) -> Result<()> { let rel = RelTag { @@ -557,7 +556,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { /// This is the same logic as in PostgreSQL's smgr_redo() function. 
fn ingest_xlog_smgr_truncate( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlSmgrTruncate, ) -> Result<()> { let spcnode = rec.rnode.spcnode; @@ -622,7 +621,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { /// fn ingest_xact_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, parsed: &XlXactParsedRecord, is_commit: bool, ) -> Result<()> { @@ -691,7 +690,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_clog_truncate_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlClogTruncate, ) -> Result<()> { info!( @@ -749,7 +748,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_multixact_create_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlMultiXactCreate, ) -> Result<()> { // Create WAL record for updating the multixact-offsets page @@ -828,7 +827,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_multixact_truncate_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlMultiXactTruncate, ) -> Result<()> { self.checkpoint.oldestMulti = xlrec.end_trunc_off; @@ -862,7 +861,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_relmap_page( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlRelmapUpdate, decoded: &DecodedWALRecord, ) -> Result<()> { @@ -878,7 +877,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_creation( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, ) -> Result<()> { self.relsize_cache.insert(rel, 0); @@ -888,7 +887,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, img: 
Bytes, @@ -900,7 +899,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_wal_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, rec: ZenithWalRecord, @@ -912,7 +911,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_truncation( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, nblocks: BlockNumber, ) -> Result<()> { @@ -923,7 +922,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_drop( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, ) -> Result<()> { modification.put_rel_drop(rel)?; @@ -948,7 +947,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn handle_rel_extend( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, ) -> Result<()> { @@ -986,7 +985,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_slru_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, kind: SlruKind, segno: u32, blknum: BlockNumber, @@ -999,7 +998,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn handle_slru_extend( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, kind: SlruKind, segno: u32, blknum: BlockNumber, @@ -1052,6 +1051,7 @@ mod tests { use super::*; use crate::pgdatadir_mapping::create_test_timeline; use crate::repository::repo_harness::*; + use crate::repository::Timeline; use postgres_ffi::pg_constants; /// Arbitrary relation tag, for testing. 
@@ -1062,13 +1062,13 @@ mod tests { forknum: 0, }; - fn assert_current_logical_size(_timeline: &DatadirTimeline, _lsn: Lsn) { + fn assert_current_logical_size(_timeline: &T, _lsn: Lsn) { // TODO } static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - fn init_walingest_test(tline: &DatadirTimeline) -> Result> { + fn init_walingest_test(tline: &T) -> Result> { let mut m = tline.begin_modification(); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file @@ -1082,7 +1082,7 @@ mod tests { fn test_relsize() -> Result<()> { let repo = RepoHarness::create("test_relsize")?.load(); let tline = create_test_timeline(repo, TIMELINE_ID)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(); walingest.put_rel_creation(&mut m, TESTREL_A)?; @@ -1098,7 +1098,7 @@ mod tests { walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?; m.commit(Lsn(0x50))?; - assert_current_logical_size(&tline, Lsn(0x50)); + assert_current_logical_size(&*tline, Lsn(0x50)); // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); @@ -1145,7 +1145,7 @@ mod tests { let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?; m.commit(Lsn(0x60))?; - assert_current_logical_size(&tline, Lsn(0x60)); + assert_current_logical_size(&*tline, Lsn(0x60)); // Check reported size and contents after truncation assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 2); @@ -1210,7 +1210,7 @@ mod tests { fn test_drop_extend() -> Result<()> { let repo = RepoHarness::create("test_drop_extend")?.load(); let tline = create_test_timeline(repo, TIMELINE_ID)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; @@ -1250,7 +1250,7 @@ mod tests { fn test_truncate_extend() -> Result<()> { let repo = RepoHarness::create("test_truncate_extend")?.load(); let tline = create_test_timeline(repo, TIMELINE_ID)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&*tline)?; // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; @@ -1338,7 +1338,7 @@ mod tests { fn test_large_rel() -> Result<()> { let repo = RepoHarness::create("test_large_rel")?.load(); let tline = create_test_timeline(repo, TIMELINE_ID)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; for blknum in 0..pg_constants::RELSEG_SIZE + 1 { @@ -1349,7 +1349,7 @@ mod tests { m.commit(Lsn(lsn))?; } - assert_current_logical_size(&tline, Lsn(lsn)); + assert_current_logical_size(&*tline, Lsn(lsn)); assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, @@ -1365,7 +1365,7 @@ mod tests { tline.get_rel_size(TESTREL_A, Lsn(lsn))?, pg_constants::RELSEG_SIZE ); - assert_current_logical_size(&tline, Lsn(lsn)); + assert_current_logical_size(&*tline, Lsn(lsn)); // 
Truncate another block lsn += 0x10; @@ -1376,7 +1376,7 @@ mod tests { tline.get_rel_size(TESTREL_A, Lsn(lsn))?, pg_constants::RELSEG_SIZE - 1 ); - assert_current_logical_size(&tline, Lsn(lsn)); + assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate to 1500, and then truncate all the way down to 0, one block at a time // This tests the behavior at segment boundaries @@ -1393,7 +1393,7 @@ mod tests { size -= 1; } - assert_current_logical_size(&tline, Lsn(lsn)); + assert_current_logical_size(&*tline, Lsn(lsn)); Ok(()) } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 614bca50ad..f2aa7ce2cf 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -25,7 +25,8 @@ use etcd_broker::{ use tokio::select; use tracing::*; -use crate::DatadirTimelineImpl; +use crate::repository::{Repository, Timeline}; +use crate::{RepositoryImpl, TimelineImpl}; use utils::{ lsn::Lsn, pq_proto::ReplicationFeedback, @@ -39,7 +40,7 @@ pub(super) fn spawn_connection_manager_task( id: ZTenantTimelineId, broker_loop_prefix: String, mut client: Client, - local_timeline: Arc, + local_timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, @@ -245,7 +246,7 @@ async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) { struct WalreceiverState { id: ZTenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. - local_timeline: Arc, + local_timeline: Arc, /// The timeout on the connection to safekeeper for WAL streaming. wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. 
@@ -283,7 +284,7 @@ struct EtcdSkTimeline { impl WalreceiverState { fn new( id: ZTenantTimelineId, - local_timeline: Arc, + local_timeline: Arc<::Timeline>, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, @@ -1203,13 +1204,10 @@ mod tests { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, - local_timeline: Arc::new(DatadirTimelineImpl::new( - harness - .load() - .create_empty_timeline(TIMELINE_ID, Lsn(0)) - .expect("Failed to create an empty timeline for dummy wal connection manager"), - 10_000, - )), + local_timeline: harness + .load() + .create_empty_timeline(TIMELINE_ID, Lsn(0)) + .expect("Failed to create an empty timeline for dummy wal connection manager"), wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1).unwrap(), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index cc1a9cc5eb..ca29c00771 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -20,6 +20,7 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::{ http::models::WalReceiverEntry, + pgdatadir_mapping::DatadirTimeline, repository::{Repository, Timeline}, tenant_mgr, walingest::WalIngest, @@ -177,7 +178,7 @@ pub async fn handle_walreceiver_connection( caught_up = true; } - let timeline_to_check = Arc::clone(&timeline.tline); + let timeline_to_check = Arc::clone(&timeline); tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) .await .with_context(|| { @@ -225,7 +226,7 @@ pub async fn handle_walreceiver_connection( // The last LSN we processed. It is not guaranteed to survive pageserver crash. 
let write_lsn = u64::from(last_lsn); // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn()); + let flush_lsn = u64::from(timeline.get_disk_consistent_lsn()); // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. let apply_lsn = u64::from(timeline_remote_consistent_lsn); From fd46e52e00dfdaa1e63c40cf0e8836d9f10470cb Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 27 Jul 2022 12:28:05 +0300 Subject: [PATCH 25/29] Switch staging storage to dedicated etcd (#2164) --- .github/ansible/staging.hosts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/staging.hosts b/.github/ansible/staging.hosts index 35e77513df..2bb28f1972 100644 --- a/.github/ansible/staging.hosts +++ b/.github/ansible/staging.hosts @@ -17,4 +17,4 @@ env_name = us-stage console_mgmt_base_url = http://console-staging.local bucket_name = zenith-staging-storage-us-east-1 bucket_region = us-east-1 -etcd_endpoints = etcd-staging.local:2379 +etcd_endpoints = zenith-us-stage-etcd.local:2379 From f6f29f58cd8178ac452b276ececa692e79dda85e Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 27 Jul 2022 16:41:25 +0300 Subject: [PATCH 26/29] Switch production storage to dedicated etcd (#2169) --- .github/ansible/production.hosts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/production.hosts b/.github/ansible/production.hosts index d22ce0e37e..364e8ed50e 100644 --- a/.github/ansible/production.hosts +++ b/.github/ansible/production.hosts @@ -17,4 +17,4 @@ env_name = prod-1 console_mgmt_base_url = http://console-release.local bucket_name = zenith-storage-oregon bucket_region = us-west-2 -etcd_endpoints = etcd-release.local:2379 +etcd_endpoints = zenith-1-etcd.local:2379 From 
6a664629fa4834a8c9c1a00d3c729d924f19ad45 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Wed, 27 Jul 2022 12:36:46 -0400 Subject: [PATCH 27/29] Add timeline physical size tracking (#2126) Ref #1902. - Track the layered timeline's `physical_size` using `pageserver_current_physical_size` metric when updating the layer map. - Report the local timeline's `physical_size` in timeline GET APIs. - Add `include-non-incremental-physical-size` URL flag to also report the local timeline's `physical_size_non_incremental` (similar to `logical_size_non_incremental`) - Add a `UIntGaugeVec` and `UIntGauge` to represent `u64` prometheus metrics Co-authored-by: Dmitry Rodionov --- libs/metrics/src/lib.rs | 14 ++ pageserver/src/http/openapi_spec.yml | 14 ++ pageserver/src/http/routes.rs | 25 ++-- pageserver/src/layered_repository/timeline.rs | 80 +++++++++-- pageserver/src/repository.rs | 5 + pageserver/src/timelines.rs | 24 +++- .../batch_others/test_timeline_size.py | 131 ++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 8 +- test_runner/fixtures/utils.py | 37 ++++- 9 files changed, 315 insertions(+), 23 deletions(-) diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 3b5da9f7ff..ea24b3fe7e 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -3,6 +3,9 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. use lazy_static::lazy_static; +use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec}; +pub use prometheus::opts; +pub use prometheus::register; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_gauge, Gauge}; @@ -18,6 +21,17 @@ pub use prometheus::{Encoder, TextEncoder}; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; +pub type UIntGauge = GenericGauge; +pub type UIntGaugeVec = GenericGaugeVec; + +#[macro_export] +macro_rules! 
register_uint_gauge_vec { + ($NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ + let gauge_vec = UIntGaugeVec::new($crate::opts!($NAME, $HELP), $LABELS_NAMES).unwrap(); + $crate::register(Box::new(gauge_vec.clone())).map(|_| gauge_vec) + }}; +} + /// Gathers all Prometheus metrics and records the I/O stats just before that. /// /// Metrics gathering is a relatively simple and standalone operation, so diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 2775a27e0f..46305a4855 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -78,6 +78,11 @@ paths: schema: type: string description: Controls calculation of current_logical_size_non_incremental + - name: include-non-incremental-physical-size + in: query + schema: + type: string + description: Controls calculation of current_physical_size_non_incremental get: description: Get timelines for tenant responses: @@ -136,6 +141,11 @@ paths: schema: type: string description: Controls calculation of current_logical_size_non_incremental + - name: include-non-incremental-physical-size + in: query + schema: + type: string + description: Controls calculation of current_physical_size_non_incremental responses: "200": description: TimelineInfo @@ -671,8 +681,12 @@ components: format: hex current_logical_size: type: integer + current_physical_size: + type: integer current_logical_size_non_incremental: type: integer + current_physical_size_non_incremental: + type: integer WalReceiverEntry: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 236415cf58..8ac3faca7a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -113,10 +113,17 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let 
include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); + let include_non_incremental_logical_size = + query_param_present(&request, "include-non-incremental-logical-size"); + let include_non_incremental_physical_size = + query_param_present(&request, "include-non-incremental-physical-size"); let local_timeline_infos = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); - crate::timelines::get_local_timelines(tenant_id, include_non_incremental_logical_size) + crate::timelines::get_local_timelines( + tenant_id, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) }) .await .map_err(ApiError::from_err)??; @@ -145,17 +152,15 @@ async fn timeline_list_handler(request: Request) -> Result, json_response(StatusCode::OK, response_data) } -// Gate non incremental logical size calculation behind a flag -// after pgbench -i -s100 calculation took 28ms so if multiplied by the number of timelines -// and tenants it can take noticeable amount of time. Also the value currently used only in tests -fn get_include_non_incremental_logical_size(request: &Request) -> bool { +/// Checks if a query param is present in the request's URL +fn query_param_present(request: &Request, param: &str) -> bool { request .uri() .query() .map(|v| { url::form_urlencoded::parse(v.as_bytes()) .into_owned() - .any(|(param, _)| param == "include-non-incremental-logical-size") + .any(|(p, _)| p == param) }) .unwrap_or(false) } @@ -165,7 +170,10 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result u64 { + self.current_physical_size_gauge.get() + } + + fn get_physical_size_non_incremental(&self) -> anyhow::Result { + let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + // total size of layer files in the current timeline directory + let mut total_physical_size = 0; + + for direntry in fs::read_dir(timeline_path)? 
{ + let direntry = direntry?; + let fname = direntry.file_name(); + let fname = fname.to_string_lossy(); + + if ImageFileName::parse_str(&fname).is_some() + || DeltaFileName::parse_str(&fname).is_some() + { + total_physical_size += direntry.metadata()?.len(); + } + } + + Ok(total_physical_size) + } } impl LayeredTimeline { @@ -515,6 +553,9 @@ impl LayeredTimeline { let wait_lsn_time_histo = WAIT_LSN_TIME .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) .unwrap(); + let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) + .unwrap(); let mut result = LayeredTimeline { conf, @@ -544,6 +585,7 @@ impl LayeredTimeline { create_images_time_histo, last_record_gauge, wait_lsn_time_histo, + current_physical_size_gauge, upload_layers: AtomicBool::new(upload_layers), @@ -579,6 +621,8 @@ impl LayeredTimeline { // Scan timeline directory and create ImageFileName and DeltaFilename // structs representing all files on disk let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + // total size of layer files in the current timeline directory + let mut total_physical_size = 0; for direntry in fs::read_dir(timeline_path)? 
{ let direntry = direntry?; @@ -601,6 +645,7 @@ impl LayeredTimeline { ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); trace!("found layer {}", layer.filename().display()); + total_physical_size += layer.path().metadata()?.len(); layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { @@ -624,6 +669,7 @@ impl LayeredTimeline { DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); trace!("found layer {}", layer.filename().display()); + total_physical_size += layer.path().metadata()?.len(); layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { @@ -640,9 +686,10 @@ impl LayeredTimeline { layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); info!( - "loaded layer map with {} layers at {}", - num_layers, disk_consistent_lsn + "loaded layer map with {} layers at {}, total physical size: {}", + num_layers, disk_consistent_lsn, total_physical_size ); + self.current_physical_size_gauge.set(total_physical_size); Ok(()) } @@ -1203,8 +1250,12 @@ impl LayeredTimeline { layers.insert_historic(Arc::new(new_delta)); } + // update the timeline's physical size + let sz = new_delta_path.metadata()?.len(); + self.current_physical_size_gauge.add(sz); + // update metrics NUM_PERSISTENT_FILES_CREATED.inc_by(1); - PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); + PERSISTENT_BYTES_WRITTEN.inc_by(sz); Ok(new_delta_path) } @@ -1390,6 +1441,8 @@ impl LayeredTimeline { let mut layers = self.layers.write().unwrap(); for l in image_layers { + self.current_physical_size_gauge + .add(l.path().metadata()?.len()); layers.insert_historic(Arc::new(l)); } drop(layers); @@ -1635,19 +1688,27 @@ impl LayeredTimeline { let mut layers = self.layers.write().unwrap(); let mut new_layer_paths = HashSet::with_capacity(new_layers.len()); for l in new_layers { - 
new_layer_paths.insert(l.path()); + let new_delta_path = l.path(); + + // update the timeline's physical size + self.current_physical_size_gauge + .add(new_delta_path.metadata()?.len()); + + new_layer_paths.insert(new_delta_path); layers.insert_historic(Arc::new(l)); } // Now that we have reshuffled the data to set of new delta layers, we can // delete the old ones let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len()); - for l in &deltas_to_compact { - l.delete()?; + drop(all_keys_iter); + for l in deltas_to_compact { if let Some(path) = l.local_path() { + self.current_physical_size_gauge.sub(path.metadata()?.len()); layer_paths_do_delete.insert(path); } - layers.remove_historic(l.clone()); + l.delete()?; + layers.remove_historic(l); } drop(layers); @@ -1899,10 +1960,11 @@ impl LayeredTimeline { // while iterating it. BTreeMap::retain() would be another option) let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); for doomed_layer in layers_to_remove { - doomed_layer.delete()?; if let Some(path) = doomed_layer.local_path() { + self.current_physical_size_gauge.sub(path.metadata()?.len()); layer_paths_to_delete.insert(path); } + doomed_layer.delete()?; layers.remove_historic(doomed_layer); result.layers_removed += 1; } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 61058a7806..0ca8c6150c 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -382,6 +382,11 @@ pub trait Timeline: Send + Sync { lsn: Lsn, latest_gc_cutoff_lsn: &RwLockReadGuard, ) -> Result<()>; + + /// Get the physical size of the timeline at the latest LSN + fn get_physical_size(&self) -> u64; + /// Get the physical size of the timeline at the latest LSN non incrementally + fn get_physical_size_non_incremental(&self) -> Result; } /// Various functions to mutate the timeline. 
diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 984276bad2..1088e516aa 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -49,7 +49,9 @@ pub struct LocalTimelineInfo { #[serde_as(as = "DisplayFromStr")] pub disk_consistent_lsn: Lsn, pub current_logical_size: Option, // is None when timeline is Unloaded + pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, + pub current_physical_size_non_incremental: Option, pub timeline_state: LocalTimelineState, } @@ -57,6 +59,7 @@ impl LocalTimelineInfo { pub fn from_loaded_timeline( timeline: &TimelineImpl, include_non_incremental_logical_size: bool, + include_non_incremental_physical_size: bool, ) -> anyhow::Result { let last_record_lsn = timeline.get_last_record_lsn(); let info = LocalTimelineInfo { @@ -72,12 +75,18 @@ impl LocalTimelineInfo { prev_record_lsn: Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Loaded, + current_physical_size: Some(timeline.get_physical_size()), current_logical_size: Some(timeline.get_current_logical_size()), current_logical_size_non_incremental: if include_non_incremental_logical_size { Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?) } else { None }, + current_physical_size_non_incremental: if include_non_incremental_physical_size { + Some(timeline.get_physical_size_non_incremental()?) 
+ } else { + None + }, }; Ok(info) } @@ -97,7 +106,9 @@ impl LocalTimelineInfo { latest_gc_cutoff_lsn: metadata.latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Unloaded, current_logical_size: None, + current_physical_size: None, current_logical_size_non_incremental: None, + current_physical_size_non_incremental: None, } } @@ -106,11 +117,16 @@ impl LocalTimelineInfo { timeline_id: ZTimelineId, repo_timeline: &RepositoryTimeline, include_non_incremental_logical_size: bool, + include_non_incremental_physical_size: bool, ) -> anyhow::Result { match repo_timeline { RepositoryTimeline::Loaded(_) => { let timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)?; - Self::from_loaded_timeline(&*timeline, include_non_incremental_logical_size) + Self::from_loaded_timeline( + &*timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) } RepositoryTimeline::Unloaded { metadata } => Ok(Self::from_unloaded_timeline(metadata)), } @@ -320,6 +336,7 @@ fn bootstrap_timeline( pub(crate) fn get_local_timelines( tenant_id: ZTenantId, include_non_incremental_logical_size: bool, + include_non_incremental_physical_size: bool, ) -> Result> { let repo = tenant_mgr::get_repository_for_tenant(tenant_id) .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?; @@ -334,6 +351,7 @@ pub(crate) fn get_local_timelines( timeline_id, &repository_timeline, include_non_incremental_logical_size, + include_non_incremental_physical_size, )?, )) } @@ -387,7 +405,7 @@ pub(crate) fn create_timeline( // load the timeline into memory let loaded_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - LocalTimelineInfo::from_loaded_timeline(&*loaded_timeline, false) + LocalTimelineInfo::from_loaded_timeline(&*loaded_timeline, false, false) .context("cannot fill timeline info")? 
} None => { @@ -395,7 +413,7 @@ pub(crate) fn create_timeline( // load the timeline into memory let new_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - LocalTimelineInfo::from_loaded_timeline(&*new_timeline, false) + LocalTimelineInfo::from_loaded_timeline(&*new_timeline, false, false) .context("cannot fill timeline info")? } }; diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 7b7b16bcbf..c3788a0e9b 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -1,10 +1,15 @@ from contextlib import closing +import pathlib +from uuid import UUID +import re import psycopg2.extras import psycopg2.errors from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local from fixtures.log_helper import log import time +from fixtures.utils import get_timeline_dir_size + def test_timeline_size(neon_simple_env: NeonEnv): env = neon_simple_env @@ -176,3 +181,129 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): cur.execute("SELECT * from pg_size_pretty(pg_cluster_size())") pg_cluster_size = cur.fetchone() log.info(f"pg_cluster_size = {pg_cluster_size}") + + +def test_timeline_physical_size_init(neon_simple_env: NeonEnv): + env = neon_simple_env + new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_init') + pg = env.postgres.create_start("test_timeline_physical_size_init") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 1000) g""", + ]) + + # restart the pageserer to force calculating timeline's initial physical size + env.pageserver.stop() + env.pageserver.start() + + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): + env = neon_simple_env + new_timeline_id = 
env.neon_cli.create_branch('test_timeline_physical_size_post_checkpoint') + pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 1000) g""", + ]) + + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): + # Disable background compaction as we don't want it to happen after `get_physical_size` request + # and before checking the expected size on disk, which makes the assertion failed + neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='10m'}" + + env = neon_env_builder.init_start() + + new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_post_compaction') + pg = env.postgres.create_start("test_timeline_physical_size_post_compaction") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ]) + + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + env.pageserver.safe_psql(f"compact {env.initial_tenant.hex} {new_timeline_id.hex}") + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): + # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request + # and before checking the expected size on disk, which makes the assertion failed + neon_env_builder.pageserver_config_override = \ + "tenant_config={checkpoint_distance=100000, compaction_period='10m', gc_period='10m', pitr_interval='1s'}" + + env = neon_env_builder.init_start() + + new_timeline_id = 
env.neon_cli.create_branch('test_timeline_physical_size_post_gc') + pg = env.postgres.create_start("test_timeline_physical_size_post_gc") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ]) + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + pg.safe_psql(""" + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """) + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + + env.pageserver.safe_psql(f"do_gc {env.initial_tenant.hex} {new_timeline_id.hex} 0") + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_metric(neon_simple_env: NeonEnv): + env = neon_simple_env + + new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_metric') + pg = env.postgres.create_start("test_timeline_physical_size_metric") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ]) + + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + + # get the metrics and parse the metric for the current timeline's physical size + metrics = env.pageserver.http_client().get_metrics() + matches = re.search( + f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$', + metrics, + re.MULTILINE) + assert matches + + # assert that the metric matches the actual physical size on disk + tl_physical_size_metric = int(matches.group(1)) + timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id) + assert tl_physical_size_metric == get_timeline_dir_size(timeline_path) + + +def assert_physical_size(env: NeonEnv, tenant_id: UUID, timeline_id: UUID): + """Check the current physical size 
returned from timeline API + matches the total physical size of the timeline on disk""" + client = env.pageserver.http_client() + res = assert_timeline_local(client, tenant_id, timeline_id) + timeline_path = env.timeline_dir(tenant_id, timeline_id) + assert res["local"]["current_physical_size"] == res["local"][ + "current_physical_size_non_incremental"] + assert res["local"]["current_physical_size"] == get_timeline_dir_size(timeline_path) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b1fba29e3b..4913f0b456 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -691,6 +691,10 @@ class NeonEnv: """ Get list of safekeeper endpoints suitable for safekeepers GUC """ return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers]) + def timeline_dir(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Path: + """Get a timeline directory's path based on the repo directory of the test environment""" + return self.repo_dir / "tenants" / tenant_id.hex / "timelines" / timeline_id.hex + @cached_property def auth_keys(self) -> AuthKeys: pub = (Path(self.repo_dir) / 'auth_public_key.pem').read_bytes() @@ -863,8 +867,8 @@ class NeonPageserverHttpClient(requests.Session): def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1" - ) + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}" + + "?include-non-incremental-logical-size=1&include-non-incremental-physical-size=1") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index c49fa08d77..bc50a43ada 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,9 +1,11 @@ +import contextlib import os 
+import pathlib import shutil import subprocess from pathlib import Path -from typing import Any, List +from typing import Any, List, Tuple from fixtures.log_helper import log @@ -89,3 +91,36 @@ def get_dir_size(path: str) -> int: pass # file could be concurrently removed return totalbytes + + +def get_timeline_dir_size(path: pathlib.Path) -> int: + """Get the timeline directory's total size, which only counts the layer files' size.""" + sz = 0 + for dir_entry in path.iterdir(): + with contextlib.suppress(Exception): + # file is an image layer + _ = parse_image_layer(dir_entry.name) + sz += dir_entry.stat().st_size + continue + + with contextlib.suppress(Exception): + # file is a delta layer + _ = parse_delta_layer(dir_entry.name) + sz += dir_entry.stat().st_size + continue + return sz + + +def parse_image_layer(f_name: str) -> Tuple[int, int, int]: + """Parse an image layer file name. Return key start, key end, and snapshot lsn""" + parts = f_name.split("__") + key_parts = parts[0].split("-") + return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16) + + +def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: + """Parse a delta layer file name. Return key start, key end, lsn start, and lsn end""" + parts = f_name.split("__") + key_parts = parts[0].split("-") + lsn_parts = parts[1].split("-") + return int(key_parts[0], 16), int(key_parts[1], 16), int(lsn_parts[0], 16), int(lsn_parts[1], 16) From 01f1f1c1bfcbb1d1ef0e28782fcec138ddd9ac05 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 27 Jul 2022 20:29:22 +0200 Subject: [PATCH 28/29] Add OpenAPI spec for safekeeper HTTP API (neondatabase/cloud#1264, #2061) This spec is used in the `cloud` repo to generate HTTP client. 
--- .github/ansible/scripts/init_safekeeper.sh | 5 +- control_plane/src/safekeeper.rs | 3 +- safekeeper/src/http/models.rs | 3 +- safekeeper/src/http/openapi_spec.yaml | 365 +++++++++++++++++++++ safekeeper/src/http/routes.rs | 11 +- test_runner/fixtures/neon_fixtures.py | 2 +- 6 files changed, 377 insertions(+), 12 deletions(-) create mode 100644 safekeeper/src/http/openapi_spec.yaml diff --git a/.github/ansible/scripts/init_safekeeper.sh b/.github/ansible/scripts/init_safekeeper.sh index 2297788f59..a9b5025562 100644 --- a/.github/ansible/scripts/init_safekeeper.sh +++ b/.github/ansible/scripts/init_safekeeper.sh @@ -12,10 +12,9 @@ cat <, } diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml new file mode 100644 index 0000000000..da225f244b --- /dev/null +++ b/safekeeper/src/http/openapi_spec.yaml @@ -0,0 +1,365 @@ +openapi: "3.0.2" +info: + title: Safekeeper control API + version: "1.0" + + +servers: + - url: "http://localhost:7676" + + +paths: + /v1/status: + get: + tags: + - "Info" + summary: Get safekeeper status + description: "" + operationId: v1GetSafekeeperStatus + responses: + "200": + description: Safekeeper status + content: + application/json: + schema: + $ref: "#/components/schemas/SafekeeperStatus" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/tenant/{tenant_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + + delete: + tags: + - "Tenant" + summary: Delete tenant and all its timelines + description: "Deletes tenant and returns a map of timelines that were deleted along with their statuses" + operationId: v1DeleteTenant + responses: + "200": + description: Tenant deleted + content: + application/json: + schema: + $ref: "#/components/schemas/TenantDeleteResult" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + 
/v1/tenant/{tenant_id}/timeline: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + + post: + tags: + - "Timeline" + summary: Register new timeline + description: "" + operationId: v1CreateTenantTimeline + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineCreateRequest" + responses: + "201": + description: Timeline created + # TODO: return timeline info? + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/tenant/{tenant_id}/timeline/{timeline_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + + get: + tags: + - "Timeline" + summary: Get timeline information and status + description: "" + operationId: v1GetTenantTimeline + responses: + "200": + description: Timeline status + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineStatus" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + delete: + tags: + - "Timeline" + summary: Delete timeline + description: "" + operationId: v1DeleteTenantTimeline + responses: + "200": + description: Timeline deleted + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineDeleteResult" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/record_safekeeper_info/{tenant_id}/{timeline_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + + post: + tags: + - "Tests" + summary: Used only in tests to hand craft required data + description: "" + operationId: v1RecordSafekeeperInfo + requestBody: + content: + 
application/json: + schema: + $ref: "#/components/schemas/SkTimelineInfo" + responses: + "200": + description: Timeline info posted + # TODO: return timeline info? + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + +components: + securitySchemes: + JWT: + type: http + scheme: bearer + bearerFormat: JWT + + + schemas: + + # + # Requests + # + + TimelineCreateRequest: + type: object + required: + - timeline_id + - peer_ids + properties: + timeline_id: + type: string + format: hex + peer_ids: + type: array + items: + type: integer + minimum: 0 + + SkTimelineInfo: + type: object + required: + - last_log_term + - flush_lsn + - commit_lsn + - backup_lsn + - remote_consistent_lsn + - peer_horizon_lsn + - safekeeper_connstr + properties: + last_log_term: + type: integer + minimum: 0 + flush_lsn: + type: string + commit_lsn: + type: string + backup_lsn: + type: string + remote_consistent_lsn: + type: string + peer_horizon_lsn: + type: string + safekeeper_connstr: + type: string + + # + # Responses + # + + SafekeeperStatus: + type: object + required: + - id + properties: + id: + type: integer + minimum: 0 # kind of unsigned integer + + TimelineStatus: + type: object + required: + - timeline_id + - tenant_id + properties: + timeline_id: + type: string + format: hex + tenant_id: + type: string + format: hex + acceptor_state: + $ref: '#/components/schemas/AcceptorStateStatus' + flush_lsn: + type: string + timeline_start_lsn: + type: string + local_start_lsn: + type: string + commit_lsn: + type: string + backup_lsn: + type: string + peer_horizon_lsn: + type: string + remote_consistent_lsn: + type: string + + AcceptorStateStatus: + type: object + required: + - term + - epoch + properties: + term: + type: integer + minimum: 0 # kind of unsigned integer + epoch: + type: integer + minimum: 0 # kind of unsigned integer + term_history: + type: array + items: + $ref: '#/components/schemas/TermSwitchEntry' + + 
TermSwitchEntry: + type: object + required: + - term + - lsn + properties: + term: + type: integer + minimum: 0 # kind of unsigned integer + lsn: + type: string + + TimelineDeleteResult: + type: object + required: + - dir_existed + - was_active + properties: + dir_existed: + type: boolean + was_active: + type: boolean + + TenantDeleteResult: + type: object + additionalProperties: + $ref: "#/components/schemas/TimelineDeleteResult" + example: + 57fd1b39f23704a63423de0a8435d85c: + dir_existed: true + was_active: false + 67fd1b39f23704a63423ab8435d85c33: + dir_existed: false + was_active: false + + # + # Errors + # + + GenericErrorContent: + type: object + properties: + msg: + type: string + + responses: + + # + # Errors + # + + GenericError: + description: Generic error response + content: + application/json: + schema: + $ref: "#/components/schemas/GenericErrorContent" + + ForbiddenError: + description: Forbidden error response + content: + application/json: + schema: + type: object + required: + - msg + properties: + msg: + type: string + + +security: + - JWT: [] diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 33581c6c31..13356c5921 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -126,7 +126,7 @@ async fn timeline_create_handler(mut request: Request) -> Result SafekeeperTimelineStatus: - res = self.get(f"http://localhost:{self.port}/v1/timeline/{tenant_id}/{timeline_id}") + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() resj = res.json() return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'], From 58b04438f0fad05e78e661e1643b6653af092dd7 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 27 Jul 2022 20:04:34 +0300 Subject: [PATCH 29/29] Tweak backoff numbers to avoid no wal connection threshold trigger --- pageserver/src/walreceiver/connection_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2
deletions(-) diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index f2aa7ce2cf..f2b1671eb4 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -230,8 +230,8 @@ async fn subscribe_for_timeline_updates( } } -const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 2.0; -const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 60.0; +const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; +const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) { if n == 0 {