From 031e57a973d5be159012a7af44d4b41f7abd61be Mon Sep 17 00:00:00 2001
From: Kirill Bulatov
Date: Wed, 14 Sep 2022 16:10:52 +0300
Subject: [PATCH 01/90] Disable failpoints by default

---
 .github/workflows/build_and_test.yml |  6 ++++--
 pageserver/Cargo.toml                | 10 +++++-----
 test_runner/README.md                |  1 +
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index d586741d68..7688f9c1bd 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -94,15 +94,17 @@ jobs:
       # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
       # because "cargo metadata" doesn't accept --release or --debug options
       #
+      # We run tests with additional features that are turned off by default (e.g. in release builds); see
+      # the corresponding Cargo.toml files for their descriptions.
       - name: Set env variables
         run: |
           if [[ $BUILD_TYPE == "debug" ]]; then
             cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
-            CARGO_FEATURES=""
+            CARGO_FEATURES="--features failpoints"
             CARGO_FLAGS="--locked --timings"
           elif [[ $BUILD_TYPE == "release" ]]; then
             cov_prefix=""
-            CARGO_FEATURES="--features profiling"
+            CARGO_FEATURES="--features failpoints,profiling"
             CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES"
           fi
           echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 11d2d94906..ce09e788bd 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -4,12 +4,12 @@ version = "0.1.0"
 edition = "2021"
 
 [features]
-# It is simpler infra-wise to have failpoints enabled by default
-# It shouldn't affect performance in any way because failpoints
-# are not placed in hot code paths
-default = ["failpoints"]
-profiling = ["pprof"]
+default = []
+
+# This feature enables the fail_point! macro (which adds some runtime cost),
+# so that tests can exercise outage conditions
 failpoints = ["fail/failpoints"]
+profiling = ["pprof"]
 
 [dependencies]
 async-stream = "0.3"
diff --git a/test_runner/README.md b/test_runner/README.md
index 44751944b3..01fe4ff863 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -6,6 +6,7 @@ Prerequisites:
 - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python)
 - Neon and Postgres binaries
   - See the root [README.md](/README.md) for build directions
+    If you want to run tests with failpoints, you need to add `--features failpoints` to the Rust build commands.
 - Tests can be run from the git tree; or see the environment variables
   below to run from other directories.
 - The neon git repo, including the postgres submodule

From db5ec0dae70aed65d79a23574afb4f2ea8d4fa06 Mon Sep 17 00:00:00 2001
From: sharnoff
Date: Thu, 15 Sep 2022 23:50:46 -0700
Subject: [PATCH 02/90] Cleanup/simplify logical size calculation (#2459)

Should produce identical results; replaces an error case that shouldn't
be possible with `expect`.
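
For illustration, a minimal standalone sketch (not part of the patch itself;
`increment` is a hypothetical stand-in for `size_added_after_initial`) of why
the simplified `unwrap_or(0)` form agrees with the old explicit sign check:

```rust
fn old_form(increment: i64) -> u64 {
    // Old code: branch on the sign, then convert.
    if increment < 0 {
        0
    } else {
        u64::try_from(increment).expect("not negative, cannot fail")
    }
}

fn new_form(increment: i64) -> u64 {
    // New code: u64::try_from fails exactly when increment < 0,
    // so falling back to 0 covers the same case.
    u64::try_from(increment).unwrap_or(0)
}

fn main() {
    for i in [i64::MIN, -1, 0, 1, i64::MAX] {
        assert_eq!(old_form(i), new_form(i));
    }
}
```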
---
 pageserver/src/tenant/timeline.rs | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e821ef1b9a..95bdf715b5 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -232,14 +232,16 @@ impl LogicalSize {
     }
 
     fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
-        let size_increment = self.size_added_after_initial.load(AtomicOrdering::Acquire);
+        let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
+        // ^^^ keep this type explicit so that the casts in this function break if
+        // we change the type.
         match self.initial_logical_size.get() {
             Some(initial_size) => {
                 let absolute_size_increment = u64::try_from(
                     size_increment
                         .checked_abs()
                         .with_context(|| format!("Size added after initial {size_increment} is not expected to be i64::MIN"))?,
-                ).with_context(|| format!("Failed to convert size increment {size_increment} to u64"))?;
+                ).expect("casting nonnegative i64 to u64 should not fail");
 
                 if size_increment < 0 {
                     initial_size.checked_sub(absolute_size_increment)
@@ -249,11 +251,7 @@ impl LogicalSize {
                     .map(CurrentLogicalSize::Exact)
             }
             None => {
-                let non_negative_size_increment = if size_increment < 0 {
-                    0
-                } else {
-                    u64::try_from(size_increment).expect("not negative, cannot fail")
-                };
+                let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
                 Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
             }
         }

From 74312e268febaff8829b6fa795268231bd985699 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov
Date: Fri, 16 Sep 2022 09:49:33 +0300
Subject: [PATCH 03/90] Tidy up storage artifact build flags

* Simplify test build features handling
* Build only necessary binaries during the release build
---
 .github/workflows/build_and_test.yml | 4 ++--
 Dockerfile                           | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 7688f9c1bd..f67d42f2ff 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -101,7 +101,7 @@ jobs:
           if [[ $BUILD_TYPE == "debug" ]]; then
             cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
             CARGO_FEATURES="--features failpoints"
-            CARGO_FLAGS="--locked --timings"
+            CARGO_FLAGS="--locked --timings $CARGO_FEATURES"
           elif [[ $BUILD_TYPE == "release" ]]; then
             cov_prefix=""
             CARGO_FEATURES="--features failpoints,profiling"
@@ -160,7 +160,7 @@ jobs:
 
       - name: Run cargo build
         run: |
-          ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
+          ${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests
         shell: bash -euxo pipefail {0}
 
       - name: Run cargo test
diff --git a/Dockerfile b/Dockerfile
index eacb88d168..711a92a90e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -44,7 +44,7 @@ COPY . .
 
 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \ -&& mold -run cargo build --locked --release \ +&& mold -run cargo build --bin pageserver --bin safekeeper --bin proxy --locked --release \ && cachepot -s # Build final image From 72b33997c773a963521d8007136c30080292e85e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 16 Sep 2022 10:09:54 +0100 Subject: [PATCH 04/90] Nightly Benchmarks: trigger tests earlier (#2463) --- .github/workflows/benchmarking.yml | 3 ++- .github/workflows/build_and_test.yml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index fab0a9aa04..df0e8a4275 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,7 +11,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '36 4 * * *' # run once a day, timezone is utc + - cron: '0 3 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually inputs: @@ -239,6 +239,7 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Create Allure report + if: always() uses: ./.github/actions/allure-report with: action: generate diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f67d42f2ff..5bff469582 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -324,6 +324,7 @@ jobs: build_type: ${{ matrix.build_type }} - name: Store Allure test stat in the DB + if: ${{ steps.create-allure-report.outputs.report-url }} env: BUILD_TYPE: ${{ matrix.build_type }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} From 4db15d3c7cbfbbe17c6f18af7b5eae3198fafadf Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 14 Sep 2022 18:22:00 +0300 Subject: [PATCH 05/90] change prefix_in_bucket in pageserver config --- .github/ansible/deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index b47db6a9b5..c06a0ef5b3 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -71,7 +71,7 @@ - "[remote_storage]" - "bucket_name = '{{ bucket_name }}'" - "bucket_region = '{{ bucket_region }}'" - - "prefix_in_bucket = '{{ inventory_hostname }}'" + - "prefix_in_bucket = 'pageserver/v1'" become: true tags: - pageserver From 44fd4e3c9f9b8087dc0871785f87ed7848538839 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 16 Sep 2022 16:59:05 +0300 Subject: [PATCH 06/90] add more logs --- pageserver/src/storage_sync.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 9d259bf1e2..64e0f9a9e3 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -601,6 +601,7 @@ pub fn spawn_storage_sync_task( for (tenant_id, timeline_data) in local_timeline_files.0 { if timeline_data.is_empty() { + info!("got empty tenant {}", tenant_id); let _ = empty_tenants.0.entry(tenant_id).or_default(); } else { for (timeline_id, timeline_data) in timeline_data { @@ -1303,6 +1304,10 @@ fn schedule_first_sync_tasks( None => { // TODO (rodionov) does this mean that we've crashed during tenant creation? // is it safe to upload this checkpoint? could it be half broken? 
+            warn!(
+                "marking {} as locally complete, while it doesn't exist in remote index",
+                sync_id
+            );
             new_sync_tasks.push_back((
                 sync_id,
                 SyncTask::upload(LayersUpload {
@@ -1337,6 +1342,8 @@ fn compare_local_and_remote_timeline(
     local_files: HashSet<PathBuf>,
     remote_entry: &RemoteTimeline,
 ) -> (LocalTimelineInitStatus, bool) {
+    let _entered = info_span!("compare_local_and_remote_timeline", sync_id = %sync_id).entered();
+
     let remote_files = remote_entry.stored_files();
 
     let number_of_layers_to_download = remote_files.difference(&local_files).count();
@@ -1347,10 +1354,12 @@
             layers_to_skip: local_files.clone(),
         }),
     ));
+        info!("NeedsSync");
         (LocalTimelineInitStatus::NeedsSync, true)
         // we do not need to manipulate with remote consistent lsn here
         // because it will be updated when sync will be completed
     } else {
+        info!("LocallyComplete");
         (
             LocalTimelineInitStatus::LocallyComplete(local_metadata.clone()),
             false,

From 9c35a094527fea58f1f402f99682fe9dc8c23b02 Mon Sep 17 00:00:00 2001
From: sharnoff
Date: Fri, 16 Sep 2022 08:37:44 -0700
Subject: [PATCH 07/90] Improve build errors when `postgres_ffi` fails (#2460)

This commit does two things of note:

1. Bumps the bindgen dependency from `0.59.1` to `0.60.1`. This gets us
   an actual error type from bindgen, so we can display what's wrong.
2. Adds `anyhow` as a build dependency, so our error message can be
   prettier. It's already used heavily elsewhere in the crates in this
   repo, so I figured the fact it's a build dependency doesn't matter
   much.

I ran into this from running `cargo ` without running `make` first.
Here's a comparison of the compiler output in those two cases.

Before this commit:

```
error: failed to run custom build command for `postgres_ffi v0.1.0 ($repo_path/libs/postgres_ffi)`

Caused by:
  process didn't exit successfully: `$repo_path/target/debug/build/postgres_ffi-2f7253b3ad3ca840/build-script-build` (exit status: 101)
  --- stdout
  cargo:rerun-if-changed=bindgen_deps.h

  --- stderr
  bindgen_deps.h:7:10: fatal error: 'c.h' file not found
  bindgen_deps.h:7:10: fatal error: 'c.h' file not found, err: true
  thread 'main' panicked at 'Unable to generate bindings: ()', libs/postgres_ffi/build.rs:135:14
  note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
```

After this commit:

```
error: failed to run custom build command for `postgres_ffi v0.1.0 ($repo_path/libs/postgres_ffi)`

Caused by:
  process didn't exit successfully: `$repo_path/target/debug/build/postgres_ffi-e01fb59602596748/build-script-build` (exit status: 1)
  --- stdout
  cargo:rerun-if-changed=bindgen_deps.h

  --- stderr
  bindgen_deps.h:7:10: fatal error: 'c.h' file not found
  Error: Unable to generate bindings

  Caused by:
      clang diagnosed error: bindgen_deps.h:7:10: fatal error: 'c.h' file not found
```
---
 Cargo.lock                   |  6 +++---
 libs/postgres_ffi/Cargo.toml |  3 ++-
 libs/postgres_ffi/build.rs   | 29 +++++++++++++++++----------
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a258fab5f6..ca169dc0c8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -229,14 +229,14 @@ dependencies = [
 
 [[package]]
 name = "bindgen"
-version = "0.59.2"
+version = "0.60.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8"
+checksum = "062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6"
 dependencies = [
  "bitflags",
  "cexpr",
  "clang-sys",
- "clap 2.34.0",
+ "clap 3.2.16",
  "env_logger",
  "lazy_static",
  "lazycell",
diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml
index 2b453fa0dc..60caca76b8 100644
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -25,4 +25,5 @@ postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d05
 wal_craft = { path = "wal_craft" }
 
 [build-dependencies]
-bindgen = "0.59.1"
+anyhow = "1.0"
+bindgen = "0.60.1"
diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs
index 8389ac37fe..25ff398bbd 100644
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -4,6 +4,7 @@ use std::env;
 use std::path::PathBuf;
 use std::process::Command;
 
+use anyhow::{anyhow, Context};
 use bindgen::callbacks::ParseCallbacks;
 
 #[derive(Debug)]
@@ -42,7 +43,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {
     }
 }
 
-fn main() {
+fn main() -> anyhow::Result<()> {
     // Tell cargo to invalidate the built crate whenever the wrapper changes
     println!("cargo:rerun-if-changed=bindgen_deps.h");
 
@@ -58,7 +59,7 @@
     for pg_version in &["v14", "v15"] {
         let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
         if pg_install_dir_versioned.is_relative() {
-            let cwd = env::current_dir().unwrap();
+            let cwd = env::current_dir().context("Failed to get current_dir")?;
             pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned);
         }
 
@@ -70,21 +71,25 @@
             let output = Command::new(pg_config_bin)
                 .arg("--includedir-server")
                 .output()
-                .expect("failed to execute `pg_config --includedir-server`");
+                .context("failed to execute `pg_config --includedir-server`")?;
             if !output.status.success() {
                 panic!("`pg_config --includedir-server` failed")
             }
 
-            String::from_utf8(output.stdout).unwrap().trim_end().into()
+            String::from_utf8(output.stdout)
+                .context("pg_config output is not UTF-8")?
+                .trim_end()
+                .into()
         } else {
-            pg_install_dir_versioned
+            let server_path = pg_install_dir_versioned
                 .join("include")
                 .join("postgresql")
                 .join("server")
-                .into_os_string()
+                .into_os_string();
+            server_path
                 .into_string()
-                .unwrap()
+                .map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
         };
 
         // The bindgen::Builder is the main entry point
@@ -132,14 +137,18 @@
             // Finish the builder and generate the bindings.
             //
             .generate()
-            .expect("Unable to generate bindings");
+            .context("Unable to generate bindings")?;
 
         // Write the bindings to the $OUT_DIR/bindings_$pg_version.rs file.
-        let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
+        let out_path: PathBuf = env::var("OUT_DIR")
+            .context("Couldn't read the OUT_DIR environment variable")?
+            .into();
         let filename = format!("bindings_{pg_version}.rs");
         bindings
             .write_to_file(out_path.join(filename))
-            .expect("Couldn't write bindings!");
+            .context("Couldn't write bindings")?;
     }
+
+    Ok(())
 }

From 65a5010e256da28cbf9a9410ecd7953d8f57cd00 Mon Sep 17 00:00:00 2001
From: Egor Suvorov
Date: Fri, 16 Sep 2022 17:44:02 +0200
Subject: [PATCH 08/90] Use custom `install` command in Makefile to speed up
 incremental builds (#2458)

Fixes #1873: previously any run of `make` caused the `postgres-v15-headers`
target to build. It copied a bunch of headers via `install -C`.
Unfortunately, some origins were symlinks in the `./pg_install/build`
directory pointing inside `./vendor/postgres-v15` (e.g. `pg_config_os.h`
pointing to `linux.h`). GNU coreutils' `install` ignores the `-C` key for
non-regular files and always overwrites the destination if the origin is a symlink.
That in turn made Cargo rebuild the `postgres_ffi` crate and all its
dependencies because it thinks that Postgres headers changed, even if they
did not. That was slow.

Now we use a custom script that wraps the `install` program. It handles one
specific case and makes sure individual headers are never copied if their
content did not change. Hence, `postgres_ffi` is not rebuilt unless there
were some changes to the C code.

One may still have slow incremental single-threaded builds because Postgres
Makefiles spawn about 2800 sub-makes even if no files have been changed.
A no-op build takes "only" 3-4 seconds on my machine now when run with
`-j30`, and 20 seconds when run with `-j1`.
---
 Makefile            |  2 +-
 scripts/ninstall.sh | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100755 scripts/ninstall.sh

diff --git a/Makefile b/Makefile
index 4ac51ed174..738a45fd5e 100644
--- a/Makefile
+++ b/Makefile
@@ -38,7 +38,7 @@ endif
 # headers, the mtime of the headers are not changed when there have
 # been no changes to the files. Changing the mtime triggers an
 # unnecessary rebuild of 'postgres_ffi'.
-PG_CONFIGURE_OPTS += INSTALL='install -C'
+PG_CONFIGURE_OPTS += INSTALL='$(ROOT_PROJECT_DIR)/scripts/ninstall.sh -C'
 
 # Choose whether we should be silent or verbose
 CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
diff --git a/scripts/ninstall.sh b/scripts/ninstall.sh
new file mode 100755
index 0000000000..3554e3e4df
--- /dev/null
+++ b/scripts/ninstall.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -euo pipefail
+# GNU coreutils' `install -C` always overwrites the destination if the source
+# is not a regular file, which is the case with lots of headers symlinked into
+# the build directory by `./configure`. That causes Rust's Cargo to think that
+# Postgres headers have been updated after a `make` call even if no files have
+# been touched. That causes long recompilation of `postgres_ffi` and all dependent
+# packages. To counter that, we handle a special case here: do not copy the file
+# if its content did not change. We only handle a single case where `install`
+# installs a single file with a specific set of arguments, the rest does not
+# matter in our configuration.
+#
+# Such behavior may be incorrect if e.g. permissions have changed, but it should
+# not happen during normal Neon development that often, and a rebuild should help.
+#
+# See https://github.com/neondatabase/neon/issues/1873
+if [ "$#" == "5" ]; then
+  if [ "$1" == "-C" ] && [ "$2" == "-m" ] && [ "$3" == "644" ]; then
+    if [ -e "$5" ] && diff -q "$4" "$5" >/dev/null 2>&1; then
+      exit 0
+    fi
+  fi
+fi
+install "$@"

From b46c8b4ae008f88a0693837752d0ca8007a54dd5 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov
Date: Fri, 16 Sep 2022 11:35:51 +0300
Subject: [PATCH 09/90] Add an alias to build test images simply

---
 .cargo/config.toml    | 3 +++
 test_runner/README.md | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/.cargo/config.toml b/.cargo/config.toml
index 76a2ff549e..d70d57a817 100644
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -11,3 +11,6 @@ opt-level = 3
 
 [profile.dev]
 # Turn on a small amount of optimization in Development mode.
 opt-level = 1
+
+[alias]
+build_testing = ["build", "--features", "failpoints"]
diff --git a/test_runner/README.md b/test_runner/README.md
index 01fe4ff863..f17a4a5a5d 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -7,6 +7,8 @@ Prerequisites:
 - Neon and Postgres binaries
   - See the root [README.md](/README.md) for build directions
     If you want to run tests with failpoints, you need to add `--features failpoints` to the Rust build commands.
+    For convenience, the repository cargo config contains a `build_testing` alias that serves as a subcommand, adding the required feature flags.
+    Usage example: `cargo build_testing --release` is equivalent to `cargo build --features failpoints --release`
 - Tests can be run from the git tree; or see the environment variables
   below to run from other directories.
 - The neon git repo, including the postgres submodule

From c9c3c77c31e45cf59c02dbe142d0c99432fc4f18 Mon Sep 17 00:00:00 2001
From: Egor Suvorov
Date: Fri, 16 Sep 2022 19:51:35 +0200
Subject: [PATCH 10/90] Fix Docker image builds (follow-up for #2458) (#2469)

Put ninstall.sh inside Docker images for building
---
 .dockerignore | 1 +
 Dockerfile    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.dockerignore b/.dockerignore
index 4bc8e5fa13..92eb4f24de 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -18,3 +18,4 @@
 !vendor/postgres-v15/
 !workspace_hack/
 !neon_local/
+!scripts/ninstall.sh
diff --git a/Dockerfile b/Dockerfile
index 711a92a90e..213934a844 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,6 +14,7 @@ COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
 COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
+COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
 
 ENV BUILD_TYPE release
 RUN set -e \

From 846d126579bd34f0b57b11a4e5477d8d239feea2 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Mon, 19 Sep 2022 12:56:08 +0300
Subject: [PATCH 11/90] Set last written lsn for created relation (#2398)

* Set last written lsn for created relation

* use current LSN for updating last written LSN of relation metadata

* Update LSN for the extended blocks even for pages without LSN (zeroed)

* Update pgxn/neon/pagestore_smgr.c

Co-authored-by: Heikki Linnakangas

Co-authored-by: Heikki Linnakangas
---
 pgxn/neon/pagestore_smgr.c | 12 +++++++++++-
 vendor/postgres-v14        |  2 +-
 vendor/postgres-v15        |  2 +-
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 24adee019f..8e6dd373b0 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -959,7 +959,17 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	if (IS_LOCAL_REL(reln))
 		mdextend(reln, forkNum, blkno, buffer, skipFsync);
 #endif
-
+	/*
+	 * smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr.
+	 * An smgr_write() call will come for the buffer later, after it has been initialized
+	 * with the real page contents, and it is eventually evicted from the buffer cache.
+	 * But we need a valid LSN for the relation metadata update now.
+ */ + if (lsn == InvalidXLogRecPtr) + { + lsn = GetXLogInsertRecPtr(); + SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forkNum, blkno); + } SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum); } diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index ce723ee499..796770565f 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit ce723ee499450cb108aede464a35a17f3d75cf84 +Subproject commit 796770565ff668b585e80733b8d679961ad50e93 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 0858387047..7d144ae2f3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 08583870479e30c64aeb5a97d6fee9cf470f05fb +Subproject commit 7d144ae2f3649570f60a0477993b8c8ad9dd8c4b From 90ed12630e698441a66fce7c095cc5a02487a26d Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Mon, 19 Sep 2022 12:57:44 +0200 Subject: [PATCH 12/90] Add zenith-us-stage-ps-4 and undo changes in prefix_in_bucket in pageserver config (#2473) * Add zenith-us-stage-ps-4 * Undo changes in prefix_in_bucket in pageserver config (Rollback #2449) --- .github/ansible/deploy.yaml | 2 +- .github/ansible/staging.hosts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index c06a0ef5b3..b47db6a9b5 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -71,7 +71,7 @@ - "[remote_storage]" - "bucket_name = '{{ bucket_name }}'" - "bucket_region = '{{ bucket_region }}'" - - "prefix_in_bucket = 'pageserver/v1'" + - "prefix_in_bucket = '{{ inventory_hostname }}'" become: true tags: - pageserver diff --git a/.github/ansible/staging.hosts b/.github/ansible/staging.hosts index 2bb28f1972..c470f8a814 100644 --- a/.github/ansible/staging.hosts +++ b/.github/ansible/staging.hosts @@ -2,6 +2,7 @@ #zenith-us-stage-ps-1 console_region_id=27 zenith-us-stage-ps-2 console_region_id=27 zenith-us-stage-ps-3 console_region_id=27 +zenith-us-stage-ps-4 console_region_id=27 [safekeepers] zenith-us-stage-sk-4 console_region_id=27 From d11cb4b2f115eb3be48f31926b952bbbbd21e6f7 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 14:23:44 +0300 Subject: [PATCH 13/90] Bump vendor/postgres-v15 to the latest state of REL_15_STABLE_neon branch --- vendor/postgres-v15 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 7d144ae2f3..34c47d6c99 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 7d144ae2f3649570f60a0477993b8c8ad9dd8c4b +Subproject commit 34c47d6c99415c94296d5e599ec5590d0001d6c2 From 4b5e7f2f82aaa0c1427b42976a555d7c236ee5ad Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 19 Sep 2022 11:14:34 +0300 Subject: [PATCH 14/90] Temporarily disable storage deployments Do not update configs Do not restart servieces Still update binaries --- .github/ansible/deploy.yaml | 42 ++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index b47db6a9b5..6982445558 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -63,18 +63,18 @@ tags: - pageserver - - name: update remote storage (s3) config - lineinfile: - path: /storage/pageserver/data/pageserver.toml - line: "{{ item }}" - loop: - - "[remote_storage]" - - "bucket_name = '{{ bucket_name }}'" - - "bucket_region = '{{ bucket_region }}'" - - "prefix_in_bucket = '{{ 
inventory_hostname }}'"
     become: true
     tags:
     - pageserver
diff --git a/.github/ansible/staging.hosts b/.github/ansible/staging.hosts
index 2bb28f1972..c470f8a814 100644
--- a/.github/ansible/staging.hosts
+++ b/.github/ansible/staging.hosts
@@ -2,6 +2,7 @@
 #zenith-us-stage-ps-1 console_region_id=27
 zenith-us-stage-ps-2 console_region_id=27
 zenith-us-stage-ps-3 console_region_id=27
+zenith-us-stage-ps-4 console_region_id=27
 
 [safekeepers]
 zenith-us-stage-sk-4 console_region_id=27

From d11cb4b2f115eb3be48f31926b952bbbbd21e6f7 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova
Date: Mon, 19 Sep 2022 14:23:44 +0300
Subject: [PATCH 13/90] Bump vendor/postgres-v15 to the latest state of
 REL_15_STABLE_neon branch

---
 vendor/postgres-v15 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 7d144ae2f3..34c47d6c99 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 7d144ae2f3649570f60a0477993b8c8ad9dd8c4b
+Subproject commit 34c47d6c99415c94296d5e599ec5590d0001d6c2

From 4b5e7f2f82aaa0c1427b42976a555d7c236ee5ad Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov
Date: Mon, 19 Sep 2022 11:14:34 +0300
Subject: [PATCH 14/90] Temporarily disable storage deployments

Do not update configs
Do not restart services
Still update binaries
---
 .github/ansible/deploy.yaml | 42 ++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml
index b47db6a9b5..6982445558 100644
--- a/.github/ansible/deploy.yaml
+++ b/.github/ansible/deploy.yaml
@@ -63,18 +63,18 @@
     tags:
     - pageserver
 
-  - name: update remote storage (s3) config
-    lineinfile:
-      path: /storage/pageserver/data/pageserver.toml
-      line: "{{ item }}"
-    loop:
-      - "[remote_storage]"
-      - "bucket_name = '{{ bucket_name }}'"
-      - "bucket_region = '{{ bucket_region }}'"
-      - "prefix_in_bucket = '{{ inventory_hostname }}'"
-    become: true
-    tags:
-    - pageserver
+  # - name: update remote storage (s3) config
+  #   lineinfile:
+  #     path: /storage/pageserver/data/pageserver.toml
+  #     line: "{{ item }}"
+  #   loop:
+  #     - "[remote_storage]"
+  #     - "bucket_name = '{{ bucket_name }}'"
+  #     - "bucket_region = '{{ bucket_region }}'"
+  #     - "prefix_in_bucket = '{{ inventory_hostname }}'"
+  #   become: true
+  #   tags:
+  #     - pageserver
 
   - name: upload systemd service definition
     ansible.builtin.template:
@@ -87,15 +87,15 @@
     tags:
     - pageserver
 
-  - name: start systemd service
-    ansible.builtin.systemd:
-      daemon_reload: yes
-      name: pageserver
-      enabled: yes
-      state: restarted
-    become: true
-    tags:
-    - pageserver
+  # - name: start systemd service
+  #   ansible.builtin.systemd:
+  #     daemon_reload: yes
+  #     name: pageserver
+  #     enabled: yes
+  #     state: restarted
+  #   become: true
+  #   tags:
+  #     - pageserver
 
   - name: post version to console
     when: console_mgmt_base_url is defined

From fcb4a61a120ab29de19f8a0bbe64aa29bed5f194 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov
Date: Mon, 19 Sep 2022 18:41:18 +0300
Subject: [PATCH 15/90] Adjust spans around gc and compaction

So compaction and gc loops have their own span to always show tenant id
in log messages.
---
 pageserver/src/page_service.rs                |  3 +++
 pageserver/src/tenant.rs                      |  6 +-----
 pageserver/src/tenant/timeline.rs             |  9 +++++----
 pageserver/src/tenant_tasks.rs                | 20 +++++++++++--------
 .../src/walreceiver/connection_manager.rs     |  9 +++++++--
 5 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index b06814c557..1461a6d117 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1090,6 +1090,9 @@ impl postgres_backend_async::Handler for PageServerHandler {
         let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?;
         let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?;
 
+        let _span_guard =
+            info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
+
         let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
 
         let gc_horizon: u64 = caps
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 41fd98ec07..f56f10d7ea 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -342,8 +342,7 @@ impl Tenant {
         drop(timelines);
 
         for (timeline_id, timeline) in &timelines_to_compact {
-            let _entered =
-                info_span!("compact", timeline = %timeline_id, tenant = %self.tenant_id).entered();
+            let _entered = info_span!("compact_timeline", timeline = %timeline_id).entered();
             timeline.compact()?;
         }
 
@@ -835,9 +834,6 @@ impl Tenant {
         pitr: Duration,
         checkpoint_before_gc: bool,
     ) -> Result<GcResult> {
-        let _span_guard =
-            info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timeline_id)
-                .entered();
         let mut totals: GcResult = Default::default();
         let now = Instant::now();
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 95bdf715b5..8670e979ee 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1916,18 +1916,19 @@ impl Timeline {
 
         let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
 
+        let _enter =
+            info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff)
+                .entered();
+
+        // Nothing to GC. Return early.
let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); if latest_gc_cutoff >= new_gc_cutoff { info!( - "Nothing to GC for timeline {}: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", - self.timeline_id + "Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", ); return Ok(result); } - let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered(); - // We need to ensure that no one tries to read page versions or create // branches at a point before latest_gc_cutoff_lsn. See branch_timeline() // for details. This will block until the old value is no longer in use. diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index c543a0ecb1..8329b15c08 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -21,7 +21,9 @@ pub fn start_background_loops(tenant_id: TenantId) { &format!("compactor for tenant {tenant_id}"), false, async move { - compaction_loop(tenant_id).await; + compaction_loop(tenant_id) + .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) + .await; Ok(()) }, ); @@ -33,7 +35,9 @@ pub fn start_background_loops(tenant_id: TenantId) { &format!("garbage collector for tenant {tenant_id}"), false, async move { - gc_loop(tenant_id).await; + gc_loop(tenant_id) + .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) + .await; Ok(()) }, ); @@ -44,7 +48,7 @@ pub fn start_background_loops(tenant_id: TenantId) { /// async fn compaction_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting compaction loop for {tenant_id}"); + info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { loop { @@ -52,7 +56,7 @@ async fn compaction_loop(tenant_id: TenantId) { let tenant = tokio::select! { _ = task_mgr::shutdown_watcher() => { - info!("received compaction cancellation request"); + info!("received cancellation request"); return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { @@ -73,7 +77,7 @@ async fn compaction_loop(tenant_id: TenantId) { // Sleep tokio::select! { _ = task_mgr::shutdown_watcher() => { - info!("received compaction cancellation request during idling"); + info!("received cancellation request during idling"); break ; }, _ = tokio::time::sleep(sleep_duration) => {}, @@ -91,7 +95,7 @@ async fn compaction_loop(tenant_id: TenantId) { /// async fn gc_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting gc loop for {tenant_id}"); + info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { loop { @@ -99,7 +103,7 @@ async fn gc_loop(tenant_id: TenantId) { let tenant = tokio::select! { _ = task_mgr::shutdown_watcher() => { - info!("received GC cancellation request"); + info!("received cancellation request"); return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { @@ -123,7 +127,7 @@ async fn gc_loop(tenant_id: TenantId) { // Sleep tokio::select! 
{ _ = task_mgr::shutdown_watcher() => { - info!("received GC cancellation request during idling"); + info!("received cancellation request during idling"); break; }, _ = tokio::time::sleep(sleep_duration) => {}, diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 1e4b4e7d52..799062e935 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -58,7 +58,10 @@ pub fn spawn_connection_manager_task( TaskKind::WalReceiverManager, Some(tenant_id), Some(timeline_id), - &format!("walreceiver for tenant {} timeline {}", timeline.tenant_id, timeline.timeline_id), + &format!( + "walreceiver for tenant {} timeline {}", + timeline.tenant_id, timeline.timeline_id + ), false, async move { info!("WAL receiver broker started, connecting to etcd"); @@ -88,7 +91,9 @@ pub fn spawn_connection_manager_task( } } } - .instrument(info_span!("wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) + .instrument( + info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id), + ), ); Ok(()) } From 6985f6cd6c53ae96ad4afaaaf546f5d94c869d50 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 19 Sep 2022 20:56:11 +0300 Subject: [PATCH 16/90] Add a new benchmark data series for prefetching. Also run benchmarks with the seqscan prefetching (commit f44afbaf62) enabled. Renames the 'neon-captest' test to 'neon-captest-reuse', for clarity --- .github/workflows/benchmarking.yml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index df0e8a4275..9a9021ac37 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -144,9 +144,10 @@ jobs: strategy: fail-fast: false matrix: - # neon-captest: Run pgbench, reusing existing project - # neon-captest-new: Same, but on a freshly created project - platform: [ neon-captest, neon-captest-new, rds-aurora ] + # neon-captest-new: Run pgbench in a freshly created project + # neon-captest-reuse: Same, but reusing existing project + # neon-captest-prefetch: Same, with prefetching enabled (new project) + platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-aurora ] runs-on: dev container: @@ -164,7 +165,7 @@ jobs: sudo apt install -y postgresql-14 - name: Create Neon Project - if: matrix.platform == 'neon-captest-new' + if: matrix.platform != 'neon-captest-reuse' id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -175,17 +176,20 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest) + neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; neon-captest-new) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; + neon-captest-prefetch) + CONNSTR=${{ steps.create-neon-project.outputs.dsn }}?options=-cenable_seqscan_prefetch%3Don%20-cseqscan_prefetch_buffers%3D10 + ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest', 'neon-captest-new' or 'rds-aurora'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch' or 'rds-aurora'" exit 1 ;; esac @@ -246,7 +250,7 @@ jobs: build_type: ${{ env.BUILD_TYPE }} - name: Delete Neon Project - if: ${{ matrix.platform == 'neon-captest-new' && always() }} + if: ${{ matrix.platform != 'neon-captest-reuse' && always() }} uses: ./.github/actions/neon-project-delete with: environment: dev From bb3c66d86f6c91e05d72d52baedcb4ff32617c2e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 19 Sep 2022 23:28:51 +0100 Subject: [PATCH 17/90] github/workflows: Make publishing perf reports more configurable (#2440) --- .github/actions/neon-project-create/action.yml | 1 + .github/actions/run-python-test-set/action.yml | 14 +++++--------- .github/workflows/benchmarking.yml | 10 +++++++--- .github/workflows/build_and_test.yml | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index ba81afaaff..2f58ae77ad 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -60,6 +60,7 @@ runs: --header "Authorization: Bearer ${API_KEY}" \ --data "{ \"project\": { + \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", \"platform_id\": \"aws\", \"region_id\": \"${REGION_ID}\", \"settings\": { } diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 4c18641938..e69cb28df1 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -112,10 +112,8 @@ runs: fi if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then - if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then - mkdir -p "$PERF_REPORT_DIR" - EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" - fi + mkdir -p "$PERF_REPORT_DIR" + EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" fi if [[ "${{ inputs.build_type }}" == "debug" ]]; then @@ -150,11 +148,9 @@ runs: -rA $TEST_SELECTION $EXTRA_PARAMS if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then - if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then - export REPORT_FROM="$PERF_REPORT_DIR" - export REPORT_TO="$PLATFORM" - scripts/generate_and_push_perf_report.sh - fi + export REPORT_FROM="$PERF_REPORT_DIR" + export REPORT_TO="$PLATFORM" + scripts/generate_and_push_perf_report.sh fi - name: Create Allure report diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 9a9021ac37..0430f0b9c0 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -19,8 +19,12 @@ on: description: 'Environment to run remote tests on (dev or staging)' required: false region_id: - description: 'Use a particular region. If empty the default one will be used' - false: true + description: 'Use a particular region. If not set the default region will be used' + required: false + save_perf_report: + type: boolean + description: 'Publish perf report or not. 
If not set, the report is published only for the main branch' + required: false defaults: run: @@ -139,7 +143,7 @@ jobs: POSTGRES_DISTRIB_DIR: /usr TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: true + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} strategy: fail-fast: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5bff469582..0b6cb21120 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -292,7 +292,7 @@ jobs: build_type: ${{ matrix.build_type }} test_selection: performance run_in_parallel: false - save_perf_report: true + save_perf_report: ${{ github.ref == 'refs/heads/main' }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" From e4f775436f534e8de49d0cb5a2c955e73ac6f03e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 19 Sep 2022 23:52:21 +0300 Subject: [PATCH 18/90] Don't override other options than statement_timeout in test conn string. In commit 6985f6cd6c, I tried passing extra GUCs in the 'options' part of the connection string, but it didn't work because the pgbench test overrode it with the statement_timeout. Change it so that it adds the statement_timeout to any other options, instead of replacing them. --- test_runner/performance/test_perf_pgbench.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 2a2213b783..d9bf237e49 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -84,9 +84,8 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P if workload_type == PgBenchLoadType.INIT: # Run initialize - init_pgbench( - env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options="-cstatement_timeout=1h")] - ) + options = "-cstatement_timeout=1h " + env.pg.default_options["options"] + init_pgbench(env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options=options)]) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: # Run simple-update workload From 566e816298a201c9150f0c42846949296997d74d Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 20 Sep 2022 09:42:39 +0200 Subject: [PATCH 19/90] Refactor safekeeper timelines handling (#2329) See https://github.com/neondatabase/neon/pull/2329 for details --- Cargo.lock | 2 + libs/utils/src/postgres_backend.rs | 18 +- pageserver/src/lib.rs | 2 +- safekeeper/Cargo.toml | 2 + safekeeper/src/bin/safekeeper.rs | 6 +- safekeeper/src/broker.rs | 42 +- safekeeper/src/control_file.rs | 73 ++- safekeeper/src/handler.rs | 67 +-- safekeeper/src/http/routes.rs | 62 ++- safekeeper/src/json_ctrl.rs | 61 +-- safekeeper/src/lib.rs | 9 +- safekeeper/src/metrics.rs | 19 +- safekeeper/src/receive_wal.rs | 27 +- safekeeper/src/remove_wal.rs | 21 +- safekeeper/src/safekeeper.rs | 141 +++--- safekeeper/src/send_wal.rs | 20 +- safekeeper/src/timeline.rs | 665 ++++++++++++------------- safekeeper/src/timelines_global_map.rs | 348 +++++++++++++ safekeeper/src/wal_backup.rs | 72 +-- safekeeper/src/wal_storage.rs | 249 ++++----- 20 files changed, 1097 insertions(+), 809 deletions(-) create mode 100644 safekeeper/src/timelines_global_map.rs diff --git a/Cargo.lock b/Cargo.lock index ca169dc0c8..2f4a57b698 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2723,6 +2723,7 @@ dependencies = [ 
"hyper", "metrics", "once_cell", + "parking_lot 0.12.1", "postgres", "postgres-protocol", "postgres_ffi", @@ -2733,6 +2734,7 @@ dependencies = [ "serde_with", "signal-hook", "tempfile", + "thiserror", "tokio", "tokio-postgres", "toml_edit", diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 0498e0887b..adee46c2dd 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -429,8 +429,22 @@ impl PostgresBackend { // full cause of the error, not just the top-level context + its trace. // We don't want to send that in the ErrorResponse though, // because it's not relevant to the compute node logs. - error!("query handler for '{}' failed: {:?}", query_string, e); - self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?; + // + // We also don't want to log full stacktrace when the error is primitive, + // such as usual connection closed. + let short_error = format!("{:#}", e); + let root_cause = e.root_cause().to_string(); + if root_cause.contains("connection closed unexpectedly") + || root_cause.contains("Broken pipe (os error 32)") + { + error!( + "query handler for '{}' failed: {}", + query_string, short_error + ); + } else { + error!("query handler for '{}' failed: {:?}", query_string, e); + } + self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?; // TODO: untangle convoluted control flow if e.to_string().contains("failed to run") { return Ok(ProcessMsgResult::Break); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 959942aa12..acd37161a0 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -105,7 +105,7 @@ fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds } /// A newtype to store arbitrary data grouped by tenant and timeline ids. -/// One could use [`utils::zid::TenantTimelineId`] for grouping, but that would +/// One could use [`utils::id::TenantTimelineId`] for grouping, but that would /// not include the cases where a certain tenant has zero timelines. /// This is sometimes important: a tenant could be registered during initial load from FS, /// even if he has no timelines on disk. diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index cae095c3c2..87ee63d1df 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -30,6 +30,8 @@ git-version = "0.3.5" async-trait = "0.1" once_cell = "1.13.0" toml_edit = { version = "0.13", features = ["easy"] } +thiserror = "1" +parking_lot = "0.12.1" postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index d518ac01cc..7726f25a2d 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -24,9 +24,9 @@ use safekeeper::defaults::{ }; use safekeeper::http; use safekeeper::remove_wal; -use safekeeper::timeline::GlobalTimelines; use safekeeper::wal_backup; use safekeeper::wal_service; +use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; use utils::auth::JwtAuth; use utils::{ @@ -298,7 +298,9 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); - GlobalTimelines::init(wal_backup_launcher_tx); + + // Load all timelines from disk to memory. 
+    GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
 
     let conf_ = conf.clone();
     threads.push(
diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs
index f276fad613..6a2456ecda 100644
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -10,6 +10,7 @@ use etcd_broker::LeaseKeeper;
 
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
+use std::collections::HashSet;
 use std::time::Duration;
 use tokio::spawn;
 use tokio::task::JoinHandle;
@@ -17,7 +18,8 @@ use tokio::{runtime, time::sleep};
 use tracing::*;
 use url::Url;
 
-use crate::{timeline::GlobalTimelines, SafeKeeperConf};
+use crate::GlobalTimelines;
+use crate::SafeKeeperConf;
 use etcd_broker::{
     subscription_key::{OperationKind, SkOperationKind, SubscriptionKey},
     Client, PutOptions,
@@ -45,12 +47,12 @@ pub fn thread_main(conf: SafeKeeperConf) {
 /// Key to per timeline per safekeeper data.
 fn timeline_safekeeper_path(
     broker_etcd_prefix: String,
-    zttid: TenantTimelineId,
+    ttid: TenantTimelineId,
     sk_id: NodeId,
 ) -> String {
     format!(
         "{}/{sk_id}",
-        SubscriptionKey::sk_timeline_info(broker_etcd_prefix, zttid).watch_key()
+        SubscriptionKey::sk_timeline_info(broker_etcd_prefix, ttid).watch_key()
     )
 }
 
@@ -162,7 +164,7 @@ pub fn get_candiate_name(system_id: NodeId) -> String {
 }
 
 async fn push_sk_info(
-    zttid: TenantTimelineId,
+    ttid: TenantTimelineId,
     mut client: Client,
     key: String,
    sk_info: SkTimelineInfo,
@@ -190,7 +192,7 @@
         .await
         .context("failed to receive LeaseKeepAliveResponse")?;
 
-    Ok((zttid, lease))
+    Ok((ttid, lease))
 }
 
 struct Lease {
@@ -210,11 +212,15 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
     // is under plain mutex. That's ok, all this code is not performance
     // sensitive and there is no risk of deadlock as we don't await while
     // lock is held.
-    let active_tlis = GlobalTimelines::get_active_timelines();
+    let mut active_tlis = GlobalTimelines::get_all();
+    active_tlis.retain(|tli| tli.is_active());
+
+    let active_tlis_set: HashSet<TenantTimelineId> =
+        active_tlis.iter().map(|tli| tli.ttid).collect();
 
     //
     // Get and maintain (if not yet) per timeline lease to automatically delete obsolete data.
-    for zttid in active_tlis.iter() {
-        if let Entry::Vacant(v) = leases.entry(*zttid) {
+    for tli in &active_tlis {
+        if let Entry::Vacant(v) = leases.entry(tli.ttid) {
             let lease = client.lease_grant(LEASE_TTL_SEC, None).await?;
             let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?;
             v.insert(Lease {
@@ -224,30 +230,26 @@
             });
         }
     }
-    leases.retain(|zttid, _| active_tlis.contains(zttid));
+    leases.retain(|ttid, _| active_tlis_set.contains(ttid));
 
     // Push data concurrently to not suffer from latency, with many timelines it can be slow.
     let handles = active_tlis
         .iter()
-        .filter_map(|zttid| GlobalTimelines::get_loaded(*zttid))
         .map(|tli| {
             let sk_info = tli.get_public_info(&conf);
-            let key = timeline_safekeeper_path(
-                conf.broker_etcd_prefix.clone(),
-                tli.zttid,
-                conf.my_id,
-            );
-            let lease = leases.remove(&tli.zttid).unwrap();
-            tokio::spawn(push_sk_info(tli.zttid, client.clone(), key, sk_info, lease))
+            let key =
+                timeline_safekeeper_path(conf.broker_etcd_prefix.clone(), tli.ttid, conf.my_id);
+            let lease = leases.remove(&tli.ttid).unwrap();
+            tokio::spawn(push_sk_info(tli.ttid, client.clone(), key, sk_info, lease))
         })
         .collect::<Vec<_>>();
 
     for h in handles {
-        let (zttid, lease) = h.await??;
+        let (ttid, lease) = h.await??;
         // It is ugly to pull leases from hash and then put it back, but
         // otherwise we have to resort to long living per tli tasks (which
         // would generate a lot of errors when etcd is down) as task wants to
         // have 'static objects, we can't borrow to it.
-        leases.insert(zttid, lease);
+        leases.insert(ttid, lease);
     }
 
     sleep(push_interval).await;
@@ -279,7 +281,7 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
         match subscription.value_updates.recv().await {
             Some(new_info) => {
                 // note: there are blocking operations below, but it's considered fine for now
-                if let Ok(tli) = GlobalTimelines::get(&conf, new_info.key.id, false) {
+                if let Ok(tli) = GlobalTimelines::get(new_info.key.id) {
                     tli.record_safekeeper_info(&new_info.value, new_info.key.node_id)
                         .await?
                 }
diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index ff23f0360f..22ed34cc00 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -9,8 +9,6 @@ use std::io::{Read, Write};
 use std::ops::Deref;
 use std::path::{Path, PathBuf};
 
-use tracing::*;
-
 use crate::control_file_upgrade::upgrade_control_file;
 use crate::safekeeper::{SafeKeeperState, SK_FORMAT_VERSION, SK_MAGIC};
 use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS};
@@ -55,12 +53,13 @@ pub struct FileStorage {
 }
 
 impl FileStorage {
-    pub fn restore_new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result<FileStorage> {
-        let timeline_dir = conf.timeline_dir(zttid);
-        let tenant_id = zttid.tenant_id.to_string();
-        let timeline_id = zttid.timeline_id.to_string();
+    /// Initialize storage by loading state from disk.
+    pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result<FileStorage> {
+        let timeline_dir = conf.timeline_dir(ttid);
+        let tenant_id = ttid.tenant_id.to_string();
+        let timeline_id = ttid.timeline_id.to_string();
 
-        let state = Self::load_control_file_conf(conf, zttid)?;
+        let state = Self::load_control_file_conf(conf, ttid)?;
 
         Ok(FileStorage {
             timeline_dir,
@@ -71,28 +70,28 @@ impl FileStorage {
         })
     }
 
+    /// Create file storage for a new timeline, but don't persist it yet.
     pub fn create_new(
-        zttid: &TenantTimelineId,
+        ttid: &TenantTimelineId,
         conf: &SafeKeeperConf,
         state: SafeKeeperState,
     ) -> Result<FileStorage> {
-        let timeline_dir = conf.timeline_dir(zttid);
-        let tenant_id = zttid.tenant_id.to_string();
-        let timeline_id = zttid.timeline_id.to_string();
+        let timeline_dir = conf.timeline_dir(ttid);
+        let tenant_id = ttid.tenant_id.to_string();
+        let timeline_id = ttid.timeline_id.to_string();
 
-        let mut store = FileStorage {
+        let store = FileStorage {
             timeline_dir,
             conf: conf.clone(),
             persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS
                 .with_label_values(&[&tenant_id, &timeline_id]),
-            state: state.clone(),
+            state,
         };
-        store.persist(&state)?;
 
         Ok(store)
     }
 
-    // Check the magic/version in the on-disk data and deserialize it, if possible.
+    /// Check the magic/version in the on-disk data and deserialize it, if possible.
     fn deser_sk_state(buf: &mut &[u8]) -> Result<SafeKeeperState> {
         // Read the version independent part
         let magic = buf.read_u32::<LittleEndian>()?;
@@ -112,23 +111,17 @@ impl FileStorage {
         upgrade_control_file(buf, version)
     }
 
-    // Load control file for given zttid at path specified by conf.
+    /// Load control file for given ttid at path specified by conf.
     pub fn load_control_file_conf(
         conf: &SafeKeeperConf,
-        zttid: &TenantTimelineId,
+        ttid: &TenantTimelineId,
     ) -> Result<SafeKeeperState> {
-        let path = conf.timeline_dir(zttid).join(CONTROL_FILE_NAME);
+        let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME);
         Self::load_control_file(path)
     }
 
     /// Read in the control file.
-    /// If create=false and file doesn't exist, bails out.
     pub fn load_control_file<P: AsRef<Path>>(control_file_path: P) -> Result<FileStorage> {
-        info!(
-            "loading control file {}",
-            control_file_path.as_ref().display(),
-        );
-
         let mut control_file = OpenOptions::new()
             .read(true)
             .write(true)
@@ -179,8 +172,8 @@ impl Deref for FileStorage {
 }
 
 impl Storage for FileStorage {
-    // persists state durably to underlying storage
-    // for description see https://lwn.net/Articles/457667/
+    /// persists state durably to underlying storage
+    /// for description see https://lwn.net/Articles/457667/
     fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
         let _timer = &self.persist_control_file_seconds.start_timer();
 
@@ -264,57 +257,57 @@ mod test {
 
     fn load_from_control_file(
         conf: &SafeKeeperConf,
-        zttid: &TenantTimelineId,
+        ttid: &TenantTimelineId,
     ) -> Result<(FileStorage, SafeKeeperState)> {
-        fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir");
+        fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir");
         Ok((
-            FileStorage::restore_new(zttid, conf)?,
-            FileStorage::load_control_file_conf(conf, zttid)?,
+            FileStorage::restore_new(ttid, conf)?,
+            FileStorage::load_control_file_conf(conf, ttid)?,
         ))
     }
 
     fn create(
         conf: &SafeKeeperConf,
-        zttid: &TenantTimelineId,
+        ttid: &TenantTimelineId,
    ) -> Result<(FileStorage, SafeKeeperState)> {
-        fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir");
+        fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir");
         let state = SafeKeeperState::empty();
-        let storage = FileStorage::create_new(zttid, conf, state.clone())?;
+        let storage = FileStorage::create_new(ttid, conf, state.clone())?;
         Ok((storage, state))
     }
 
     #[test]
     fn test_read_write_safekeeper_state() {
         let conf = stub_conf();
-        let zttid = TenantTimelineId::generate();
+        let ttid = TenantTimelineId::generate();
         {
-            let (mut storage, mut state) = create(&conf, &zttid).expect("failed to create state");
+            let (mut storage, mut state) = create(&conf, &ttid).expect("failed to create state");
 
             // change something
             state.commit_lsn = Lsn(42);
             storage.persist(&state).expect("failed to persist state");
         }
 
-        let (_, state) = load_from_control_file(&conf, &zttid).expect("failed to read state");
+        let (_, state) = load_from_control_file(&conf, &ttid).expect("failed to read state");
         assert_eq!(state.commit_lsn, Lsn(42));
     }
 
     #[test]
     fn test_safekeeper_state_checksum_mismatch() {
         let conf = stub_conf();
-        let zttid = TenantTimelineId::generate();
+        let ttid = TenantTimelineId::generate();
         {
-            let (mut storage, mut state) = create(&conf, &zttid).expect("failed to read state");
+            let (mut storage, mut state) = create(&conf, &ttid).expect("failed to read state");
 
             // change something
             state.commit_lsn = Lsn(42);
             storage.persist(&state).expect("failed to persist state");
         }
-        let control_path = conf.timeline_dir(&zttid).join(CONTROL_FILE_NAME);
+        let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME);
         let mut data = fs::read(&control_path).unwrap();
         data[0] += 1; // change the first byte of the file to fail checksum validation
         fs::write(&control_path, &data).expect("failed to write control file");
 
-        match load_from_control_file(&conf, &zttid) {
+        match load_from_control_file(&conf, &ttid) {
             Err(err) => assert!(err
                 .to_string()
                 .contains("safekeeper control file checksum mismatch")),
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index ad2c0ec8bf..ca887399e1 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -3,15 +3,15 @@
 
 use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
 use crate::receive_wal::ReceiveWalConn;
-use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage};
+
 use crate::send_wal::ReplicationConn;
-use crate::timeline::{Timeline, TimelineTools};
-use crate::SafeKeeperConf;
+
+use crate::{GlobalTimelines, SafeKeeperConf};
 use anyhow::{bail, Context, Result};
 
 use postgres_ffi::PG_TLI;
 use regex::Regex;
-use std::sync::Arc;
+
 use tracing::info;
 use utils::{
     id::{TenantId, TenantTimelineId, TimelineId},
@@ -27,7 +27,7 @@ pub struct SafekeeperPostgresHandler {
     pub appname: Option<String>,
     pub tenant_id: Option<TenantId>,
     pub timeline_id: Option<TimelineId>,
-    pub timeline: Option<Arc<Timeline>>,
+    pub ttid: TenantTimelineId,
 }
 
 /// Parsed Postgres command.
@@ -101,30 +101,21 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
             query_string, self.timeline_id
         );
 
-        let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. })
}) - || matches!(cmd, SafekeeperPostgresCommand::IdentifySystem)); - - let tenant_id = self.tenant_id.context("tenant_id is required")?; - let timeline_id = self.timeline_id.context("timeline_id is required")?; - if self.timeline.is_none() { - self.timeline.set( - &self.conf, - TenantTimelineId::new(tenant_id, timeline_id), - create, - )?; - } + let tenant_id = self.tenant_id.context("tenantid is required")?; + let timeline_id = self.timeline_id.context("timelineid is required")?; + self.ttid = TenantTimelineId::new(tenant_id, timeline_id); match cmd { - SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb) - .run(self) - .context("failed to run ReceiveWalConn"), - SafekeeperPostgresCommand::StartReplication { start_lsn } => ReplicationConn::new(pgb) - .run(self, pgb, start_lsn) - .context("failed to run ReplicationConn"), + SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self), + SafekeeperPostgresCommand::StartReplication { start_lsn } => { + ReplicationConn::new(pgb).run(self, pgb, start_lsn) + } SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), } - .context(format!("timeline {timeline_id}"))?; + .context(format!( + "Failed to process query for timeline {timeline_id}" + ))?; Ok(()) } @@ -137,42 +128,26 @@ impl SafekeeperPostgresHandler { appname: None, tenant_id: None, timeline_id: None, - timeline: None, + ttid: TenantTimelineId::empty(), } } - /// Shortcut for calling `process_msg` in the timeline. - pub fn process_safekeeper_msg( - &self, - msg: &ProposerAcceptorMessage, - ) -> Result> { - self.timeline - .get() - .process_msg(msg) - .context("failed to process ProposerAcceptorMessage") - } - /// /// Handle IDENTIFY_SYSTEM replication command /// fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> { + let tli = GlobalTimelines::get(self.ttid)?; + let lsn = if self.is_walproposer_recovery() { // walproposer should get all local WAL until flush_lsn - self.timeline.get().get_end_of_wal() + tli.get_flush_lsn() } else { // other clients shouldn't get any uncommitted WAL - self.timeline.get().get_state().0.commit_lsn + tli.get_state().0.commit_lsn } .to_string(); - let sysid = self - .timeline - .get() - .get_state() - .1 - .server - .system_id - .to_string(); + let sysid = tli.get_state().1.server.system_id.to_string(); let lsn_bytes = lsn.as_bytes(); let tli = PG_TLI.to_string(); let tli_bytes = tli.as_bytes(); diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 14c9414c09..244325368b 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,3 +1,4 @@ +use anyhow::anyhow; use hyper::{Body, Request, Response, StatusCode, Uri}; use once_cell::sync::Lazy; @@ -9,7 +10,9 @@ use std::sync::Arc; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; -use crate::timeline::{GlobalTimelines, TimelineDeleteForceResult}; + +use crate::timelines_global_map::TimelineDeleteForceResult; +use crate::GlobalTimelines; use crate::SafeKeeperConf; use etcd_broker::subscription_value::SkTimelineInfo; use utils::{ @@ -90,15 +93,15 @@ struct TimelineStatus { /// Report info about timeline. 
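The handler change above is the heart of this patch: instead of caching an `Option<Arc<Timeline>>` on the connection handler, each query resolves the timeline from a process-global map keyed by `TenantTimelineId`. A minimal sketch of that lookup pattern, with a simplified id type and a bare `Timeline` placeholder rather than the real safekeeper structs:

    use std::collections::HashMap;
    use std::sync::{Arc, Mutex};

    use once_cell::sync::Lazy;

    // Simplified stand-ins for the real safekeeper types.
    #[derive(Clone, Copy, PartialEq, Eq, Hash)]
    struct TenantTimelineId {
        tenant_id: u128,
        timeline_id: u128,
    }

    struct Timeline;

    static TIMELINES: Lazy<Mutex<HashMap<TenantTimelineId, Arc<Timeline>>>> =
        Lazy::new(|| Mutex::new(HashMap::new()));

    // Resolve a timeline by id; callers keep only a cheap Arc clone,
    // so the global lock is never held across WAL processing.
    fn get(ttid: TenantTimelineId) -> anyhow::Result<Arc<Timeline>> {
        TIMELINES
            .lock()
            .unwrap()
            .get(&ttid)
            .cloned()
            .ok_or_else(|| anyhow::anyhow!("timeline not found"))
    }

    fn main() {
        let ttid = TenantTimelineId { tenant_id: 1, timeline_id: 1 };
        // Nothing registered yet: lookup fails instead of implicitly creating,
        // which is exactly the behavioral change this patch makes.
        assert!(get(ttid).is_err());
    }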
async fn timeline_status_handler(request: Request) -> Result, ApiError> { - let zttid = TenantTimelineId::new( + let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - check_permission(&request, Some(zttid.tenant_id))?; + check_permission(&request, Some(ttid.tenant_id))?; - let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?; + let tli = GlobalTimelines::get(ttid)?; let (inmem, state) = tli.get_state(); - let flush_lsn = tli.get_end_of_wal(); + let flush_lsn = tli.get_flush_lsn(); let acc_state = AcceptorStateStatus { term: state.acceptor_state.term, @@ -108,8 +111,8 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { let request_data: TimelineCreateRequest = json_request(&mut request).await?; - let zttid = TenantTimelineId { + let ttid = TenantTimelineId { tenant_id: parse_request_param(&request, "tenant_id")?, timeline_id: request_data.timeline_id, }; - check_permission(&request, Some(zttid.tenant_id))?; - GlobalTimelines::create(get_conf(&request), zttid, request_data.peer_ids) - .map_err(ApiError::from_err)?; + check_permission(&request, Some(ttid.tenant_id))?; - json_response(StatusCode::CREATED, ()) + Err(ApiError::from_err(anyhow!("not implemented"))) } /// Deactivates the timeline and removes its data directory. -/// -/// It does not try to stop any processing of the timeline; there is no such code at the time of writing. -/// However, it tries to check whether the timeline was active and report it to caller just in case. -/// Note that this information is inaccurate: -/// 1. There is a race condition between checking the timeline for activity and actual directory deletion. -/// 2. At the time of writing Safekeeper rarely marks a timeline inactive. E.g. disconnecting the compute node does nothing. async fn timeline_delete_force_handler( mut request: Request, ) -> Result, ApiError> { - let zttid = TenantTimelineId::new( + let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - check_permission(&request, Some(zttid.tenant_id))?; + check_permission(&request, Some(ttid.tenant_id))?; ensure_no_body(&mut request).await?; - json_response( - StatusCode::OK, - GlobalTimelines::delete_force(get_conf(&request), &zttid) - .await - .map_err(ApiError::from_err)?, - ) + let resp = tokio::task::spawn_blocking(move || GlobalTimelines::delete_force(&ttid)) + .await + .map_err(ApiError::from_err)??; + json_response(StatusCode::OK, resp) } /// Deactivates all timelines for the tenant and removes its data directory. @@ -168,27 +161,30 @@ async fn tenant_delete_force_handler( let tenant_id = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; ensure_no_body(&mut request).await?; + let delete_info = tokio::task::spawn_blocking(move || { + GlobalTimelines::delete_force_all_for_tenant(&tenant_id) + }) + .await + .map_err(ApiError::from_err)??; json_response( StatusCode::OK, - GlobalTimelines::delete_force_all_for_tenant(get_conf(&request), &tenant_id) - .await - .map_err(ApiError::from_err)? + delete_info .iter() - .map(|(zttid, resp)| (format!("{}", zttid.timeline_id), *resp)) + .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp)) .collect::>(), ) } /// Used only in tests to hand craft required data. 
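The delete handlers above wrap the `GlobalTimelines` calls in `tokio::task::spawn_blocking` because those calls take a blocking mutex and touch the filesystem, which must not happen on async executor threads. A hedged sketch of the pattern, assuming a tokio runtime and with `delete_force_blocking` as a placeholder for the real call; the first `?` propagates a `JoinError` (panic or cancellation), the second propagates the operation's own error:

    async fn delete_force_async() -> anyhow::Result<bool> {
        // Placeholder for the synchronous GlobalTimelines::delete_force call.
        fn delete_force_blocking() -> anyhow::Result<bool> {
            Ok(true)
        }

        // Run the blocking work on tokio's dedicated blocking thread pool.
        let dir_existed = tokio::task::spawn_blocking(delete_force_blocking).await??;
        Ok(dir_existed)
    }

    #[tokio::main]
    async fn main() -> anyhow::Result<()> {
        println!("dir existed: {}", delete_force_async().await?);
        Ok(())
    }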
async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { - let zttid = TenantTimelineId::new( + let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - check_permission(&request, Some(zttid.tenant_id))?; + check_permission(&request, Some(ttid.tenant_id))?; let safekeeper_info: SkTimelineInfo = json_request(&mut request).await?; - let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?; + let tli = GlobalTimelines::get(ttid)?; tli.record_safekeeper_info(&safekeeper_info, NodeId(1)) .await?; diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 00fc43521b..2456eb0752 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -6,18 +6,22 @@ //! modifications in tests. //! +use std::sync::Arc; + use anyhow::Result; use bytes::Bytes; use serde::{Deserialize, Serialize}; use tracing::*; +use utils::id::TenantTimelineId; use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{AcceptorProposerMessage, AppendResponse}; +use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; use crate::safekeeper::{ - AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, ProposerGreeting, + AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, }; use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; -use crate::timeline::TimelineTools; +use crate::timeline::Timeline; +use crate::GlobalTimelines; use postgres_ffi::v14::xlog_utils; use postgres_ffi::WAL_SEGMENT_SIZE; use utils::{ @@ -57,23 +61,23 @@ struct AppendResult { /// content, and then append it with specified term and lsn. This /// function is used to test safekeepers in different scenarios. pub fn handle_json_ctrl( - spg: &mut SafekeeperPostgresHandler, + spg: &SafekeeperPostgresHandler, pgb: &mut PostgresBackend, append_request: &AppendLogicalMessage, ) -> Result<()> { info!("JSON_CTRL request: {:?}", append_request); // need to init safekeeper state before AppendRequest - prepare_safekeeper(spg)?; + let tli = prepare_safekeeper(spg.ttid)?; // if send_proposer_elected is true, we need to update local history if append_request.send_proposer_elected { - send_proposer_elected(spg, append_request.term, append_request.epoch_start_lsn)?; + send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn)?; } - let inserted_wal = append_logical_message(spg, append_request)?; + let inserted_wal = append_logical_message(&tli, append_request)?; let response = AppendResult { - state: spg.timeline.get().get_state().1, + state: tli.get_state().1, inserted_wal, }; let response_data = serde_json::to_vec(&response)?; @@ -91,28 +95,20 @@ pub fn handle_json_ctrl( /// Prepare safekeeper to process append requests without crashes, /// by sending ProposerGreeting with default server.wal_seg_size. 
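`send_proposer_elected` below edits the term history: it truncates the persisted history to entries starting strictly before the election LSN and appends the newly elected term at that LSN. A simplified model of that edit, using a plain `Vec` and integer LSNs instead of the real `TermHistory`/`Lsn` types:

    #[derive(Clone, Copy, Debug, PartialEq)]
    struct TermSwitchEntry {
        term: u64,
        lsn: u64,
    }

    fn elected_history(
        mut history: Vec<TermSwitchEntry>,
        term: u64,
        lsn: u64,
    ) -> Vec<TermSwitchEntry> {
        // Keep only entries that started before the new term's start position.
        history.retain(|e| e.lsn < lsn);
        history.push(TermSwitchEntry { term, lsn });
        history
    }

    fn main() {
        let history = vec![
            TermSwitchEntry { term: 1, lsn: 0 },
            TermSwitchEntry { term: 2, lsn: 100 },
        ];
        // Electing term 3 at lsn 100 supersedes the entry for term 2,
        // which also started at lsn 100.
        assert_eq!(
            elected_history(history, 3, 100),
            vec![
                TermSwitchEntry { term: 1, lsn: 0 },
                TermSwitchEntry { term: 3, lsn: 100 }
            ]
        );
    }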
-fn prepare_safekeeper(spg: &mut SafekeeperPostgresHandler) -> Result<()> { - let greeting_request = ProposerAcceptorMessage::Greeting(ProposerGreeting { - protocol_version: 2, // current protocol - pg_version: 0, // unknown - proposer_id: [0u8; 16], - system_id: 0, - timeline_id: spg.timeline_id.unwrap(), - tenant_id: spg.tenant_id.unwrap(), - tli: 0, - wal_seg_size: WAL_SEGMENT_SIZE as u32, // 16MB, default for tests - }); - - let response = spg.timeline.get().process_msg(&greeting_request)?; - match response { - Some(AcceptorProposerMessage::Greeting(_)) => Ok(()), - _ => anyhow::bail!("not GreetingResponse"), - } +fn prepare_safekeeper(ttid: TenantTimelineId) -> Result> { + GlobalTimelines::create( + ttid, + ServerInfo { + pg_version: 0, // unknown + wal_seg_size: WAL_SEGMENT_SIZE as u32, + system_id: 0, + }, + ) } -fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: Lsn) -> Result<()> { +fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> Result<()> { // add new term to existing history - let history = spg.timeline.get().get_state().1.acceptor_state.term_history; + let history = tli.get_state().1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); let mut history_entries = history.0; history_entries.push(TermSwitchEntry { term, lsn }); @@ -125,7 +121,7 @@ fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: L timeline_start_lsn: lsn, }); - spg.timeline.get().process_msg(&proposer_elected_request)?; + tli.process_msg(&proposer_elected_request)?; Ok(()) } @@ -138,12 +134,9 @@ struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. -fn append_logical_message( - spg: &mut SafekeeperPostgresHandler, - msg: &AppendLogicalMessage, -) -> Result { +fn append_logical_message(tli: &Arc, msg: &AppendLogicalMessage) -> Result { let wal_data = xlog_utils::encode_logical_message(&msg.lm_prefix, &msg.lm_message); - let sk_state = spg.timeline.get().get_state().1; + let sk_state = tli.get_state().1; let begin_lsn = msg.begin_lsn; let end_lsn = begin_lsn + wal_data.len() as u64; @@ -167,7 +160,7 @@ fn append_logical_message( wal_data: Bytes::from(wal_data), }); - let response = spg.timeline.get().process_msg(&append_request)?; + let response = tli.process_msg(&append_request)?; let append_response = match response { Some(AcceptorProposerMessage::AppendResponse(resp)) => resp, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index b466d5aab5..58a237a5d3 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -23,6 +23,9 @@ pub mod wal_backup; pub mod wal_service; pub mod wal_storage; +mod timelines_global_map; +pub use timelines_global_map::GlobalTimelines; + pub mod defaults { use const_format::formatcp; use std::time::Duration; @@ -65,9 +68,9 @@ impl SafeKeeperConf { self.workdir.join(tenant_id.to_string()) } - pub fn timeline_dir(&self, zttid: &TenantTimelineId) -> PathBuf { - self.tenant_dir(&zttid.tenant_id) - .join(zttid.timeline_id.to_string()) + pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> PathBuf { + self.tenant_dir(&ttid.tenant_id) + .join(ttid.timeline_id.to_string()) } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 3fa3916266..851a568aec 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -12,11 +12,12 @@ use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ safekeeper::{SafeKeeperState, SafekeeperMemState}, - 
timeline::{GlobalTimelines, ReplicaState}, + timeline::ReplicaState, + GlobalTimelines, }; pub struct FullTimelineInfo { - pub zttid: TenantTimelineId, + pub ttid: TenantTimelineId, pub replicas: Vec, pub wal_backup_active: bool, pub timeline_is_active: bool, @@ -235,11 +236,17 @@ impl Collector for TimelineCollector { self.disk_usage.reset(); self.acceptor_term.reset(); - let timelines = GlobalTimelines::active_timelines_metrics(); + let timelines = GlobalTimelines::get_all(); - for tli in timelines { - let tenant_id = tli.zttid.tenant_id.to_string(); - let timeline_id = tli.zttid.timeline_id.to_string(); + for arc_tli in timelines { + let tli = arc_tli.info_for_metrics(); + if tli.is_none() { + continue; + } + let tli = tli.unwrap(); + + let tenant_id = tli.ttid.tenant_id.to_string(); + let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; let mut most_advanced: Option = None; diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index b0b6a73621..e28caa2f19 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -7,7 +7,9 @@ use anyhow::{anyhow, bail, Result}; use bytes::BytesMut; use tracing::*; +use crate::safekeeper::ServerInfo; use crate::timeline::Timeline; +use crate::GlobalTimelines; use std::net::SocketAddr; use std::sync::mpsc::channel; @@ -20,7 +22,6 @@ use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::handler::SafekeeperPostgresHandler; -use crate::timeline::TimelineTools; use utils::{ postgres_backend::PostgresBackend, pq_proto::{BeMessage, FeMessage}, @@ -67,15 +68,21 @@ impl<'pg> ReceiveWalConn<'pg> { // Receive information about server let next_msg = poll_reader.recv_msg()?; - match next_msg { + let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( "start handshake with wal proposer {} sysid {} timeline {}", self.peer_addr, greeting.system_id, greeting.tli, ); + let server_info = ServerInfo { + pg_version: greeting.pg_version, + system_id: greeting.system_id, + wal_seg_size: greeting.wal_seg_size, + }; + GlobalTimelines::create(spg.ttid, server_info)? } _ => bail!("unexpected message {:?} instead of greeting", next_msg), - } + }; let mut next_msg = Some(next_msg); @@ -88,7 +95,7 @@ impl<'pg> ReceiveWalConn<'pg> { while let Some(ProposerAcceptorMessage::AppendRequest(append_request)) = next_msg { let msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); - let reply = spg.process_safekeeper_msg(&msg)?; + let reply = tli.process_msg(&msg)?; if let Some(reply) = reply { self.write_msg(&reply)?; } @@ -97,13 +104,13 @@ impl<'pg> ReceiveWalConn<'pg> { } // flush all written WAL to the disk - let reply = spg.process_safekeeper_msg(&ProposerAcceptorMessage::FlushWAL)?; + let reply = tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?; if let Some(reply) = reply { self.write_msg(&reply)?; } } else if let Some(msg) = next_msg.take() { // process other message - let reply = spg.process_safekeeper_msg(&msg)?; + let reply = tli.process_msg(&msg)?; if let Some(reply) = reply { self.write_msg(&reply)?; } @@ -112,9 +119,9 @@ impl<'pg> ReceiveWalConn<'pg> { // Register the connection and defer unregister. Do that only // after processing first message, as it sets wal_seg_size, // wanted by many. 
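The registration code just below pairs `on_compute_connect` with a guard whose `Drop` performs the deregistration. A toy version of that RAII pattern; note the patch replaces the old `unwrap()` in `Drop` with logging, since panicking in a destructor that may run during unwinding aborts the process:

    use std::sync::Arc;

    struct Timeline;

    impl Timeline {
        fn on_compute_connect(&self) -> anyhow::Result<()> {
            Ok(())
        }
        fn on_compute_disconnect(&self) -> anyhow::Result<()> {
            Ok(())
        }
    }

    struct ComputeConnectionGuard {
        timeline: Arc<Timeline>,
    }

    impl ComputeConnectionGuard {
        fn new(timeline: Arc<Timeline>) -> anyhow::Result<Self> {
            timeline.on_compute_connect()?;
            Ok(Self { timeline })
        }
    }

    impl Drop for ComputeConnectionGuard {
        fn drop(&mut self) {
            // Never panic in Drop; log the failure instead.
            if let Err(e) = self.timeline.on_compute_disconnect() {
                eprintln!("failed to unregister compute connection: {e}");
            }
        }
    }

    fn main() -> anyhow::Result<()> {
        let tli = Arc::new(Timeline);
        let _guard = ComputeConnectionGuard::new(Arc::clone(&tli))?;
        // Connection registered; deregistered when _guard leaves scope.
        Ok(())
    }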
- spg.timeline.get().on_compute_connect()?; + tli.on_compute_connect()?; _guard = Some(ComputeConnectionGuard { - timeline: Arc::clone(spg.timeline.get()), + timeline: Arc::clone(&tli), }); first_time_through = false; } @@ -190,6 +197,8 @@ struct ComputeConnectionGuard { impl Drop for ComputeConnectionGuard { fn drop(&mut self) { - self.timeline.on_compute_disconnect().unwrap(); + if let Err(e) = self.timeline.on_compute_disconnect() { + error!("failed to unregister compute connection: {}", e); + } } } diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 004c0243f9..b6d497f34e 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -4,20 +4,21 @@ use std::{thread, time::Duration}; use tracing::*; -use crate::{timeline::GlobalTimelines, SafeKeeperConf}; +use crate::{GlobalTimelines, SafeKeeperConf}; pub fn thread_main(conf: SafeKeeperConf) { let wal_removal_interval = Duration::from_millis(5000); loop { - let active_tlis = GlobalTimelines::get_active_timelines(); - for zttid in &active_tlis { - if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { - if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { - warn!( - "failed to remove WAL for tenant {} timeline {}: {}", - tli.zttid.tenant_id, tli.zttid.timeline_id, e - ); - } + let tlis = GlobalTimelines::get_all(); + for tli in &tlis { + if !tli.is_active() { + continue; + } + let ttid = tli.ttid; + let _enter = + info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id).entered(); + if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { + warn!("failed to remove WAL: {}", e); } } thread::sleep(wal_removal_interval) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index fa045eed90..d34a77e02b 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -218,19 +218,19 @@ pub struct SafekeeperMemState { } impl SafeKeeperState { - pub fn new(zttid: &TenantTimelineId, peers: Vec) -> SafeKeeperState { + pub fn new( + ttid: &TenantTimelineId, + server_info: ServerInfo, + peers: Vec, + ) -> SafeKeeperState { SafeKeeperState { - tenant_id: zttid.tenant_id, - timeline_id: zttid.timeline_id, + tenant_id: ttid.tenant_id, + timeline_id: ttid.timeline_id, acceptor_state: AcceptorState { term: 0, term_history: TermHistory::empty(), }, - server: ServerInfo { - pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ - system_id: 0, /* Postgres system identifier */ - wal_seg_size: 0, - }, + server: server_info, proposer_uuid: [0; 16], timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), @@ -244,7 +244,15 @@ impl SafeKeeperState { #[cfg(test)] pub fn empty() -> Self { - SafeKeeperState::new(&TenantTimelineId::empty(), vec![]) + SafeKeeperState::new( + &TenantTimelineId::empty(), + ServerInfo { + pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ + system_id: 0, /* Postgres system identifier */ + wal_seg_size: 0, + }, + vec![], + ) } } @@ -479,8 +487,12 @@ impl AcceptorProposerMessage { } } -/// SafeKeeper which consumes events (messages from compute) and provides -/// replies. +/// Safekeeper implements consensus to reliably persist WAL across nodes. +/// It controls all WAL disk writes and updates of control file. +/// +/// Currently safekeeper processes: +/// - messages from compute (proposers) and provides replies +/// - messages from broker peers pub struct SafeKeeper { /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. 
/// Note: be careful to set only if we are sure our WAL (term history) matches @@ -503,20 +515,20 @@ where CTRL: control_file::Storage, WAL: wal_storage::Storage, { - // constructor - pub fn new( - timeline_id: TimelineId, - state: CTRL, - mut wal_store: WAL, - node_id: NodeId, - ) -> Result> { - if state.timeline_id != TimelineId::from([0u8; 16]) && timeline_id != state.timeline_id { - bail!("Calling SafeKeeper::new with inconsistent timeline_id ({}) and SafeKeeperState.server.timeline_id ({})", timeline_id, state.timeline_id); + /// Accepts a control file storage containing the safekeeper state. + /// State must be initialized, i.e. contain filled `tenant_id`, `timeline_id` + /// and `server` (`wal_seg_size` inside it) fields. + pub fn new(state: CTRL, wal_store: WAL, node_id: NodeId) -> Result> { + if state.tenant_id == TenantId::from([0u8; 16]) + || state.timeline_id == TimelineId::from([0u8; 16]) + { + bail!( + "Calling SafeKeeper::new with empty tenant_id ({}) or timeline_id ({})", + state.tenant_id, + state.timeline_id + ); } - // initialize wal_store, if state is already initialized - wal_store.init_storage(&state)?; - Ok(SafeKeeper { global_commit_lsn: state.commit_lsn, epoch_start_lsn: Lsn(0), @@ -574,7 +586,7 @@ where &mut self, msg: &ProposerGreeting, ) -> Result> { - /* Check protocol compatibility */ + // Check protocol compatibility if msg.protocol_version != SK_PROTOCOL_VERSION { bail!( "incompatible protocol version {}, expected {}", @@ -582,11 +594,11 @@ where SK_PROTOCOL_VERSION ); } - /* Postgres upgrade is not treated as fatal error */ + // Postgres upgrade is not treated as fatal error if msg.pg_version != self.state.server.pg_version && self.state.server.pg_version != UNKNOWN_SERVER_VERSION { - info!( + warn!( "incompatible server version {}, expected {}", msg.pg_version, self.state.server.pg_version ); @@ -605,17 +617,25 @@ where self.state.timeline_id ); } - - // set basic info about server, if not yet - // TODO: verify that is doesn't change after - { - let mut state = self.state.clone(); - state.server.system_id = msg.system_id; - state.server.wal_seg_size = msg.wal_seg_size; - self.state.persist(&state)?; + if self.state.server.wal_seg_size != msg.wal_seg_size { + bail!( + "invalid wal_seg_size, got {}, expected {}", + msg.wal_seg_size, + self.state.server.wal_seg_size + ); } - self.wal_store.init_storage(&self.state)?; + // system_id will be updated on mismatch + if self.state.server.system_id != msg.system_id { + warn!( + "unexpected system ID arrived, got {}, expected {}", + msg.system_id, self.state.server.system_id + ); + + let mut state = self.state.clone(); + state.server.system_id = msg.system_id; + self.state.persist(&state)?; + } info!( "processed greeting from proposer {:?}, sending term {:?}", @@ -665,16 +685,6 @@ where Ok(Some(AcceptorProposerMessage::VoteResponse(resp))) } - /// Bump our term if received a note from elected proposer with higher one - fn bump_if_higher(&mut self, term: Term) -> Result<()> { - if self.state.acceptor_state.term < term { - let mut state = self.state.clone(); - state.acceptor_state.term = term; - self.state.persist(&state)?; - } - Ok(()) - } - /// Form AppendResponse from current state. 
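The reworked `handle_greeting` above settles on a clear policy: a `wal_seg_size` mismatch is fatal (it is fixed at timeline creation now), a Postgres version mismatch is tolerated with a warning, and a changed `system_id` is accepted and persisted. A condensed restatement with toy types, not the real message structs, and with the `UNKNOWN_SERVER_VERSION` special case omitted:

    struct ServerInfo {
        pg_version: u32,
        system_id: u64,
        wal_seg_size: u32,
    }

    // Returns Ok(true) when the caller must rewrite the control file.
    fn check_greeting(state: &mut ServerInfo, msg: &ServerInfo) -> anyhow::Result<bool> {
        if msg.wal_seg_size != state.wal_seg_size {
            anyhow::bail!(
                "invalid wal_seg_size, got {}, expected {}",
                msg.wal_seg_size,
                state.wal_seg_size
            );
        }
        if msg.pg_version != state.pg_version {
            eprintln!("incompatible server version, continuing anyway");
        }
        if msg.system_id != state.system_id {
            state.system_id = msg.system_id;
            return Ok(true);
        }
        Ok(false)
    }

    fn main() {
        let mut state = ServerInfo { pg_version: 140005, system_id: 1, wal_seg_size: 16 * 1024 * 1024 };
        let msg = ServerInfo { pg_version: 140006, system_id: 2, wal_seg_size: 16 * 1024 * 1024 };
        assert!(check_greeting(&mut state, &msg).unwrap());
        assert_eq!(state.system_id, 2); // system_id follows the proposer
    }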
fn append_response(&self) -> AppendResponse { let ar = AppendResponse { @@ -691,7 +701,12 @@ where fn handle_elected(&mut self, msg: &ProposerElected) -> Result> { info!("received ProposerElected {:?}", msg); - self.bump_if_higher(msg.term)?; + if self.state.acceptor_state.term < msg.term { + let mut state = self.state.clone(); + state.acceptor_state.term = msg.term; + self.state.persist(&state)?; + } + // If our term is higher, ignore the message (next feedback will inform the compute) if self.state.acceptor_state.term > msg.term { return Ok(None); @@ -748,7 +763,7 @@ where } /// Advance commit_lsn taking into account what we have locally - pub fn update_commit_lsn(&mut self) -> Result<()> { + fn update_commit_lsn(&mut self) -> Result<()> { let commit_lsn = min(self.global_commit_lsn, self.flush_lsn()); assert!(commit_lsn >= self.inmem.commit_lsn); @@ -768,6 +783,11 @@ where Ok(()) } + /// Persist control file to disk, called only after timeline creation (bootstrap). + pub fn persist(&mut self) -> Result<()> { + self.persist_control_file(self.state.clone()) + } + /// Persist in-memory state to the disk, taking other data from state. fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { state.commit_lsn = self.inmem.commit_lsn; @@ -918,6 +938,8 @@ where #[cfg(test)] mod tests { + use postgres_ffi::WAL_SEGMENT_SIZE; + use super::*; use crate::wal_storage::Storage; use std::ops::Deref; @@ -942,6 +964,14 @@ mod tests { } } + fn test_sk_state() -> SafeKeeperState { + let mut state = SafeKeeperState::empty(); + state.server.wal_seg_size = WAL_SEGMENT_SIZE as u32; + state.tenant_id = TenantId::from([1u8; 16]); + state.timeline_id = TimelineId::from([1u8; 16]); + state + } + struct DummyWalStore { lsn: Lsn, } @@ -951,10 +981,6 @@ mod tests { self.lsn } - fn init_storage(&mut self, _state: &SafeKeeperState) -> Result<()> { - Ok(()) - } - fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { self.lsn = startpos + buf.len() as u64; Ok(()) @@ -977,12 +1003,10 @@ mod tests { #[test] fn test_voting() { let storage = InMemoryState { - persisted_state: SafeKeeperState::empty(), + persisted_state: test_sk_state(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let timeline_id = TimelineId::from([0u8; 16]); - - let mut sk = SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -998,7 +1022,7 @@ mod tests { persisted_state: state, }; - sk = SafeKeeper::new(timeline_id, storage, sk.wal_store, NodeId(0)).unwrap(); + sk = SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -1011,12 +1035,11 @@ mod tests { #[test] fn test_epoch_switch() { let storage = InMemoryState { - persisted_state: SafeKeeperState::empty(), + persisted_state: test_sk_state(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let timeline_id = TimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 375b6eea18..5a38558e9c 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -2,8 +2,9 @@ //! with the "START_REPLICATION" message. 
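The term bump inlined into `handle_elected` above preserves the usual consensus safety rule: the acceptor must durably record the higher term before acting on the election, or a crash between the two steps could let it accept conflicting proposals. A minimal sketch of that ordering, where `persist` stands in for the control-file write:

    struct AcceptorState {
        term: u64,
    }

    fn persist(state: &AcceptorState) -> anyhow::Result<()> {
        // The real code writes and fsyncs the control file here.
        let _ = state.term;
        Ok(())
    }

    fn handle_elected(state: &mut AcceptorState, msg_term: u64) -> anyhow::Result<()> {
        if state.term < msg_term {
            // Bump and persist durably *before* acknowledging the proposer.
            state.term = msg_term;
            persist(state)?;
        }
        if state.term > msg_term {
            // Our term is higher: ignore; the next feedback informs the compute.
            return Ok(());
        }
        // ... proceed with history truncation and WAL handling ...
        Ok(())
    }

    fn main() -> anyhow::Result<()> {
        let mut state = AcceptorState { term: 1 };
        handle_elected(&mut state, 3)?;
        assert_eq!(state.term, 3);
        Ok(())
    }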
use crate::handler::SafekeeperPostgresHandler; -use crate::timeline::{ReplicaState, Timeline, TimelineTools}; +use crate::timeline::{ReplicaState, Timeline}; use crate::wal_storage::WalReader; +use crate::GlobalTimelines; use anyhow::{bail, Context, Result}; use bytes::Bytes; @@ -167,8 +168,10 @@ impl ReplicationConn { ) -> Result<()> { let _enter = info_span!("WAL sender", timeline = %spg.timeline_id.unwrap()).entered(); + let tli = GlobalTimelines::get(spg.ttid)?; + // spawn the background thread which receives HotStandbyFeedback messages. - let bg_timeline = Arc::clone(spg.timeline.get()); + let bg_timeline = Arc::clone(&tli); let bg_stream_in = self.stream_in.take().unwrap(); let bg_timeline_id = spg.timeline_id.unwrap(); @@ -201,11 +204,8 @@ impl ReplicationConn { .build()?; runtime.block_on(async move { - let (inmem_state, persisted_state) = spg.timeline.get().get_state(); + let (inmem_state, persisted_state) = tli.get_state(); // add persisted_state.timeline_start_lsn == Lsn(0) check - if persisted_state.server.wal_seg_size == 0 { - bail!("Cannot start replication before connecting to walproposer"); - } // Walproposer gets special handling: safekeeper must give proposer all // local WAL till the end, whether committed or not (walproposer will @@ -217,7 +217,7 @@ impl ReplicationConn { // on this safekeeper itself. That's ok as (old) proposer will never be // able to commit such WAL. let stop_pos: Option = if spg.is_walproposer_recovery() { - let wal_end = spg.timeline.get().get_end_of_wal(); + let wal_end = tli.get_flush_lsn(); Some(wal_end) } else { None @@ -231,7 +231,7 @@ impl ReplicationConn { let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn); let mut wal_reader = WalReader::new( - spg.conf.timeline_dir(&spg.timeline.get().zttid), + spg.conf.timeline_dir(&tli.ttid), &persisted_state, start_pos, spg.conf.wal_backup_enabled, @@ -241,7 +241,7 @@ impl ReplicationConn { let mut send_buf = vec![0u8; MAX_SEND_SIZE]; // watcher for commit_lsn updates - let mut commit_lsn_watch_rx = spg.timeline.get().get_commit_lsn_watch_rx(); + let mut commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx(); loop { if let Some(stop_pos) = stop_pos { @@ -258,7 +258,7 @@ impl ReplicationConn { } else { // TODO: also check once in a while whether we are walsender // to right pageserver. - if spg.timeline.get().stop_walsender(replica_id)? { + if tli.should_walsender_stop(replica_id) { // Shut down, timeline is suspended. // TODO create proper error type for this bail!("end streaming to {:?}", spg.appname); diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index cf317c41c3..4000815857 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -1,27 +1,25 @@ -//! This module contains timeline id -> safekeeper state map with file-backed -//! persistence and support for interaction between sending and receiving wal. +//! This module implements Timeline lifecycle management and has all neccessary code +//! to glue together SafeKeeper and all other background services. 
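Both sides of this patch lean on `tokio::sync::watch` for `commit_lsn`: the timeline owns the sender and every WAL sender clones a receiver and sleeps until the value moves past what it has already shipped. A self-contained sketch of that fan-out, assuming a tokio runtime and using `u64` in place of `Lsn`:

    use tokio::sync::watch;

    #[tokio::main]
    async fn main() -> anyhow::Result<()> {
        let (commit_lsn_tx, mut commit_lsn_rx) = watch::channel(0u64);

        let sender = tokio::spawn(async move {
            let mut sent_up_to = 0u64;
            while sent_up_to < 300 {
                // Sleep until commit_lsn advances past what we've sent.
                while *commit_lsn_rx.borrow() <= sent_up_to {
                    if commit_lsn_rx.changed().await.is_err() {
                        return; // timeline dropped
                    }
                }
                sent_up_to = *commit_lsn_rx.borrow();
                println!("streamed WAL up to {sent_up_to}");
            }
        });

        for lsn in [100u64, 200, 300] {
            commit_lsn_tx.send(lsn)?; // what Timeline::process_msg does on commit
        }
        sender.await?;
        Ok(())
    }

Because `watch` keeps only the latest value, a slow sender wakes up to the newest `commit_lsn` rather than a backlog of intermediate updates.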
-use anyhow::{bail, Context, Result}; +use anyhow::{bail, Result}; use etcd_broker::subscription_value::SkTimelineInfo; -use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; -use serde::Serialize; use tokio::sync::watch; use std::cmp::{max, min}; -use std::collections::{HashMap, HashSet}; -use std::fs::{self}; -use std::sync::{Arc, Mutex, MutexGuard}; +use parking_lot::{Mutex, MutexGuard}; + +use std::path::PathBuf; use tokio::sync::mpsc::Sender; use tracing::*; use utils::{ - id::{NodeId, TenantId, TenantTimelineId}, + id::{NodeId, TenantTimelineId}, lsn::Lsn, pq_proto::ReplicationFeedback, }; @@ -29,7 +27,7 @@ use utils::{ use crate::control_file; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, - SafekeeperMemState, + SafekeeperMemState, ServerInfo, }; use crate::send_wal::HotStandbyFeedback; @@ -73,7 +71,7 @@ impl ReplicaState { } /// Shared state associated with database instance -struct SharedState { +pub struct SharedState { /// Safekeeper object sk: SafeKeeper, /// State of replicas @@ -95,17 +93,21 @@ struct SharedState { } impl SharedState { - /// Initialize timeline state, creating control file - fn create( + /// Initialize fresh timeline state without persisting anything to disk. + fn create_new( conf: &SafeKeeperConf, - zttid: &TenantTimelineId, - peer_ids: Vec, + ttid: &TenantTimelineId, + state: SafeKeeperState, ) -> Result { - let state = SafeKeeperState::new(zttid, peer_ids); - let control_store = control_file::FileStorage::create_new(zttid, conf, state)?; + if state.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(*ttid)); + } - let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); - let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?; + // We don't want to write anything to disk, because we may have existing timeline there. + // These functions should not change anything on disk. + let control_store = control_file::FileStorage::create_new(ttid, conf, state)?; + let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?; + let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; Ok(Self { sk, @@ -117,16 +119,17 @@ impl SharedState { }) } - /// Restore SharedState from control file. - /// If file doesn't exist, bails out. - fn restore(conf: &SafeKeeperConf, zttid: &TenantTimelineId) -> Result { - let control_store = control_file::FileStorage::restore_new(zttid, conf)?; - let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); + /// Restore SharedState from control file. If file doesn't exist, bails out. 
+ fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { + let control_store = control_file::FileStorage::restore_new(ttid, conf)?; + if control_store.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(*ttid)); + } - info!("timeline {} restored", zttid.timeline_id); + let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?; Ok(Self { - sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?, + sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, replicas: Vec::new(), wal_backup_active: false, active: false, @@ -134,6 +137,7 @@ impl SharedState { last_removed_segno: 0, }) } + fn is_active(&self) -> bool { self.is_wal_backup_required() // FIXME: add tracking of relevant pageservers and check them here individually, @@ -254,148 +258,289 @@ impl SharedState { } } -/// Database instance (tenant) +#[derive(Debug, thiserror::Error)] +pub enum TimelineError { + #[error("Timeline {0} was cancelled and cannot be used anymore")] + Cancelled(TenantTimelineId), + #[error("Timeline {0} was not found in global map")] + NotFound(TenantTimelineId), + #[error("Timeline {0} exists on disk, but wasn't loaded on startup")] + Invalid(TenantTimelineId), + #[error("Timeline {0} is already exists")] + AlreadyExists(TenantTimelineId), + #[error("Timeline {0} is not initialized, wal_seg_size is zero")] + UninitializedWalSegSize(TenantTimelineId), +} + +/// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline. +/// It also holds SharedState and provides mutually exclusive access to it. pub struct Timeline { - pub zttid: TenantTimelineId, + pub ttid: TenantTimelineId, + /// Sending here asks for wal backup launcher attention (start/stop - /// offloading). Sending zttid instead of concrete command allows to do + /// offloading). Sending ttid instead of concrete command allows to do /// sending without timeline lock. wal_backup_launcher_tx: Sender, + + /// Used to broadcast commit_lsn updates to all background jobs. commit_lsn_watch_tx: watch::Sender, - /// For breeding receivers. commit_lsn_watch_rx: watch::Receiver, + + /// Safekeeper and other state, that should remain consistent and synchronized + /// with the disk. mutex: Mutex, + + /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal. + cancellation_tx: watch::Sender, + + /// Timeline should not be used after cancellation. Background tasks should + /// monitor this channel and stop eventually after receiving `true` from this channel. + cancellation_rx: watch::Receiver, + + /// Directory where timeline state is stored. + timeline_dir: PathBuf, } impl Timeline { - fn new( - zttid: TenantTimelineId, + /// Load existing timeline from disk. + pub fn load_timeline( + conf: SafeKeeperConf, + ttid: TenantTimelineId, wal_backup_launcher_tx: Sender, - shared_state: SharedState, - ) -> Timeline { + ) -> Result { + let shared_state = SharedState::restore(&conf, &ttid)?; let (commit_lsn_watch_tx, commit_lsn_watch_rx) = - watch::channel(shared_state.sk.inmem.commit_lsn); - Timeline { - zttid, + watch::channel(shared_state.sk.state.commit_lsn); + let (cancellation_tx, cancellation_rx) = watch::channel(false); + + Ok(Timeline { + ttid, wal_backup_launcher_tx, commit_lsn_watch_tx, commit_lsn_watch_rx, mutex: Mutex::new(shared_state), + cancellation_rx, + cancellation_tx, + timeline_dir: conf.timeline_dir(&ttid), + }) + } + + /// Create a new timeline, which is not yet persisted to disk. 
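The new `cancellation_tx`/`cancellation_rx` pair in the `Timeline` struct above is a one-bit broadcast: `cancel` flips it to `true` once, and background tasks either poll it cheaply or await the change. A minimal sketch of both uses, assuming a tokio runtime:

    use tokio::sync::watch;

    struct CancellableTask {
        cancellation_rx: watch::Receiver<bool>,
    }

    impl CancellableTask {
        fn is_cancelled(&self) -> bool {
            *self.cancellation_rx.borrow()
        }

        async fn run(mut self) {
            loop {
                if self.is_cancelled() {
                    println!("task observed cancellation, exiting");
                    return;
                }
                // Wake up on the next state change instead of busy-looping.
                if self.cancellation_rx.changed().await.is_err() {
                    return; // sender gone, treat as cancellation
                }
            }
        }
    }

    #[tokio::main]
    async fn main() {
        let (cancellation_tx, cancellation_rx) = watch::channel(false);
        let task = tokio::spawn(CancellableTask { cancellation_rx }.run());
        let _ = cancellation_tx.send(true); // what Timeline::cancel does
        task.await.unwrap();
    }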
+ pub fn create_empty( + conf: SafeKeeperConf, + ttid: TenantTimelineId, + wal_backup_launcher_tx: Sender, + server_info: ServerInfo, + ) -> Result { + let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID); + let (cancellation_tx, cancellation_rx) = watch::channel(false); + let state = SafeKeeperState::new(&ttid, server_info, vec![]); + + Ok(Timeline { + ttid, + wal_backup_launcher_tx, + commit_lsn_watch_tx, + commit_lsn_watch_rx, + mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?), + cancellation_rx, + cancellation_tx, + timeline_dir: conf.timeline_dir(&ttid), + }) + } + + /// Initialize fresh timeline on disk and start background tasks. If bootstrap + /// fails, timeline is cancelled and cannot be used anymore. + /// + /// Bootstrap is transactional, so if it fails, created files will be deleted, + /// and state on disk should remain unchanged. + pub fn bootstrap(&self, shared_state: &mut MutexGuard) -> Result<()> { + match std::fs::metadata(&self.timeline_dir) { + Ok(_) => { + // Timeline directory exists on disk, we should leave state unchanged + // and return error. + bail!(TimelineError::Invalid(self.ttid)); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => { + return Err(e.into()); + } } + + // Create timeline directory. + std::fs::create_dir_all(&self.timeline_dir)?; + + // Write timeline to disk and TODO: start background tasks. + match || -> Result<()> { + shared_state.sk.persist()?; + // TODO: add more initialization steps here + Ok(()) + }() { + Ok(_) => Ok(()), + Err(e) => { + // Bootstrap failed, cancel timeline and remove timeline directory. + self.cancel(); + + if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) { + warn!( + "failed to remove timeline {} directory after bootstrap failure: {}", + self.ttid, fs_err + ); + } + + Err(e) + } + } + } + + /// Delete timeline from disk completely, by removing timeline directory. Background + /// timeline activities will stop eventually. + pub fn delete_from_disk( + &self, + shared_state: &mut MutexGuard, + ) -> Result<(bool, bool)> { + let was_active = shared_state.active; + self.cancel(); + let dir_existed = delete_dir(&self.timeline_dir)?; + Ok((dir_existed, was_active)) + } + + /// Cancel timeline to prevent further usage. Background tasks will stop + /// eventually after receiving cancellation signal. + fn cancel(&self) { + info!("Timeline {} is cancelled", self.ttid); + let _ = self.cancellation_tx.send(true); + let res = self.wal_backup_launcher_tx.blocking_send(self.ttid); + if let Err(e) = res { + error!("Failed to send stop signal to wal_backup_launcher: {}", e); + } + } + + /// Returns if timeline is cancelled. + pub fn is_cancelled(&self) -> bool { + *self.cancellation_rx.borrow() + } + + /// Take a writing mutual exclusive lock on timeline shared_state. + pub fn write_shared_state(&self) -> MutexGuard { + self.mutex.lock() } /// Register compute connection, starting timeline-related activity if it is /// not running yet. - /// Can fail only if channel to a static thread got closed, which is not normal at all. 
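The bootstrap above is deliberately transactional: refuse an existing directory (it may hold a corrupted timeline worth inspecting), create a fresh one, and roll it back if initialization fails, so a failed create leaves the disk untouched. A schematic version with an illustrative init closure, not the real persistence code:

    use std::path::Path;

    fn bootstrap(dir: &Path, init: impl FnOnce() -> anyhow::Result<()>) -> anyhow::Result<()> {
        // Refuse to touch an existing directory: it may hold a real
        // (possibly corrupted) timeline that deserves manual inspection.
        match std::fs::metadata(dir) {
            Ok(_) => anyhow::bail!("timeline directory {} already exists", dir.display()),
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
            Err(e) => return Err(e.into()),
        }

        std::fs::create_dir_all(dir)?;

        match init() {
            Ok(()) => Ok(()),
            Err(e) => {
                // Best-effort rollback; the original error is what we report.
                if let Err(fs_err) = std::fs::remove_dir_all(dir) {
                    eprintln!("rollback of {} failed: {fs_err}", dir.display());
                }
                Err(e)
            }
        }
    }

    fn main() -> anyhow::Result<()> {
        let dir = std::env::temp_dir().join("tli-bootstrap-demo");
        let _ = std::fs::remove_dir_all(&dir); // clean slate for the demo
        bootstrap(&dir, || Ok(()))?;
        assert!(dir.exists());
        Ok(())
    }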
pub fn on_compute_connect(&self) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let is_wal_backup_action_pending: bool; { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); shared_state.num_computes += 1; - is_wal_backup_action_pending = shared_state.update_status(self.zttid); + is_wal_backup_action_pending = shared_state.update_status(self.ttid); } // Wake up wal backup launcher, if offloading not started yet. if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.blocking_send(self.zttid)?; + // Can fail only if channel to a static thread got closed, which is not normal at all. + self.wal_backup_launcher_tx.blocking_send(self.ttid)?; } Ok(()) } /// De-register compute connection, shutting down timeline activity if /// pageserver doesn't need catchup. - /// Can fail only if channel to a static thread got closed, which is not normal at all. pub fn on_compute_disconnect(&self) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let is_wal_backup_action_pending: bool; { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); shared_state.num_computes -= 1; - is_wal_backup_action_pending = shared_state.update_status(self.zttid); + is_wal_backup_action_pending = shared_state.update_status(self.ttid); } // Wake up wal backup launcher, if it is time to stop the offloading. if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.blocking_send(self.zttid)?; + // Can fail only if channel to a static thread got closed, which is not normal at all. + self.wal_backup_launcher_tx.blocking_send(self.ttid)?; } Ok(()) } - /// Whether we still need this walsender running? + /// Returns true if walsender should stop sending WAL to pageserver. /// TODO: check this pageserver is actually interested in this timeline. - pub fn stop_walsender(&self, replica_id: usize) -> Result { - let mut shared_state = self.mutex.lock().unwrap(); + pub fn should_walsender_stop(&self, replica_id: usize) -> bool { + if self.is_cancelled() { + return true; + } + + let mut shared_state = self.write_shared_state(); if shared_state.num_computes == 0 { let replica_state = shared_state.replicas[replica_id].unwrap(); let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); if stop { - shared_state.update_status(self.zttid); - return Ok(true); + shared_state.update_status(self.ttid); + return true; } } - Ok(false) + false } /// Returns whether s3 offloading is required and sets current status as /// matching it. pub fn wal_backup_attend(&self) -> bool { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.wal_backup_attend() - } - - // Can this safekeeper offload to s3? Recently joined safekeepers might not - // have necessary WAL. - pub fn can_wal_backup(&self) -> bool { - self.mutex.lock().unwrap().can_wal_backup() - } - - /// Deactivates the timeline, assuming it is being deleted. - /// Returns whether the timeline was already active. - /// - /// We assume all threads will stop by themselves eventually (possibly with errors, but no panics). - /// There should be no compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but - /// we're deleting the timeline anyway. 
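The connect/disconnect pair above is essentially reference counting with a side effect: the count of live compute connections feeds an active flag, and the WAL backup launcher is poked over a channel only when that flag flips. A toy restatement (the real `update_status` also consults backup state) using a std `mpsc` channel:

    use std::sync::mpsc::Sender;
    use std::sync::Mutex;

    struct SharedState {
        num_computes: u32,
        active: bool,
    }

    struct Timeline {
        id: u64,
        mutex: Mutex<SharedState>,
        launcher_tx: Sender<u64>,
    }

    impl Timeline {
        fn update_computes(&self, connect: bool) -> anyhow::Result<()> {
            let flipped = {
                let mut s = self.mutex.lock().unwrap();
                if connect { s.num_computes += 1 } else { s.num_computes -= 1 }
                let was_active = s.active;
                s.active = s.num_computes > 0;
                s.active != was_active
            };
            if flipped {
                // Wake the launcher only on status changes.
                self.launcher_tx
                    .send(self.id)
                    .map_err(|_| anyhow::anyhow!("launcher is gone"))?;
            }
            Ok(())
        }

        fn on_compute_connect(&self) -> anyhow::Result<()> {
            self.update_computes(true)
        }
        fn on_compute_disconnect(&self) -> anyhow::Result<()> {
            self.update_computes(false)
        }
    }

    fn main() -> anyhow::Result<()> {
        let (launcher_tx, launcher_rx) = std::sync::mpsc::channel();
        let tli = Timeline {
            id: 7,
            mutex: Mutex::new(SharedState { num_computes: 0, active: false }),
            launcher_tx,
        };
        tli.on_compute_connect()?; // 0 -> 1 flips the status, launcher is poked
        tli.on_compute_connect()?; // 1 -> 2 does not
        assert_eq!(launcher_rx.try_iter().count(), 1);
        Ok(())
    }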
- pub async fn deactivate_for_delete(&self) -> Result { - let was_active: bool; - { - let shared_state = self.mutex.lock().unwrap(); - was_active = shared_state.active; + if self.is_cancelled() { + return false; } - self.wal_backup_launcher_tx.send(self.zttid).await?; - Ok(was_active) + + self.write_shared_state().wal_backup_attend() } - fn is_active(&self) -> bool { - let shared_state = self.mutex.lock().unwrap(); - shared_state.active + /// Can this safekeeper offload to s3? Recently joined safekeepers might not + /// have necessary WAL. + pub fn can_wal_backup(&self) -> bool { + if self.is_cancelled() { + return false; + } + + let shared_state = self.write_shared_state(); + shared_state.can_wal_backup() } - /// Returns full timeline info, required for the metrics. - /// If the timeline is not active, returns None instead. + /// Returns full timeline info, required for the metrics. If the timeline is + /// not active, returns None instead. pub fn info_for_metrics(&self) -> Option { - let shared_state = self.mutex.lock().unwrap(); - if !shared_state.active { + if self.is_cancelled() { return None; } - Some(FullTimelineInfo { - zttid: self.zttid, - replicas: shared_state - .replicas - .iter() - .filter_map(|r| r.as_ref()) - .copied() - .collect(), - wal_backup_active: shared_state.wal_backup_active, - timeline_is_active: shared_state.active, - num_computes: shared_state.num_computes, - last_removed_segno: shared_state.last_removed_segno, - epoch_start_lsn: shared_state.sk.epoch_start_lsn, - mem_state: shared_state.sk.inmem.clone(), - persisted_state: shared_state.sk.state.clone(), - flush_lsn: shared_state.sk.wal_store.flush_lsn(), - }) + let state = self.write_shared_state(); + if state.active { + Some(FullTimelineInfo { + ttid: self.ttid, + replicas: state + .replicas + .iter() + .filter_map(|r| r.as_ref()) + .copied() + .collect(), + wal_backup_active: state.wal_backup_active, + timeline_is_active: state.active, + num_computes: state.num_computes, + last_removed_segno: state.last_removed_segno, + epoch_start_lsn: state.sk.epoch_start_lsn, + mem_state: state.sk.inmem.clone(), + persisted_state: state.sk.state.clone(), + flush_lsn: state.sk.wal_store.flush_lsn(), + }) + } else { + None + } } + /// Returns commit_lsn watch channel. pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { self.commit_lsn_watch_rx.clone() } @@ -405,10 +550,14 @@ impl Timeline { &self, msg: &ProposerAcceptorMessage, ) -> Result> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let mut rmsg: Option; let commit_lsn: Lsn; { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); rmsg = shared_state.sk.process_msg(msg)?; // if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn @@ -426,28 +575,46 @@ impl Timeline { Ok(rmsg) } + /// Returns wal_seg_size. pub fn get_wal_seg_size(&self) -> usize { - self.mutex.lock().unwrap().get_wal_seg_size() + self.write_shared_state().get_wal_seg_size() } + /// Returns true only if the timeline is loaded and active. + pub fn is_active(&self) -> bool { + if self.is_cancelled() { + return false; + } + + self.write_shared_state().active + } + + /// Returns state of the timeline. 
pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { - let shared_state = self.mutex.lock().unwrap(); - (shared_state.sk.inmem.clone(), shared_state.sk.state.clone()) + let state = self.write_shared_state(); + (state.sk.inmem.clone(), state.sk.state.clone()) } + /// Returns latest backup_lsn. pub fn get_wal_backup_lsn(&self) -> Lsn { - self.mutex.lock().unwrap().sk.inmem.backup_lsn + self.write_shared_state().sk.inmem.backup_lsn } - pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) { - self.mutex.lock().unwrap().sk.inmem.backup_lsn = backup_lsn; + /// Sets backup_lsn to the given value. + pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + self.write_shared_state().sk.inmem.backup_lsn = backup_lsn; // we should check whether to shut down offloader, but this will be done // soon by peer communication anyway. + Ok(()) } - /// Prepare public safekeeper info for reporting. + /// Return public safekeeper info for broadcasting to broker and other peers. pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { - let shared_state = self.mutex.lock().unwrap(); + let shared_state = self.write_shared_state(); SkTimelineInfo { last_log_term: Some(shared_state.sk.get_epoch()), flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()), @@ -473,54 +640,53 @@ impl Timeline { let is_wal_backup_action_pending: bool; let commit_lsn: Lsn; { - let mut shared_state = self.mutex.lock().unwrap(); - // WAL seg size not initialized yet (no message from compute ever - // received), can't do much without it. - if shared_state.get_wal_seg_size() == 0 { - return Ok(()); - } + let mut shared_state = self.write_shared_state(); shared_state.sk.record_safekeeper_info(sk_info)?; - is_wal_backup_action_pending = shared_state.update_status(self.zttid); + is_wal_backup_action_pending = shared_state.update_status(self.ttid); commit_lsn = shared_state.sk.inmem.commit_lsn; } self.commit_lsn_watch_tx.send(commit_lsn)?; // Wake up wal backup launcher, if it is time to stop the offloading. if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.send(self.zttid).await?; + self.wal_backup_launcher_tx.send(self.ttid).await?; } Ok(()) } + /// Add send_wal replica to the in-memory vector of replicas. pub fn add_replica(&self, state: ReplicaState) -> usize { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.add_replica(state) + self.write_shared_state().add_replica(state) } + /// Update replication replica state. pub fn update_replica_state(&self, id: usize, state: ReplicaState) { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); shared_state.replicas[id] = Some(state); } + /// Remove send_wal replica from the in-memory vector of replicas. pub fn remove_replica(&self, id: usize) { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); assert!(shared_state.replicas[id].is_some()); shared_state.replicas[id] = None; } - pub fn get_end_of_wal(&self) -> Lsn { - let shared_state = self.mutex.lock().unwrap(); - shared_state.sk.wal_store.flush_lsn() + /// Returns flush_lsn. + pub fn get_flush_lsn(&self) -> Lsn { + self.write_shared_state().sk.wal_store.flush_lsn() } + /// Delete WAL segments from disk that are no longer needed. This is determined + /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. 
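All the small accessors above funnel through `write_shared_state`, which can return a `MutexGuard` directly because the patch switched `Timeline` to `parking_lot::Mutex`: unlike `std::sync::Mutex`, it has no lock poisoning, so there is no `Result` to unwrap at every call site. A minimal sketch, assuming the `parking_lot` crate as a dependency:

    use parking_lot::{Mutex, MutexGuard};

    struct SharedState {
        num_computes: u32,
    }

    struct Timeline {
        mutex: Mutex<SharedState>,
    }

    impl Timeline {
        // No `.unwrap()` and no poisoned-lock error to propagate.
        fn write_shared_state(&self) -> MutexGuard<SharedState> {
            self.mutex.lock()
        }
    }

    fn main() {
        let tli = Timeline {
            mutex: Mutex::new(SharedState { num_computes: 0 }),
        };
        tli.write_shared_state().num_computes += 1;
        assert_eq!(tli.write_shared_state().num_computes, 1);
    }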
pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let horizon_segno: XLogSegNo; let remover: Box Result<(), anyhow::Error>>; { - let shared_state = self.mutex.lock().unwrap(); - // WAL seg size not initialized yet, no WAL exists. - if shared_state.get_wal_seg_size() == 0 { - return Ok(()); - } + let shared_state = self.write_shared_state(); horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { @@ -528,243 +694,22 @@ impl Timeline { } // release the lock before removing } - let _enter = - info_span!("", tenant = %self.zttid.tenant_id, timeline = %self.zttid.timeline_id) - .entered(); + + // delete old WAL files remover(horizon_segno - 1)?; - self.mutex.lock().unwrap().last_removed_segno = horizon_segno; + + // update last_removed_segno + let mut shared_state = self.write_shared_state(); + shared_state.last_removed_segno = horizon_segno; Ok(()) } } -// Utilities needed by various Connection-like objects -pub trait TimelineTools { - fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()>; - - fn get(&self) -> &Arc; -} - -impl TimelineTools for Option> { - fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()> { - *self = Some(GlobalTimelines::get(conf, zttid, create)?); - Ok(()) - } - - fn get(&self) -> &Arc { - self.as_ref().unwrap() - } -} - -struct GlobalTimelinesState { - timelines: HashMap>, - wal_backup_launcher_tx: Option>, -} - -static TIMELINES_STATE: Lazy> = Lazy::new(|| { - Mutex::new(GlobalTimelinesState { - timelines: HashMap::new(), - wal_backup_launcher_tx: None, - }) -}); - -#[derive(Clone, Copy, Serialize)] -pub struct TimelineDeleteForceResult { - pub dir_existed: bool, - pub was_active: bool, -} - -/// A zero-sized struct used to manage access to the global timelines map. -pub struct GlobalTimelines; - -impl GlobalTimelines { - pub fn init(wal_backup_launcher_tx: Sender) { - let mut state = TIMELINES_STATE.lock().unwrap(); - assert!(state.wal_backup_launcher_tx.is_none()); - state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); - } - - fn create_internal( - mut state: MutexGuard, - conf: &SafeKeeperConf, - zttid: TenantTimelineId, - peer_ids: Vec, - ) -> Result> { - match state.timelines.get(&zttid) { - Some(_) => bail!("timeline {} already exists", zttid), - None => { - // TODO: check directory existence - let dir = conf.timeline_dir(&zttid); - fs::create_dir_all(dir)?; - - let shared_state = SharedState::create(conf, &zttid, peer_ids) - .context("failed to create shared state")?; - - let new_tli = Arc::new(Timeline::new( - zttid, - state.wal_backup_launcher_tx.as_ref().unwrap().clone(), - shared_state, - )); - state.timelines.insert(zttid, Arc::clone(&new_tli)); - Ok(new_tli) - } - } - } - - pub fn create( - conf: &SafeKeeperConf, - zttid: TenantTimelineId, - peer_ids: Vec, - ) -> Result> { - let state = TIMELINES_STATE.lock().unwrap(); - GlobalTimelines::create_internal(state, conf, zttid, peer_ids) - } - - /// Get a timeline with control file loaded from the global TIMELINES_STATE.timelines map. - /// If control file doesn't exist and create=false, bails out. 
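`remove_old_wal` above follows a locking discipline worth noting: compute the removal horizon in a short critical section, drop the lock for the slow filesystem work, then re-acquire it to record progress. A sketch of that shape with illustrative types and a print in place of actual file deletion:

    use std::sync::Mutex;

    struct SharedState {
        horizon_segno: u64,
        last_removed_segno: u64,
    }

    fn remove_old_wal(state: &Mutex<SharedState>) -> anyhow::Result<()> {
        // 1) Short critical section: read the horizon.
        let horizon = {
            let s = state.lock().unwrap();
            if s.horizon_segno <= s.last_removed_segno {
                return Ok(()); // nothing new to remove
            }
            s.horizon_segno
        }; // lock released here

        // 2) Slow work without holding the lock (stand-in for file deletion).
        println!("removing WAL segments below {horizon}");

        // 3) Short critical section: record what we removed.
        state.lock().unwrap().last_removed_segno = horizon;
        Ok(())
    }

    fn main() -> anyhow::Result<()> {
        let state = Mutex::new(SharedState { horizon_segno: 10, last_removed_segno: 0 });
        remove_old_wal(&state)?;
        assert_eq!(state.lock().unwrap().last_removed_segno, 10);
        Ok(())
    }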
- pub fn get( - conf: &SafeKeeperConf, - zttid: TenantTimelineId, - create: bool, - ) -> Result> { - let _enter = info_span!("", timeline = %zttid.timeline_id).entered(); - - let mut state = TIMELINES_STATE.lock().unwrap(); - - match state.timelines.get(&zttid) { - Some(result) => Ok(Arc::clone(result)), - None => { - let shared_state = SharedState::restore(conf, &zttid); - - let shared_state = match shared_state { - Ok(shared_state) => shared_state, - Err(error) => { - // TODO: always create timeline explicitly - if error - .root_cause() - .to_string() - .contains("No such file or directory") - && create - { - return GlobalTimelines::create_internal(state, conf, zttid, vec![]); - } else { - return Err(error); - } - } - }; - - let new_tli = Arc::new(Timeline::new( - zttid, - state.wal_backup_launcher_tx.as_ref().unwrap().clone(), - shared_state, - )); - state.timelines.insert(zttid, Arc::clone(&new_tli)); - Ok(new_tli) - } - } - } - - /// Get loaded timeline, if it exists. - pub fn get_loaded(zttid: TenantTimelineId) -> Option> { - let state = TIMELINES_STATE.lock().unwrap(); - state.timelines.get(&zttid).map(Arc::clone) - } - - pub fn get_active_timelines() -> HashSet { - let state = TIMELINES_STATE.lock().unwrap(); - state - .timelines - .iter() - .filter(|&(_, tli)| tli.is_active()) - .map(|(zttid, _)| *zttid) - .collect() - } - - /// Return FullTimelineInfo for all active timelines. - pub fn active_timelines_metrics() -> Vec { - let state = TIMELINES_STATE.lock().unwrap(); - state - .timelines - .iter() - .filter_map(|(_, tli)| tli.info_for_metrics()) - .collect() - } - - fn delete_force_internal( - conf: &SafeKeeperConf, - zttid: &TenantTimelineId, - was_active: bool, - ) -> Result { - match std::fs::remove_dir_all(conf.timeline_dir(zttid)) { - Ok(_) => Ok(TimelineDeleteForceResult { - dir_existed: true, - was_active, - }), - Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(TimelineDeleteForceResult { - dir_existed: false, - was_active, - }), - Err(e) => Err(e.into()), - } - } - - /// Deactivates and deletes the timeline, see `Timeline::deactivate_for_delete()`, the deletes - /// the corresponding data directory. - /// We assume all timeline threads do not care about `GlobalTimelines` not containing the timeline - /// anymore, and they will eventually terminate without panics. - /// - /// There are multiple ways the timeline may be accidentally "re-created" (so we end up with two - /// `Timeline` objects in memory): - /// a) a compute node connects after this method is called, or - /// b) an HTTP GET request about the timeline is made and it's able to restore the current state, or - /// c) an HTTP POST request for timeline creation is made after the timeline is already deleted. - /// TODO: ensure all of the above never happens. - pub async fn delete_force( - conf: &SafeKeeperConf, - zttid: &TenantTimelineId, - ) -> Result { - info!("deleting timeline {}", zttid); - let timeline = TIMELINES_STATE.lock().unwrap().timelines.remove(zttid); - let mut was_active = false; - if let Some(tli) = timeline { - was_active = tli.deactivate_for_delete().await?; - } - GlobalTimelines::delete_force_internal(conf, zttid, was_active) - } - - /// Deactivates and deletes all timelines for the tenant, see `delete()`. - /// Returns map of all timelines which the tenant had, `true` if a timeline was active. - /// There may be a race if new timelines are created simultaneously. 
- pub async fn delete_force_all_for_tenant( - conf: &SafeKeeperConf, - tenant_id: &TenantId, - ) -> Result> { - info!("deleting all timelines for tenant {}", tenant_id); - let mut to_delete = HashMap::new(); - { - // Keep mutex in this scope. - let timelines = &mut TIMELINES_STATE.lock().unwrap().timelines; - for (&zttid, tli) in timelines.iter() { - if zttid.tenant_id == *tenant_id { - to_delete.insert(zttid, tli.clone()); - } - } - // TODO: test that the correct subset of timelines is removed. It's complicated because they are implicitly created currently. - timelines.retain(|zttid, _| !to_delete.contains_key(zttid)); - } - let mut deleted = HashMap::new(); - for (zttid, timeline) in to_delete { - let was_active = timeline.deactivate_for_delete().await?; - deleted.insert( - zttid, - GlobalTimelines::delete_force_internal(conf, &zttid, was_active)?, - ); - } - // There may be inactive timelines, so delete the whole tenant dir as well. - match std::fs::remove_dir_all(conf.tenant_dir(tenant_id)) { - Ok(_) => (), - Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), - e => e?, - }; - Ok(deleted) +/// Deletes directory and it's contents. Returns false if directory does not exist. +fn delete_dir(path: &PathBuf) -> Result { + match std::fs::remove_dir_all(path) { + Ok(_) => Ok(true), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(e) => Err(e.into()), } } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs new file mode 100644 index 0000000000..cf99a243d7 --- /dev/null +++ b/safekeeper/src/timelines_global_map.rs @@ -0,0 +1,348 @@ +//! This module contains global (tenant_id, timeline_id) -> Arc mapping. +//! All timelines should always be present in this map, this is done by loading them +//! all from the disk on startup and keeping them in memory. + +use crate::safekeeper::ServerInfo; +use crate::timeline::{Timeline, TimelineError}; +use crate::SafeKeeperConf; +use anyhow::{anyhow, bail, Context, Result}; +use once_cell::sync::Lazy; +use serde::Serialize; +use std::collections::HashMap; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::{Arc, Mutex, MutexGuard}; +use tokio::sync::mpsc::Sender; +use tracing::*; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; + +struct GlobalTimelinesState { + timelines: HashMap>, + wal_backup_launcher_tx: Option>, + conf: SafeKeeperConf, +} + +impl GlobalTimelinesState { + /// Get dependencies for a timeline constructor. + fn get_dependencies(&self) -> (SafeKeeperConf, Sender) { + ( + self.conf.clone(), + self.wal_backup_launcher_tx.as_ref().unwrap().clone(), + ) + } + + /// Insert timeline into the map. Returns error if timeline with the same id already exists. + fn try_insert(&mut self, timeline: Arc) -> Result<()> { + let ttid = timeline.ttid; + if self.timelines.contains_key(&ttid) { + bail!(TimelineError::AlreadyExists(ttid)); + } + self.timelines.insert(ttid, timeline); + Ok(()) + } + + /// Get timeline from the map. Returns error if timeline doesn't exist. + fn get(&self, ttid: &TenantTimelineId) -> Result> { + self.timelines + .get(ttid) + .cloned() + .ok_or_else(|| anyhow!(TimelineError::NotFound(*ttid))) + } +} + +static TIMELINES_STATE: Lazy> = Lazy::new(|| { + Mutex::new(GlobalTimelinesState { + timelines: HashMap::new(), + wal_backup_launcher_tx: None, + conf: SafeKeeperConf::default(), + }) +}); + +/// A zero-sized struct used to manage access to the global timelines map. 
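The loader implemented just below scans the work directory at startup and treats every entry whose name parses as an id as a tenant (and, one level down, a timeline), logging and skipping anything else rather than failing startup. A simplified sketch with a toy `Id` type standing in for `TenantId`/`TimelineId`:

    use std::path::Path;
    use std::str::FromStr;

    struct Id(u128);

    impl FromStr for Id {
        type Err = std::num::ParseIntError;
        fn from_str(s: &str) -> Result<Self, Self::Err> {
            u128::from_str_radix(s, 16).map(Id)
        }
    }

    fn scan_ids(dir: &Path) -> anyhow::Result<Vec<Id>> {
        let mut ids = Vec::new();
        for entry in std::fs::read_dir(dir)? {
            match entry {
                Ok(entry) => {
                    // Non-id entries (tmp files, etc.) are ignored on purpose.
                    if let Ok(id) = Id::from_str(entry.file_name().to_str().unwrap_or("")) {
                        ids.push(id);
                    }
                }
                Err(e) => eprintln!("failed to list entry in {}: {e}", dir.display()),
            }
        }
        Ok(ids)
    }

    fn main() -> anyhow::Result<()> {
        let dir = std::env::temp_dir();
        println!("found {} id-like entries in {}", scan_ids(&dir)?.len(), dir.display());
        Ok(())
    }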
+pub struct GlobalTimelines;
+
+impl GlobalTimelines {
+    /// Inject dependencies needed for the timeline constructors and load all timelines to memory.
+    pub fn init(
+        conf: SafeKeeperConf,
+        wal_backup_launcher_tx: Sender<TenantTimelineId>,
+    ) -> Result<()> {
+        let mut state = TIMELINES_STATE.lock().unwrap();
+        assert!(state.wal_backup_launcher_tx.is_none());
+        state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
+        state.conf = conf;
+
+        // Iterate through all directories and load tenants for all directories
+        // named as a valid tenant_id.
+        let mut tenant_count = 0;
+        let tenants_dir = state.conf.workdir.clone();
+        for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
+            .with_context(|| format!("failed to list tenants dir {}", tenants_dir.display()))?
+        {
+            match &tenants_dir_entry {
+                Ok(tenants_dir_entry) => {
+                    if let Ok(tenant_id) =
+                        TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or(""))
+                    {
+                        tenant_count += 1;
+                        GlobalTimelines::load_tenant_timelines(&mut state, tenant_id)?;
+                    }
+                }
+                Err(e) => error!(
+                    "failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
+                    tenants_dir_entry,
+                    tenants_dir.display(),
+                    e
+                ),
+            }
+        }
+
+        info!(
+            "found {} tenants directories, successfully loaded {} timelines",
+            tenant_count,
+            state.timelines.len()
+        );
+        Ok(())
+    }
+
+    /// Loads all timelines for the given tenant to memory. Returns fs::read_dir errors if any.
+    fn load_tenant_timelines(
+        state: &mut MutexGuard<GlobalTimelinesState>,
+        tenant_id: TenantId,
+    ) -> Result<()> {
+        let timelines_dir = state.conf.tenant_dir(&tenant_id);
+        for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
+            .with_context(|| format!("failed to list timelines dir {}", timelines_dir.display()))?
+        {
+            match &timelines_dir_entry {
+                Ok(timeline_dir_entry) => {
+                    if let Ok(timeline_id) =
+                        TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
+                    {
+                        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
+                        match Timeline::load_timeline(
+                            state.conf.clone(),
+                            ttid,
+                            state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
+                        ) {
+                            Ok(timeline) => {
+                                state.timelines.insert(ttid, Arc::new(timeline));
+                            }
+                            // If we can't load a timeline, it's most likely because of a corrupted
+                            // directory. We will log an error and won't allow to delete/recreate
+                            // this timeline. The only way to fix this timeline is to repair manually
+                            // and restart the safekeeper.
+                            Err(e) => error!(
+                                "failed to load timeline {} for tenant {}, reason: {:?}",
+                                timeline_id, tenant_id, e
+                            ),
+                        }
+                    }
+                }
+                Err(e) => error!(
+                    "failed to list timelines dir entry {:?} in directory {}, reason: {:?}",
+                    timelines_dir_entry,
+                    timelines_dir.display(),
+                    e
+                ),
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Create a new timeline with the given id. If the timeline already exists, returns
+    /// an existing timeline.
+    pub fn create(ttid: TenantTimelineId, server_info: ServerInfo) -> Result<Arc<Timeline>> {
+        let (conf, wal_backup_launcher_tx) = {
+            let state = TIMELINES_STATE.lock().unwrap();
+            if let Ok(timeline) = state.get(&ttid) {
+                // Timeline already exists, return it.
+                return Ok(timeline);
+            }
+            state.get_dependencies()
+        };
+
+        info!("creating new timeline {}", ttid);
+
+        let timeline = Arc::new(Timeline::create_empty(
+            conf,
+            ttid,
+            wal_backup_launcher_tx,
+            server_info,
+        )?);
+
+        // Take a lock and finish the initialization holding this mutex. No other threads
+        // can interfere with creation after we insert the timeline into the map.
+        let mut shared_state = timeline.write_shared_state();
+
+        // We can get a race condition here in case of concurrent create calls, but only
+        // in theory. create() will return valid timeline on the next try.
+        TIMELINES_STATE
+            .lock()
+            .unwrap()
+            .try_insert(timeline.clone())?;
+
+        // Write the new timeline to the disk and start background workers.
+        // Bootstrap is transactional, so if it fails, the timeline will be deleted,
+        // and the state on disk should remain unchanged.
+        match timeline.bootstrap(&mut shared_state) {
+            Ok(_) => {
+                // We are done with bootstrap, release the lock, return the timeline.
+                drop(shared_state);
+                Ok(timeline)
+            }
+            Err(e) => {
+                // Note: the most likely reason for bootstrap failure is that the timeline
+                // directory already exists on disk. This happens when timeline is corrupted
+                // and wasn't loaded from disk on startup because of that. We want to preserve
+                // the timeline directory in this case, for further inspection.
+
+                // TODO: this is an unusual error, perhaps we should send it to sentry
+                // TODO: compute will try to create timeline every second, we should add backoff
+                error!("failed to bootstrap timeline {}: {}", ttid, e);
+
+                // Timeline failed to bootstrap, it cannot be used. Remove it from the map.
+                TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid);
+                Err(e)
+            }
+        }
+    }
+
+    /// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
+    /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid,
+    /// i.e. loaded in memory and not cancelled.
+    pub fn get(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
+        let res = TIMELINES_STATE.lock().unwrap().get(&ttid);
+
+        match res {
+            Ok(tli) => {
+                if tli.is_cancelled() {
+                    anyhow::bail!(TimelineError::Cancelled(ttid));
+                }
+                Ok(tli)
+            }
+            Err(e) => Err(e),
+        }
+    }
+
+    /// Returns all timelines. This is used for background timeline processes.
+    pub fn get_all() -> Vec<Arc<Timeline>> {
+        let global_lock = TIMELINES_STATE.lock().unwrap();
+        global_lock
+            .timelines
+            .values()
+            .cloned()
+            .filter(|t| !t.is_cancelled())
+            .collect()
+    }
+
+    /// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
+    /// and that's why it can return cancelled timelines, to retry deleting them.
+    fn get_all_for_tenant(tenant_id: TenantId) -> Vec<Arc<Timeline>> {
+        let global_lock = TIMELINES_STATE.lock().unwrap();
+        global_lock
+            .timelines
+            .values()
+            .filter(|t| t.ttid.tenant_id == tenant_id)
+            .cloned()
+            .collect()
+    }
+
+    /// Cancels timeline, then deletes the corresponding data directory.
+    pub fn delete_force(ttid: &TenantTimelineId) -> Result<TimelineDeleteForceResult> {
+        let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
+        match tli_res {
+            Ok(timeline) => {
+                // Take a lock and finish the deletion holding this mutex.
+                let mut shared_state = timeline.write_shared_state();
+
+                info!("deleting timeline {}", ttid);
+                let (dir_existed, was_active) = timeline.delete_from_disk(&mut shared_state)?;
+
+                // Remove timeline from the map.
+                TIMELINES_STATE.lock().unwrap().timelines.remove(ttid);
+
+                Ok(TimelineDeleteForceResult {
+                    dir_existed,
+                    was_active,
+                })
+            }
+            Err(_) => {
+                // Timeline is not in memory, but it may still exist on disk in broken state.
+                let dir_path = TIMELINES_STATE.lock().unwrap().conf.timeline_dir(ttid);
+                let dir_existed = delete_dir(dir_path)?;
+
+                Ok(TimelineDeleteForceResult {
+                    dir_existed,
+                    was_active: false,
+                })
+            }
+        }
+    }
+
+    /// Deactivates and deletes all timelines for the tenant.
Returns map of all timelines which
+    /// the tenant had, `true` if a timeline was active. There may be a race if new timelines are
+    /// created simultaneously. In that case the function will return error and the caller should
+    /// retry tenant deletion again later.
+    pub fn delete_force_all_for_tenant(
+        tenant_id: &TenantId,
+    ) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
+        info!("deleting all timelines for tenant {}", tenant_id);
+        let to_delete = Self::get_all_for_tenant(*tenant_id);
+
+        let mut err = None;
+
+        let mut deleted = HashMap::new();
+        for tli in &to_delete {
+            match Self::delete_force(&tli.ttid) {
+                Ok(result) => {
+                    deleted.insert(tli.ttid, result);
+                }
+                Err(e) => {
+                    error!("failed to delete timeline {}: {}", tli.ttid, e);
+                    // Save error to return later.
+                    err = Some(e);
+                }
+            }
+        }
+
+        // If there was an error, return it.
+        if let Some(e) = err {
+            return Err(e);
+        }
+
+        // There may be broken timelines on disk, so delete the whole tenant dir as well.
+        // Note that we could concurrently create new timelines while we were deleting them,
+        // so the directory may be not empty. In this case timelines will have bad state
+        // and timeline background jobs can panic.
+        delete_dir(TIMELINES_STATE.lock().unwrap().conf.tenant_dir(tenant_id))?;
+
+        let tlis_after_delete = Self::get_all_for_tenant(*tenant_id);
+        if !tlis_after_delete.is_empty() {
+            // Some timelines were created while we were deleting them, returning error
+            // to the caller, so it can retry later.
+            bail!(
+                "failed to delete all timelines for tenant {}: some timelines were created while we were deleting them",
+                tenant_id
+            );
+        }
+
+        Ok(deleted)
+    }
+}
+
+#[derive(Clone, Copy, Serialize)]
+pub struct TimelineDeleteForceResult {
+    pub dir_existed: bool,
+    pub was_active: bool,
+}
+
+/// Deletes directory and its contents. Returns false if directory does not exist.
+fn delete_dir(path: PathBuf) -> Result<bool> {
+    match std::fs::remove_dir_all(path) {
+        Ok(_) => Ok(true),
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false),
+        Err(e) => Err(e.into()),
+    }
+}
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 85e967e218..0d5321fb3a 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -26,8 +26,8 @@ use tracing::*;
 use utils::{id::TenantTimelineId, lsn::Lsn};
 
 use crate::broker::{Election, ElectionLeader};
-use crate::timeline::{GlobalTimelines, Timeline};
-use crate::{broker, SafeKeeperConf};
+use crate::timeline::Timeline;
+use crate::{broker, GlobalTimelines, SafeKeeperConf};
 
 use once_cell::sync::OnceCell;
 
@@ -53,8 +53,10 @@ pub fn wal_backup_launcher_thread_main(
 
 /// Check whether wal backup is required for timeline. If yes, mark that launcher is
 /// aware of current status and return the timeline.
-fn is_wal_backup_required(zttid: TenantTimelineId) -> Option<Arc<Timeline>> {
-    GlobalTimelines::get_loaded(zttid).filter(|t| t.wal_backup_attend())
+fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
+    GlobalTimelines::get(ttid)
+        .ok()
+        .filter(|tli| tli.wal_backup_attend())
 }
 
 struct WalBackupTaskHandle {
@@ -70,20 +72,20 @@ struct WalBackupTimelineEntry {
 
 /// Start per timeline task, if it makes sense for this safekeeper to offload.
fn consider_start_task( conf: &SafeKeeperConf, - zttid: TenantTimelineId, + ttid: TenantTimelineId, task: &mut WalBackupTimelineEntry, ) { if !task.timeline.can_wal_backup() { return; } - info!("starting WAL backup task for {}", zttid); + info!("starting WAL backup task for {}", ttid); // TODO: decide who should offload right here by simply checking current // state instead of running elections in offloading task. let election_name = SubscriptionKey { cluster_prefix: conf.broker_etcd_prefix.clone(), kind: SubscriptionKind::Operation( - zttid, + ttid, NodeKind::Safekeeper, OperationKind::Safekeeper(SkOperationKind::WalBackup), ), @@ -97,11 +99,11 @@ fn consider_start_task( ); let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&zttid); + let timeline_dir = conf.timeline_dir(&ttid); let handle = tokio::spawn( - backup_task_main(zttid, timeline_dir, shutdown_rx, election) - .instrument(info_span!("WAL backup task", zttid = %zttid)), + backup_task_main(ttid, timeline_dir, shutdown_rx, election) + .instrument(info_span!("WAL backup task", ttid = %ttid)), ); task.handle = Some(WalBackupTaskHandle { @@ -140,33 +142,33 @@ async fn wal_backup_launcher_main_loop( let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); loop { tokio::select! { - zttid = wal_backup_launcher_rx.recv() => { + ttid = wal_backup_launcher_rx.recv() => { // channel is never expected to get closed - let zttid = zttid.unwrap(); + let ttid = ttid.unwrap(); if conf.remote_storage.is_none() || !conf.wal_backup_enabled { continue; /* just drain the channel and do nothing */ } - let timeline = is_wal_backup_required(zttid); + let timeline = is_wal_backup_required(ttid); // do we need to do anything at all? - if timeline.is_some() != tasks.contains_key(&zttid) { + if timeline.is_some() != tasks.contains_key(&ttid) { if let Some(timeline) = timeline { // need to start the task - let entry = tasks.entry(zttid).or_insert(WalBackupTimelineEntry { + let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry { timeline, handle: None, }); - consider_start_task(&conf, zttid, entry); + consider_start_task(&conf, ttid, entry); } else { // need to stop the task - info!("stopping WAL backup task for {}", zttid); + info!("stopping WAL backup task for {}", ttid); - let entry = tasks.remove(&zttid).unwrap(); + let entry = tasks.remove(&ttid).unwrap(); if let Some(wb_handle) = entry.handle { // Tell the task to shutdown. Error means task exited earlier, that's ok. let _ = wb_handle.shutdown_tx.send(()).await; // Await the task itself. TODO: restart panicked tasks earlier. if let Err(e) = wb_handle.handle.await { - warn!("WAL backup task for {} panicked: {}", zttid, e); + warn!("WAL backup task for {} panicked: {}", ttid, e); } } } @@ -174,8 +176,8 @@ async fn wal_backup_launcher_main_loop( } // Start known tasks, if needed and possible. _ = ticker.tick() => { - for (zttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) { - consider_start_task(&conf, *zttid, entry); + for (ttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) { + consider_start_task(&conf, *ttid, entry); } } } @@ -191,26 +193,26 @@ struct WalBackupTask { election: Election, } -/// Offload single timeline. +/// Offload single timeline. Called only after we checked that backup +/// is required (wal_backup_attend) and possible (can_wal_backup). 
 async fn backup_task_main(
-    zttid: TenantTimelineId,
+    ttid: TenantTimelineId,
     timeline_dir: PathBuf,
     mut shutdown_rx: Receiver<()>,
     election: Election,
 ) {
     info!("started");
-    let timeline: Arc<Timeline> = if let Some(tli) = GlobalTimelines::get_loaded(zttid) {
-        tli
-    } else {
-        /* Timeline could get deleted while task was starting, just exit then. */
-        info!("no timeline, exiting");
+    let res = GlobalTimelines::get(ttid);
+    if let Err(e) = res {
+        error!("backup error for timeline {}: {}", ttid, e);
         return;
-    };
+    }
+    let tli = res.unwrap();
 
     let mut wb = WalBackupTask {
-        wal_seg_size: timeline.get_wal_seg_size(),
-        commit_lsn_watch_rx: timeline.get_commit_lsn_watch_rx(),
-        timeline,
+        wal_seg_size: tli.get_wal_seg_size(),
+        commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
+        timeline: tli,
         timeline_dir,
         leader: None,
         election,
@@ -322,7 +324,11 @@ impl WalBackupTask {
             {
                 Ok(backup_lsn_result) => {
                     backup_lsn = backup_lsn_result;
-                    self.timeline.set_wal_backup_lsn(backup_lsn_result);
+                    let res = self.timeline.set_wal_backup_lsn(backup_lsn_result);
+                    if let Err(e) = res {
+                        error!("backup error: {}", e);
+                        return;
+                    }
                     retry_attempt = 0;
                 }
                 Err(e) => {
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 58b69f06e7..ea613dd0f1 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -7,7 +7,7 @@
 //!
 //! Note that last file has `.partial` suffix, that's different from postgres.
 
-use anyhow::{anyhow, bail, Context, Result};
+use anyhow::{bail, Context, Result};
 use std::io::{self, Seek, SeekFrom};
 use std::pin::Pin;
 use tokio::io::AsyncRead;
@@ -17,7 +17,7 @@ use postgres_ffi::v14::xlog_utils::{
     find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName,
 };
 use postgres_ffi::{XLogSegNo, PG_TLI};
-use std::cmp::min;
+use std::cmp::{max, min};
 use std::fs::{self, remove_file, File, OpenOptions};
 use std::io::Write;
 
@@ -86,9 +86,9 @@ struct WalStorageMetrics {
 }
 
 impl WalStorageMetrics {
-    fn new(zttid: &TenantTimelineId) -> Self {
-        let tenant_id = zttid.tenant_id.to_string();
-        let timeline_id = zttid.timeline_id.to_string();
+    fn new(ttid: &TenantTimelineId) -> Self {
+        let tenant_id = ttid.tenant_id.to_string();
+        let timeline_id = ttid.timeline_id.to_string();
         Self {
             write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&tenant_id, &timeline_id]),
             write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]),
@@ -101,9 +101,6 @@ pub trait Storage {
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn;
 
-    /// Init storage with wal_seg_size and read WAL from disk to get latest LSN.
-    fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()>;
-
    /// Write piece of WAL from buf to disk, but not necessarily sync it.
     fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>;
 
@@ -119,7 +116,7 @@ pub trait Storage {
 }
 
 /// PhysicalStorage is a storage that stores WAL on disk. Writes are separated from flushes
-/// for better performance. Storage must be initialized before use.
+/// for better performance. Storage is initialized in the constructor.
 ///
 /// WAL is stored in segments, each segment is a file. Last segment has ".partial" suffix in
 /// its filename and may be not fully flushed.
 ///
 /// Relationship of LSNs:
 /// `write_lsn` >= `write_record_lsn` >= `flush_record_lsn`
 ///
-/// When storage is just created, all LSNs are zeroes and there are no segments on disk.
+/// When storage is created for the first time, all LSNs are zeroes and there are no segments on disk.
 pub struct PhysicalStorage {
     metrics: WalStorageMetrics,
-    zttid: TenantTimelineId,
     timeline_dir: PathBuf,
     conf: SafeKeeperConf,
 
-    // fields below are filled upon initialization
-    /// None if uninitialized, Some(usize) if storage is initialized.
-    wal_seg_size: Option<usize>,
+    /// Size of WAL segment in bytes.
+    wal_seg_size: usize,
 
     /// Written to disk, but possibly still in the cache and not fully persisted.
     /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
@@ -161,25 +156,47 @@ pub struct PhysicalStorage {
 }
 
 impl PhysicalStorage {
-    pub fn new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> PhysicalStorage {
-        let timeline_dir = conf.timeline_dir(zttid);
-        PhysicalStorage {
-            metrics: WalStorageMetrics::new(zttid),
-            zttid: *zttid,
+    /// Create new storage. If commit_lsn is not zero, flush_lsn is restored from
+    /// the disk. Otherwise, all LSNs are set to zero.
+    pub fn new(
+        ttid: &TenantTimelineId,
+        conf: &SafeKeeperConf,
+        state: &SafeKeeperState,
+    ) -> Result<PhysicalStorage> {
+        let timeline_dir = conf.timeline_dir(ttid);
+        let wal_seg_size = state.server.wal_seg_size as usize;
+
+        // Find out where stored WAL ends, starting at commit_lsn which is a
+        // known recent record boundary (unless we don't have WAL at all).
+        let write_lsn = if state.commit_lsn == Lsn(0) {
+            Lsn(0)
+        } else {
+            find_end_of_wal(&timeline_dir, wal_seg_size, state.commit_lsn)?
+        };
+
+        // TODO: do we really know that write_lsn is fully flushed to disk?
+        // If not, maybe it's better to call fsync() here to be sure?
+        let flush_lsn = write_lsn;
+
+        info!(
+            "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}",
+            ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn,
+        );
+        if flush_lsn < state.commit_lsn || flush_lsn < state.peer_horizon_lsn {
+            warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", ttid.timeline_id);
+        }
+
+        Ok(PhysicalStorage {
+            metrics: WalStorageMetrics::new(ttid),
             timeline_dir,
             conf: conf.clone(),
-            wal_seg_size: None,
-            write_lsn: Lsn(0),
-            write_record_lsn: Lsn(0),
-            flush_record_lsn: Lsn(0),
-            decoder: WalStreamDecoder::new(Lsn(0)),
+            wal_seg_size,
+            write_lsn,
+            write_record_lsn: write_lsn,
+            flush_record_lsn: flush_lsn,
+            decoder: WalStreamDecoder::new(write_lsn),
             file: None,
-        }
-    }
-
-    /// Wrapper for flush_lsn updates that also updates metrics.
-    fn update_flush_lsn(&mut self) {
-        self.flush_record_lsn = self.write_record_lsn;
+        })
     }
 
     /// Call fdatasync if config requires so.
@@ -204,9 +221,9 @@ impl PhysicalStorage {
 
     /// Open or create WAL segment file. Caller must call seek to the wanted position.
     /// Returns `file` and `is_partial`.
- fn open_or_create(&self, segno: XLogSegNo, wal_seg_size: usize) -> Result<(File, bool)> { + fn open_or_create(&self, segno: XLogSegNo) -> Result<(File, bool)> { let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; // Try to open already completed segment if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { @@ -222,24 +239,18 @@ impl PhysicalStorage { .open(&wal_file_partial_path) .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?; - write_zeroes(&mut file, wal_seg_size)?; + write_zeroes(&mut file, self.wal_seg_size)?; self.fsync_file(&mut file)?; Ok((file, true)) } } /// Write WAL bytes, which are known to be located in a single WAL segment. - fn write_in_segment( - &mut self, - segno: u64, - xlogoff: usize, - buf: &[u8], - wal_seg_size: usize, - ) -> Result<()> { + fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> { let mut file = if let Some(file) = self.file.take() { file } else { - let (mut file, is_partial) = self.open_or_create(segno, wal_seg_size)?; + let (mut file, is_partial) = self.open_or_create(segno)?; assert!(is_partial, "unexpected write into non-partial segment file"); file.seek(SeekFrom::Start(xlogoff as u64))?; file @@ -247,13 +258,13 @@ impl PhysicalStorage { file.write_all(buf)?; - if xlogoff + buf.len() == wal_seg_size { + if xlogoff + buf.len() == self.wal_seg_size { // If we reached the end of a WAL segment, flush and close it. self.fdatasync_file(&mut file)?; // Rename partial file to completed file let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; fs::rename(&wal_file_partial_path, &wal_file_path)?; } else { // otherwise, file can be reused later @@ -269,10 +280,6 @@ impl PhysicalStorage { /// /// Updates `write_lsn`. fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> { - let wal_seg_size = self - .wal_seg_size - .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - if self.write_lsn != pos { // need to flush the file before discarding it if let Some(mut file) = self.file.take() { @@ -284,17 +291,17 @@ impl PhysicalStorage { while !buf.is_empty() { // Extract WAL location for this block - let xlogoff = self.write_lsn.segment_offset(wal_seg_size) as usize; - let segno = self.write_lsn.segment_number(wal_seg_size); + let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size) as usize; + let segno = self.write_lsn.segment_number(self.wal_seg_size); // If crossing a WAL boundary, only write up until we reach wal segment size. - let bytes_write = if xlogoff + buf.len() > wal_seg_size { - wal_seg_size - xlogoff + let bytes_write = if xlogoff + buf.len() > self.wal_seg_size { + self.wal_seg_size - xlogoff } else { buf.len() }; - self.write_in_segment(segno, xlogoff, &buf[..bytes_write], wal_seg_size)?; + self.write_in_segment(segno, xlogoff, &buf[..bytes_write])?; self.write_lsn += bytes_write as u64; buf = &buf[bytes_write..]; } @@ -309,53 +316,6 @@ impl Storage for PhysicalStorage { self.flush_record_lsn } - /// Storage needs to know wal_seg_size to know which segment to read/write, but - /// wal_seg_size is not always known at the moment of storage creation. This method - /// allows to postpone its initialization. 
- fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()> { - if state.server.wal_seg_size == 0 { - // wal_seg_size is still unknown. This is dead path normally, should - // be used only in tests. - return Ok(()); - } - - if let Some(wal_seg_size) = self.wal_seg_size { - // physical storage is already initialized - assert_eq!(wal_seg_size, state.server.wal_seg_size as usize); - return Ok(()); - } - - // initialize physical storage - let wal_seg_size = state.server.wal_seg_size as usize; - self.wal_seg_size = Some(wal_seg_size); - - // Find out where stored WAL ends, starting at commit_lsn which is a - // known recent record boundary (unless we don't have WAL at all). - self.write_lsn = if state.commit_lsn == Lsn(0) { - Lsn(0) - } else { - find_end_of_wal(&self.timeline_dir, wal_seg_size, state.commit_lsn)? - }; - - self.write_record_lsn = self.write_lsn; - - // TODO: do we really know that write_lsn is fully flushed to disk? - // If not, maybe it's better to call fsync() here to be sure? - self.update_flush_lsn(); - - info!( - "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", - self.zttid.timeline_id, self.flush_record_lsn, state.commit_lsn, state.peer_horizon_lsn, - ); - if self.flush_record_lsn < state.commit_lsn - || self.flush_record_lsn < state.peer_horizon_lsn - { - warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", self.zttid.timeline_id); - } - - Ok(()) - } - /// Write WAL to disk. fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { // Disallow any non-sequential writes, which can result in gaps or overwrites. @@ -419,80 +379,83 @@ impl Storage for PhysicalStorage { // We have unflushed data (write_lsn != flush_lsn), but no file. // This should only happen if last file was fully written and flushed, // but haven't updated flush_lsn yet. - assert!(self.write_lsn.segment_offset(self.wal_seg_size.unwrap()) == 0); + if self.write_lsn.segment_offset(self.wal_seg_size) != 0 { + bail!( + "unexpected unflushed data with no open file, write_lsn={}, flush_lsn={}", + self.write_lsn, + self.flush_record_lsn + ); + } } // everything is flushed now, let's update flush_lsn - self.update_flush_lsn(); + self.flush_record_lsn = self.write_record_lsn; Ok(()) } /// Truncate written WAL by removing all WAL segments after the given LSN. /// end_pos must point to the end of the WAL record. fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { - let wal_seg_size = self - .wal_seg_size - .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - // Streaming must not create a hole, so truncate cannot be called on non-written lsn - assert!(self.write_lsn == Lsn(0) || self.write_lsn >= end_pos); + if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { + bail!( + "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}", + self.write_lsn, + end_pos + ); + } // Close previously opened file, if any if let Some(mut unflushed_file) = self.file.take() { self.fdatasync_file(&mut unflushed_file)?; } - let xlogoff = end_pos.segment_offset(wal_seg_size) as usize; - let segno = end_pos.segment_number(wal_seg_size); - let (mut file, is_partial) = self.open_or_create(segno, wal_seg_size)?; + let xlogoff = end_pos.segment_offset(self.wal_seg_size) as usize; + let segno = end_pos.segment_number(self.wal_seg_size); + + // Remove all segments after the given LSN. 
+        remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno)?;
+
+        let (mut file, is_partial) = self.open_or_create(segno)?;
 
         // Fill end with zeroes
         file.seek(SeekFrom::Start(xlogoff as u64))?;
-        write_zeroes(&mut file, wal_seg_size - xlogoff)?;
+        write_zeroes(&mut file, self.wal_seg_size - xlogoff)?;
         self.fdatasync_file(&mut file)?;
 
         if !is_partial {
             // Make segment partial once again
             let (wal_file_path, wal_file_partial_path) =
-                wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?;
+                wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
             fs::rename(&wal_file_path, &wal_file_partial_path)?;
         }
 
-        // Remove all subsequent segments
-        let mut segno = segno;
-        loop {
-            segno += 1;
-            let (wal_file_path, wal_file_partial_path) =
-                wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?;
-            // TODO: better use fs::try_exists which is currently available only in nightly build
-            if wal_file_path.exists() {
-                fs::remove_file(&wal_file_path)?;
-            } else if wal_file_partial_path.exists() {
-                fs::remove_file(&wal_file_partial_path)?;
-            } else {
-                break;
-            }
-        }
-
         // Update LSNs
         self.write_lsn = end_pos;
         self.write_record_lsn = end_pos;
-        self.update_flush_lsn();
+        self.flush_record_lsn = end_pos;
 
         Ok(())
     }
 
     fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
         let timeline_dir = self.timeline_dir.clone();
-        let wal_seg_size = self.wal_seg_size.unwrap();
+        let wal_seg_size = self.wal_seg_size;
         Box::new(move |segno_up_to: XLogSegNo| {
-            remove_up_to(&timeline_dir, wal_seg_size, segno_up_to)
+            remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to)
         })
     }
 }
 
-/// Remove all WAL segments in timeline_dir <= given segno.
-fn remove_up_to(timeline_dir: &Path, wal_seg_size: usize, segno_up_to: XLogSegNo) -> Result<()> {
+/// Remove all WAL segments in timeline_dir that match the given predicate.
+fn remove_segments_from_disk(
+    timeline_dir: &Path,
+    wal_seg_size: usize,
+    remove_predicate: impl Fn(XLogSegNo) -> bool,
+) -> Result<()> {
     let mut n_removed = 0;
+    let mut min_removed = u64::MAX;
+    let mut max_removed = u64::MIN;
+
     for entry in fs::read_dir(&timeline_dir)?
{ let entry = entry?; let entry_path = entry.path(); @@ -504,19 +467,21 @@ fn remove_up_to(timeline_dir: &Path, wal_seg_size: usize, segno_up_to: XLogSegNo continue; } let (segno, _) = XLogFromFileName(fname_str, wal_seg_size); - if segno <= segno_up_to { + if remove_predicate(segno) { remove_file(entry_path)?; n_removed += 1; + min_removed = min(min_removed, segno); + max_removed = max(max_removed, segno); } } } - let segno_from = segno_up_to - n_removed + 1; - info!( - "removed {} WAL segments [{}; {}]", - n_removed, - XLogFileName(PG_TLI, segno_from, wal_seg_size), - XLogFileName(PG_TLI, segno_up_to, wal_seg_size) - ); + + if n_removed > 0 { + info!( + "removed {} WAL segments [{}; {}]", + n_removed, min_removed, max_removed + ); + } Ok(()) } @@ -526,8 +491,10 @@ pub struct WalReader { pos: Lsn, wal_segment: Option>>, - enable_remote_read: bool, // S3 will be used to read WAL if LSN is not available locally + enable_remote_read: bool, + + // We don't have WAL locally if LSN is less than local_start_lsn local_start_lsn: Lsn, } From 7863c4a702617b2af5917d6a273a675395455e69 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 08:37:06 +0300 Subject: [PATCH 20/90] Regenerate Hakari files, add a CI check for that --- .github/workflows/codestyle.yml | 10 +++++++++- Cargo.lock | 22 +++------------------- libs/postgres_ffi/wal_craft/Cargo.toml | 1 + workspace_hack/Cargo.toml | 6 ++---- 4 files changed, 15 insertions(+), 24 deletions(-) diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 237cf81205..5220258ef0 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -30,7 +30,7 @@ jobs: # this is all we need to install our toolchain later via rust-toolchain.toml # so don't install any toolchain explicitly. 
os: [ubuntu-latest, macos-latest] - timeout-minutes: 60 + timeout-minutes: 90 name: check codestyle rust and postgres runs-on: ${{ matrix.os }} @@ -108,6 +108,14 @@ jobs: target key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust + # https://github.com/facebookincubator/cargo-guppy/tree/main/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check every project module is covered by Hakari + run: | + cargo install cargo-hakari + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + shell: bash -euxo pipefail {0} + - name: Run cargo clippy run: ./run_clippy.sh diff --git a/Cargo.lock b/Cargo.lock index 2f4a57b698..3ce0ce465f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -377,13 +377,9 @@ version = "2.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" dependencies = [ - "ansi_term", - "atty", "bitflags", - "strsim 0.8.0", "textwrap 0.11.0", "unicode-width", - "vec_map", ] [[package]] @@ -396,7 +392,7 @@ dependencies = [ "bitflags", "clap_lex", "indexmap", - "strsim 0.10.0", + "strsim", "termcolor", "textwrap 0.15.0", ] @@ -746,7 +742,7 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim 0.10.0", + "strsim", "syn", ] @@ -3023,12 +3019,6 @@ dependencies = [ "unicode-normalization", ] -[[package]] -name = "strsim" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" - [[package]] name = "strsim" version = "0.10.0" @@ -3685,12 +3675,6 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" -[[package]] -name = "vec_map" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" - [[package]] name = "version_check" version = "0.9.4" @@ -3709,6 +3693,7 @@ dependencies = [ "postgres", "postgres_ffi", "tempfile", + "workspace_hack", ] [[package]] @@ -3942,7 +3927,6 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 2.34.0", "either", "fail", "futures-channel", diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index f848ac1273..88466737ed 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -14,3 +14,4 @@ once_cell = "1.13.0" postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres_ffi = { path = "../" } tempfile = "3.2" +workspace_hack = { version = "0.1", path = "../../../workspace_hack" } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 096b3a5d70..96594bbf96 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,7 +19,6 @@ anyhow = { version = "1", features = ["backtrace", "std"] } bstr = { version = "0.2", features = ["lazy_static", "regex-automata", "serde", "serde1", "serde1-nostd", "std", "unicode"] } bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } -clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } fail 
= { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } @@ -46,16 +45,15 @@ regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } -tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "winapi"] } +tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } -tracing-core = { version = "0.1", features = ["once_cell", "std", "valuable"] } +tracing-core = { version = "0.1", features = ["once_cell", "std"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } -clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } From a5019bf771e878b8e3f02563d7803580450ff39f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Sep 2022 12:38:47 +0300 Subject: [PATCH 21/90] Use a simpler way to set extra options for benchmark test. Commit 43a4f7173e fixed the case that there are extra options in the connection string, but broke it in the case when there are not. Fix that. But on second thoughts, it's more straightforward set the options with ALTER DATABASE, so change the workflow yaml file to do that instead. 
--- .github/workflows/benchmarking.yml | 13 +++++++++---- test_runner/performance/test_perf_pgbench.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 0430f0b9c0..4e28223c18 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -183,12 +183,9 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; - neon-captest-new) + neon-captest-new | neon-captest-prefetch) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; - neon-captest-prefetch) - CONNSTR=${{ steps.create-neon-project.outputs.dsn }}?options=-cenable_seqscan_prefetch%3Don%20-cseqscan_prefetch_buffers%3D10 - ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} ;; @@ -204,6 +201,14 @@ jobs: env: PLATFORM: ${{ matrix.platform }} + - name: Set database options + if: matrix.platform == 'neon-captest-prefetch' + run: | + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET enable_seqscan_prefetch=on" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET seqscan_prefetch_buffers=10" + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + - name: Benchmark init uses: ./.github/actions/run-python-test-set with: diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index d9bf237e49..e167ddaafa 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -84,7 +84,7 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P if workload_type == PgBenchLoadType.INIT: # Run initialize - options = "-cstatement_timeout=1h " + env.pg.default_options["options"] + options = "-cstatement_timeout=1h " + env.pg.default_options.get("options", "") init_pgbench(env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options=options)]) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: From 4b25b9652a024dd876259088ef8fad56e708ba4d Mon Sep 17 00:00:00 2001 From: sharnoff Date: Tue, 20 Sep 2022 11:06:31 -0700 Subject: [PATCH 22/90] Rename more zid-like idents (#2480) Follow-up to PR #2433 (b8eb908a). There's still a few more unresolved locations that have been left as-is for the same compatibility reasons in the original PR. --- libs/utils/benches/benchmarks.rs | 12 +++---- libs/utils/src/pq_proto.rs | 46 +++++++++++++------------- pageserver/src/tenant.rs | 2 +- pgxn/neon/libpagestore.c | 12 +++---- pgxn/neon/pagestore_client.h | 6 ++-- pgxn/neon/pagestore_smgr.c | 12 +++---- safekeeper/src/control_file_upgrade.rs | 2 +- 7 files changed, 46 insertions(+), 46 deletions(-) diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index badcb5774e..98d839ca55 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -3,20 +3,20 @@ use criterion::{criterion_group, criterion_main, Criterion}; use utils::id; -pub fn bench_zid_stringify(c: &mut Criterion) { +pub fn bench_id_stringify(c: &mut Criterion) { // Can only use public methods. - let ztl = id::TenantTimelineId::generate(); + let ttid = id::TenantTimelineId::generate(); - c.bench_function("zid.to_string", |b| { + c.bench_function("id.to_string", |b| { b.iter(|| { // FIXME measurement overhead? 
//for _ in 0..1000 { - // ztl.tenant_id.to_string(); + // ttid.tenant_id.to_string(); //} - ztl.tenant_id.to_string(); + ttid.tenant_id.to_string(); }) }); } -criterion_group!(benches, bench_zid_stringify); +criterion_group!(benches, bench_id_stringify); criterion_main!(benches); diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index dde76039d7..21952ab87e 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -931,7 +931,7 @@ impl ReplicationFeedback { // Deserialize ReplicationFeedback message pub fn parse(mut buf: Bytes) -> ReplicationFeedback { - let mut zf = ReplicationFeedback::empty(); + let mut rf = ReplicationFeedback::empty(); let nfields = buf.get_u8(); for _ in 0..nfields { let key = read_cstr(&mut buf).unwrap(); @@ -939,31 +939,31 @@ impl ReplicationFeedback { b"current_timeline_size" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.current_timeline_size = buf.get_u64(); + rf.current_timeline_size = buf.get_u64(); } b"ps_writelsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_writelsn = buf.get_u64(); + rf.ps_writelsn = buf.get_u64(); } b"ps_flushlsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_flushlsn = buf.get_u64(); + rf.ps_flushlsn = buf.get_u64(); } b"ps_applylsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_applylsn = buf.get_u64(); + rf.ps_applylsn = buf.get_u64(); } b"ps_replytime" => { let len = buf.get_i32(); assert_eq!(len, 8); let raw_time = buf.get_i64(); if raw_time > 0 { - zf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); + rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); } else { - zf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); + rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); } } _ => { @@ -976,8 +976,8 @@ impl ReplicationFeedback { } } } - trace!("ReplicationFeedback parsed is {:?}", zf); - zf + trace!("ReplicationFeedback parsed is {:?}", rf); + rf } } @@ -987,29 +987,29 @@ mod tests { #[test] fn test_replication_feedback_serialization() { - let mut zf = ReplicationFeedback::empty(); - // Fill zf with some values - zf.current_timeline_size = 12345678; + let mut rf = ReplicationFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. - zf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); - zf.serialize(&mut data).unwrap(); + rf.serialize(&mut data).unwrap(); - let zf_parsed = ReplicationFeedback::parse(data.freeze()); - assert_eq!(zf, zf_parsed); + let rf_parsed = ReplicationFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); } #[test] fn test_replication_feedback_unknown_key() { - let mut zf = ReplicationFeedback::empty(); - // Fill zf with some values - zf.current_timeline_size = 12345678; + let mut rf = ReplicationFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. 
- zf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); - zf.serialize(&mut data).unwrap(); + rf.serialize(&mut data).unwrap(); // Add an extra field to the buffer and adjust number of keys if let Some(first) = data.first_mut() { @@ -1021,8 +1021,8 @@ mod tests { data.put_u64(42); // Parse serialized data and check that new field is not parsed - let zf_parsed = ReplicationFeedback::parse(data.freeze()); - assert_eq!(zf, zf_parsed); + let rf_parsed = ReplicationFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); } #[test] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f56f10d7ea..204caf6dfa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -132,7 +132,7 @@ pub enum TenantState { /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. impl Tenant { - /// Get Timeline handle for given zenith timeline ID. + /// Get Timeline handle for given Neon timeline ID. /// This function is idempotent. It doesn't change internal state in any way. pub fn get_timeline(&self, timeline_id: TimelineId) -> anyhow::Result> { self.timelines diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 296865838d..9cd2a86941 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -183,7 +183,7 @@ pageserver_send(NeonRequest * request) if (!connected) pageserver_connect(); - req_buff = zm_pack_request(request); + req_buff = nm_pack_request(request); /* * Send request. @@ -204,7 +204,7 @@ pageserver_send(NeonRequest * request) if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((NeonMessage *) request); + char *msg = nm_to_string((NeonMessage *) request); neon_log(PageStoreTrace, "sent request: %s", msg); pfree(msg); @@ -230,12 +230,12 @@ pageserver_receive(void) else if (resp_buff.len == -2) neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); } - resp = zm_unpack_response(&resp_buff); + resp = nm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((NeonMessage *) resp); + char *msg = nm_to_string((NeonMessage *) resp); neon_log(PageStoreTrace, "got response: %s", msg); pfree(msg); @@ -282,9 +282,9 @@ page_server_api api = { static bool check_neon_id(char **newval, void **extra, GucSource source) { - uint8 zid[16]; + uint8 id[16]; - return **newval == '\0' || HexDecodeString(zid, *newval, 16); + return **newval == '\0' || HexDecodeString(id, *newval, 16); } static char * diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 633c7b465c..e0cda11b63 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -128,9 +128,9 @@ typedef struct * message */ } NeonErrorResponse; -extern StringInfoData zm_pack_request(NeonRequest * msg); -extern NeonResponse * zm_unpack_response(StringInfo s); -extern char *zm_to_string(NeonMessage * msg); +extern StringInfoData nm_pack_request(NeonRequest * msg); +extern NeonResponse * nm_unpack_response(StringInfo s); +extern char *nm_to_string(NeonMessage * msg); /* * API diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 8e6dd373b0..1187550f2a 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -160,7 +160,7 @@ page_server_request(void const *req) StringInfoData 
-zm_pack_request(NeonRequest * msg) +nm_pack_request(NeonRequest * msg) { StringInfoData s; @@ -235,7 +235,7 @@ zm_pack_request(NeonRequest * msg) } NeonResponse * -zm_unpack_response(StringInfo s) +nm_unpack_response(StringInfo s) { NeonMessageTag tag = pq_getmsgbyte(s); NeonResponse *resp = NULL; @@ -329,7 +329,7 @@ zm_unpack_response(StringInfo s) /* dump to json for debugging / error reporting purposes */ char * -zm_to_string(NeonMessage * msg) +nm_to_string(NeonMessage * msg) { StringInfoData s; @@ -632,7 +632,7 @@ neon_init(void) * It may cause problems with XLogFlush. So return pointer backward to the origin of the page. */ static XLogRecPtr -zm_adjust_lsn(XLogRecPtr lsn) +nm_adjust_lsn(XLogRecPtr lsn) { /* * If lsn points to the beging of first record on page or segment, then @@ -685,7 +685,7 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); - lsn = zm_adjust_lsn(lsn); + lsn = nm_adjust_lsn(lsn); /* * Is it possible that the last-written LSN is ahead of last flush @@ -1569,7 +1569,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ lsn = GetXLogInsertRecPtr(); - lsn = zm_adjust_lsn(lsn); + lsn = nm_adjust_lsn(lsn); /* * Flush it, too. We don't actually care about it here, but let's uphold diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 87204d6b49..d8434efb20 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -167,7 +167,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), }); - // migrate to hexing some zids + // migrate to hexing some ids } else if version == 2 { info!("reading safekeeper control file version {}", version); let oldstate = SafeKeeperStateV2::des(&buf[..buf.len()])?; From 4a3b3ff11d89d02300041e32f43847110637f2e0 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Tue, 20 Sep 2022 11:28:12 -0700 Subject: [PATCH 23/90] Move testing pageserver libpq cmds to HTTP api (#2429) Closes #2422. The APIs have been feature gated with the `testing_api!` macro so that they return 400s when support hasn't been compiled in. 
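For example, a test can now flip a failpoint with a plain HTTP request instead
of a libpq command; a minimal sketch (hypothetical host, port and failpoint
name; the JSON body follows the new ConfigureFailpointsRequest model):

    import requests

    resp = requests.put(
        "http://localhost:9898/v1/failpoints",
        json=[{"name": "some-failpoint", "actions": "return"}],
    )
    resp.raise_for_status()  # answers 400 when "testing" support is not compiled in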
---
 .cargo/config.toml                            |   2 +-
 .github/workflows/build_and_test.yml          |   4 +-
 README.md                                     |   7 +-
 pageserver/Cargo.toml                         |   6 +-
 pageserver/src/bin/pageserver.rs              |   4 +-
 pageserver/src/http/models.rs                 |  18 +++
 pageserver/src/http/routes.rs                 | 141 ++++++++++++++++++
 pageserver/src/page_service.rs                | 115 +-------------
 pageserver/src/repository.rs                  |  11 +-
 test_runner/README.md                         |   4 +-
 test_runner/fixtures/neon_fixtures.py         |  57 +++++++
 test_runner/regress/test_ancestor_branch.py   |   7 +-
 test_runner/regress/test_basebackup_error.py  |   3 +-
 test_runner/regress/test_branch_and_gc.py     |  11 +-
 test_runner/regress/test_branch_behind.py     |  13 +-
 test_runner/regress/test_broken_timeline.py   |   3 +-
 test_runner/regress/test_gc_aggressive.py     |  12 +-
 test_runner/regress/test_import.py            |   5 +-
 test_runner/regress/test_old_request_lsn.py   |   9 +-
 test_runner/regress/test_pitr_gc.py           |  15 +-
 test_runner/regress/test_readonly_node.py     |   3 +-
 test_runner/regress/test_recovery.py          |  41 ++---
 test_runner/regress/test_remote_storage.py    |   5 +-
 test_runner/regress/test_tenant_detach.py     |  23 +--
 test_runner/regress/test_tenant_relocation.py |  30 ++--
 test_runner/regress/test_tenants.py           |   3 +-
 .../test_tenants_with_remote_storage.py       |   2 +-
 test_runner/regress/test_timeline_size.py     |  21 ++-
 test_runner/regress/test_wal_acceptor.py      |   4 +-
 29 files changed, 352 insertions(+), 227 deletions(-)

diff --git a/.cargo/config.toml b/.cargo/config.toml
index d70d57a817..c40783bc1b 100644
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -13,4 +13,4 @@ opt-level = 3
 opt-level = 1
 
 [alias]
-build_testing = ["build", "--features", "failpoints"]
+build_testing = ["build", "--features", "testing"]
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 0b6cb21120..44db968753 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -100,11 +100,11 @@ jobs:
         run: |
           if [[ $BUILD_TYPE == "debug" ]]; then
             cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
-            CARGO_FEATURES="--features failpoints"
+            CARGO_FEATURES="--features testing"
             CARGO_FLAGS="--locked --timings $CARGO_FEATURES"
           elif [[ $BUILD_TYPE == "release" ]]; then
             cov_prefix=""
-            CARGO_FEATURES="--features failpoints,profiling"
+            CARGO_FEATURES="--features testing,profiling"
             CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES"
           fi
           echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
diff --git a/README.md b/README.md
index 03ed57a0fa..dc469c36b1 100644
--- a/README.md
+++ b/README.md
@@ -222,7 +222,12 @@ Ensure your dependencies are installed as described [here](https://github.com/ne
 
 ```sh
 git clone --recursive https://github.com/neondatabase/neon.git
-make # builds also postgres and installs it to ./pg_install
+
+# either:
+CARGO_BUILD_FLAGS="--features=testing" make
+# or:
+make debug
+
 ./scripts/pytest
 ```
 
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index ce09e788bd..85ece97d9b 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -5,10 +5,10 @@ edition = "2021"
 
 [features]
 default = []
+# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
+# which adds some runtime cost to run tests on outage conditions
+testing = ["fail/failpoints"]
 
-# Feature that enables a special API, fail_point!
macro (adds some runtime cost) -# to run tests on outage conditions -failpoints = ["fail/failpoints"] profiling = ["pprof"] [dependencies] diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 92d5eab379..fb79ad3945 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -87,8 +87,8 @@ fn main() -> anyhow::Result<()> { if arg_matches.is_present("enabled-features") { let features: &[&str] = &[ - #[cfg(feature = "failpoints")] - "failpoints", + #[cfg(feature = "testing")] + "testing", #[cfg(feature = "profiling")] "profiling", ]; diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index c0dc5b9677..2d7d560d2a 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -160,3 +160,21 @@ pub struct TimelineInfo { pub local: Option, pub remote: Option, } + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineGcRequest { + pub gc_horizon: Option, +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2e49429f38..bfc9e4462b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -29,6 +29,12 @@ use utils::{ lsn::Lsn, }; +// Imports only used for testing APIs +#[cfg(feature = "testing")] +use super::models::{ConfigureFailpointsRequest, TimelineGcRequest}; +#[cfg(feature = "testing")] +use crate::CheckpointConfig; + struct State { conf: &'static PageServerConf, auth: Option>, @@ -661,6 +667,103 @@ async fn tenant_config_handler(mut request: Request) -> Result) -> Result, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest( + "Cannot manage failpoints because pageserver was compiled without failpoints support" + .to_owned(), + )); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = if fp.actions == "exit" { + fail::cfg_callback(fp.name, || { + info!("Exit requested by failpoint"); + std::process::exit(1); + }) + } else { + fail::cfg(fp.name, &fp.actions) + }; + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(format!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} + +// Run GC immediately on given timeline. +// FIXME: This is just for tests. See test_runner/regress/test_gc.py. +// This probably should require special authentication or a global flag to +// enable, I don't think we want to or need to allow regular clients to invoke +// GC. 
+// @hllinnaka in commits ec44f4b29, 3aca717f3 +#[cfg(feature = "testing")] +async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + // FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX + let repo = tenant_mgr::get_tenant(tenant_id, false)?; + let gc_req: TimelineGcRequest = json_request(&mut request).await?; + + let _span_guard = + info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered(); + let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| repo.get_gc_horizon()); + + // Use tenant's pitr setting + let pitr = repo.get_pitr_interval(); + let result = repo.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; + json_response(StatusCode::OK, result) +} + +// Run compaction immediately on given timeline. +// FIXME This is just for tests. Don't expect this to be exposed to +// the users or the api. +// @dhammika in commit a0781f229 +#[cfg(feature = "testing")] +async fn timeline_compact_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let repo = tenant_mgr::get_tenant(tenant_id, true)?; + // FIXME: currently this will return a 500 error on bad timeline id; it should be 4XX + let timeline = repo.get_timeline(timeline_id).with_context(|| { + format!("No timeline {timeline_id} in repository for tenant {tenant_id}") + })?; + timeline.compact()?; + + json_response(StatusCode::OK, ()) +} + +// Run checkpoint immediately on given timeline. +#[cfg(feature = "testing")] +async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let repo = tenant_mgr::get_tenant(tenant_id, true)?; + // FIXME: currently this will return a 500 error on bad timeline id; it should be 4XX + let timeline = repo.get_timeline(timeline_id).with_context(|| { + format!("No timeline {timeline_id} in repository for tenant {tenant_id}") + })?; + timeline.checkpoint(CheckpointConfig::Forced)?; + + json_response(StatusCode::OK, ()) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -687,12 +790,38 @@ pub fn make_router( })) } + macro_rules! testing_api { + ($handler_desc:literal, $handler:path $(,)?) 
=> {{ + #[cfg(not(feature = "testing"))] + async fn cfg_disabled(_req: Request) -> Result, ApiError> { + Err(ApiError::BadRequest( + concat!( + "Cannot ", + $handler_desc, + " because pageserver was compiled without testing APIs", + ) + .to_owned(), + )) + } + + #[cfg(feature = "testing")] + let handler = $handler; + #[cfg(not(feature = "testing"))] + let handler = cfg_disabled; + handler + }}; + } + Ok(router .data(Arc::new( State::new(conf, auth, remote_index, remote_storage) .context("Failed to initialize router state")?, )) .get("/v1/status", status_handler) + .put( + "/v1/failpoints", + testing_api!("manage failpoints", failpoints_handler), + ) .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) .get("/v1/tenant/:tenant_id", tenant_status) @@ -705,6 +834,18 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler, ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", + testing_api!("run timeline GC", timeline_gc_handler), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", + testing_api!("run timeline compaction", timeline_compact_handler), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", + testing_api!("run timeline checkpoint", timeline_checkpoint_handler), + ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1461a6d117..9e159f7391 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -27,7 +27,7 @@ use utils::{ lsn::Lsn, postgres_backend::AuthType, postgres_backend_async::{self, PostgresBackend}, - pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, + pq_proto::{BeMessage, FeMessage, RowDescriptor}, simple_rcu::RcuReadGuard, }; @@ -1005,31 +1005,6 @@ impl postgres_backend_async::Handler for PageServerHandler { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("failpoints ") { - ensure!(fail::has_failpoints(), "Cannot manage failpoints because pageserver was compiled without failpoints support"); - - let (_, failpoints) = query_string.split_at("failpoints ".len()); - - for failpoint in failpoints.split(';') { - if let Some((name, actions)) = failpoint.split_once('=') { - info!("cfg failpoint: {} {}", name, actions); - - // We recognize one extra "action" that's not natively recognized - // by the failpoints crate: exit, to immediately kill the process - if actions == "exit" { - fail::cfg_callback(name, || { - info!("Exit requested by failpoint"); - std::process::exit(1); - }) - .unwrap(); - } else { - fail::cfg(name, actions).unwrap(); - } - } else { - bail!("Invalid failpoints format"); - } - } - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("show ") { // show let (_, params_raw) = query_string.split_at("show ".len()); @@ -1072,94 +1047,6 @@ impl postgres_backend_async::Handler for PageServerHandler { Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("do_gc ") { - // Run GC immediately on given timeline. - // FIXME: This is just for tests. See test_runner/regress/test_gc.py. 
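A consequence of the `testing_api!` wrapper above is that the testing routes are always registered: a build made without `--features testing` answers them with a BadRequest rather than a 404. A sketch of what a client would observe in that case, assuming `ApiError::BadRequest` maps to HTTP 400 and using an illustrative port:

    import requests

    resp = requests.put("http://localhost:9898/v1/failpoints", json=[])
    if resp.status_code == 400:
        # The cfg_disabled stub generated by testing_api! answered.
        assert "compiled without testing APIs" in resp.text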
- // This probably should require special authentication or a global flag to - // enable, I don't think we want to or need to allow regular clients to invoke - // GC. - - // do_gc - let re = Regex::new(r"^do_gc ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)([[:digit:]]+)?") - .unwrap(); - - let caps = re - .captures(query_string) - .with_context(|| format!("invalid do_gc: '{}'", query_string))?; - - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - - let _span_guard = - info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered(); - - let tenant = tenant_mgr::get_tenant(tenant_id, true)?; - - let gc_horizon: u64 = caps - .get(4) - .map(|h| h.as_str().parse()) - .unwrap_or_else(|| Ok(tenant.get_gc_horizon()))?; - - // Use tenant's pitr setting - let pitr = tenant.get_pitr_interval(); - let result = tenant.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; - pgb.write_message(&BeMessage::RowDescription(&[ - RowDescriptor::int8_col(b"layers_total"), - RowDescriptor::int8_col(b"layers_needed_by_cutoff"), - RowDescriptor::int8_col(b"layers_needed_by_pitr"), - RowDescriptor::int8_col(b"layers_needed_by_branches"), - RowDescriptor::int8_col(b"layers_not_updated"), - RowDescriptor::int8_col(b"layers_removed"), - RowDescriptor::int8_col(b"elapsed"), - ]))? - .write_message(&BeMessage::DataRow(&[ - Some(result.layers_total.to_string().as_bytes()), - Some(result.layers_needed_by_cutoff.to_string().as_bytes()), - Some(result.layers_needed_by_pitr.to_string().as_bytes()), - Some(result.layers_needed_by_branches.to_string().as_bytes()), - Some(result.layers_not_updated.to_string().as_bytes()), - Some(result.layers_removed.to_string().as_bytes()), - Some(result.elapsed.as_millis().to_string().as_bytes()), - ]))? - .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("compact ") { - // Run compaction immediately on given timeline. - // FIXME This is just for tests. Don't expect this to be exposed to - // the users or the api. - - // compact - let re = Regex::new(r"^compact ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap(); - - let caps = re - .captures(query_string) - .with_context(|| format!("Invalid compact: '{}'", query_string))?; - - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = get_local_timeline(tenant_id, timeline_id)?; - timeline.compact()?; - - pgb.write_message(&SINGLE_COL_ROWDESC)? - .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("checkpoint ") { - // Run checkpoint immediately on given timeline. - - // checkpoint - let re = Regex::new(r"^checkpoint ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap(); - - let caps = re - .captures(query_string) - .with_context(|| format!("invalid checkpoint command: '{}'", query_string))?; - - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = get_local_timeline(tenant_id, timeline_id)?; - - // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). - timeline.checkpoint(CheckpointConfig::Forced)?; - - pgb.write_message(&SINGLE_COL_ROWDESC)? 
-            .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
         } else if query_string.starts_with("get_lsn_by_timestamp ") {
             // Locate LSN of last transaction with timestamp less or equal than specified
             // TODO lazy static
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs
index f6ea9d8c5d..cfcc87a2ed 100644
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -176,7 +176,7 @@ impl Value {
 ///
 /// Result of performing GC
 ///
-#[derive(Default)]
+#[derive(Default, Serialize)]
 pub struct GcResult {
     pub layers_total: u64,
     pub layers_needed_by_cutoff: u64,
@@ -185,9 +185,18 @@ pub struct GcResult {
     pub layers_not_updated: u64,
     pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
+    #[serde(serialize_with = "serialize_duration_as_millis")]
     pub elapsed: Duration,
 }
 
+// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
+fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: serde::Serializer,
+{
+    d.as_millis().serialize(serializer)
+}
+
 impl AddAssign for GcResult {
     fn add_assign(&mut self, other: Self) {
         self.layers_total += other.layers_total;
diff --git a/test_runner/README.md b/test_runner/README.md
index f17a4a5a5d..79b2418af6 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -6,9 +6,9 @@ Prerequisites:
 - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python)
 - Neon and Postgres binaries
   - See the root [README.md](/README.md) for build directions
-    If you want to test tests with failpoints, you would need to add `--features failpoints` to Rust code build commands.
+    If you want to run tests that use the test-only APIs, you need to add `--features testing` to the Rust build commands. For convenience, the repository cargo config contains a `build_testing` alias that serves as a subcommand, adding the required feature flags.
-    Usage example: `cargo build_testing --release` is equivalent to `cargo build --features failpoints --release`
+    Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release`
 - Tests can be run from the git tree; or see the environment variables below to run from other directories.
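Since `GcResult` now derives `Serialize`, with `elapsed` flattened to an integer number of milliseconds, the GC endpoint can return the whole summary as JSON. A sketch of consuming it through the `timeline_gc` fixture helper added below; the sample values are made up:

    # Inside a regress test, with `pageserver_http`, `tenant_id` and
    # `timeline_id` in scope as usual.
    gc_result = pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
    # The keys mirror the GcResult fields, e.g.:
    # {"layers_total": 7, ..., "layers_removed": 2, "elapsed": 42}
    assert gc_result["layers_removed"] >= 0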
- The neon git repo, including the postgres submodule diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0c03429f95..1e83ee3839 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -964,6 +964,24 @@ class NeonPageserverHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]) -> None: + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + def tenant_list(self) -> List[Dict[Any, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant") self.verbose_error(res) @@ -1061,6 +1079,45 @@ class NeonPageserverHttpClient(requests.Session): assert res_json is None return res_json + def timeline_gc( + self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] + ) -> dict[str, Any]: + log.info( + f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}" + ) + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc", + json={"gc_horizon": gc_horizon}, + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + return res_json + + def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId): + log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact" + ) + log.info(f"Got compact request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + + def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): + log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint" + ) + log.info(f"Got checkpoint request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index cb2621ff02..d7aebfb938 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -9,6 +9,7 @@ from fixtures.utils import query_scalar # def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() # Override defaults, 1M gc_horizon and 4M checkpoint_distance. # Extend compaction_period and gc_period to disable background compaction and gc. 
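The three timeline helpers above compose into the usual flush-then-reclaim sequence that several of the converted tests follow; a generic sketch rather than a specific test from this series (`print_gc_result` comes from `fixtures.utils`):

    pageserver_http = env.pageserver.http_client()

    # Flush open layers to disk, reorganize them, then drop whatever GC allows.
    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
    pageserver_http.timeline_compact(tenant_id, timeline_id)
    gc_result = pageserver_http.timeline_gc(tenant_id, timeline_id, gc_horizon=0)
    print_gc_result(gc_result)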
@@ -23,7 +24,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): } ) - env.pageserver.safe_psql("failpoints flush-frozen-before-sync=sleep(10000)") + pageserver_http.configure_failpoints(("flush-frozen-before-sync", "sleep(10000)")) pg_branch0 = env.postgres.create_start("main", tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() @@ -92,9 +93,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 300k rows: {lsn_300}") # Run compaction on branch1. - compact = f"compact {tenant} {branch1_timeline} {lsn_200}" + compact = f"compact {tenant} {branch1_timeline}" log.info(compact) - env.pageserver.safe_psql(compact) + pageserver_http.timeline_compact(tenant, branch1_timeline) assert query_scalar(branch0_cur, "SELECT count(*) FROM foo") == 100000 diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py index 81a46ee2f0..94d3999d17 100644 --- a/test_runner/regress/test_basebackup_error.py +++ b/test_runner/regress/test_basebackup_error.py @@ -9,9 +9,10 @@ from fixtures.neon_fixtures import NeonEnv def test_basebackup_error(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_basebackup_error", "empty") + pageserver_http = env.pageserver.http_client() # Introduce failpoint - env.pageserver.safe_psql("failpoints basebackup-before-control-file=return") + pageserver_http.configure_failpoints(("basebackup-before-control-file", "return")) with pytest.raises(Exception, match="basebackup-before-control-file"): env.postgres.create_start("test_basebackup_error") diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index c8c5929066..12debe50eb 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -47,6 +47,7 @@ from fixtures.utils import query_scalar # could not find data for key ... at LSN ..., for request at LSN ... def test_branch_and_gc(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() tenant, _ = env.neon_cli.create_tenant( conf={ @@ -84,7 +85,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # Set the GC horizon so that lsn1 is inside the horizon, which means # we can create a new branch starting from lsn1. - env.pageserver.safe_psql(f"do_gc {tenant} {timeline_main} {lsn2 - lsn1 + 1024}") + pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024) env.neon_cli.create_branch( "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1 @@ -113,6 +114,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # For more details, see discussion in https://github.com/neondatabase/neon/pull/2101#issuecomment-1185273447. def test_branch_creation_before_gc(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() + # Disable background GC but set the `pitr_interval` to be small, so GC can delete something tenant, _ = env.neon_cli.create_tenant( conf={ @@ -147,10 +150,10 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the # branch creation task but the individual timeline GC iteration happens *after* # the branch creation task. 
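For reference, the `actions` strings used across these tests follow the grammar of the `fail` crate's `fail::cfg`, plus the pageserver-specific `exit`; all three forms below appear somewhere in this series:

    pageserver_http.configure_failpoints(("before-timeline-gc", "sleep(2000)"))          # delay the hit
    pageserver_http.configure_failpoints(("basebackup-before-control-file", "return"))   # fail at the hit
    pageserver_http.configure_failpoints(("checkpoint-after-sync", "exit"))              # kill the process at the hit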
- env.pageserver.safe_psql("failpoints before-timeline-gc=sleep(2000)") + pageserver_http_client.configure_failpoints(("before-timeline-gc", "sleep(2000)")) def do_gc(): - env.pageserver.safe_psql(f"do_gc {tenant} {b0} 0") + pageserver_http_client.timeline_gc(tenant, b0, 0) thread = threading.Thread(target=do_gc, daemon=True) thread.start() @@ -161,7 +164,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): time.sleep(1.0) # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC. - with pytest.raises(Exception, match="invalid branch start lsn"): + with pytest.raises(Exception, match="invalid branch start lsn: .*"): env.neon_cli.create_branch("b1", "b0", tenant_id=tenant, ancestor_start_lsn=lsn) thread.join() diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index b0d0737172..0e2a8b346b 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -1,4 +1,3 @@ -import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -96,7 +95,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): assert pg.safe_psql("SELECT 1")[0][0] == 1 # branch at pre-initdb lsn - with pytest.raises(Exception, match="invalid branch start lsn"): + with pytest.raises(Exception, match="invalid branch start lsn: .*"): env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn=Lsn("0/42")) # branch at pre-ancestor lsn @@ -106,13 +105,11 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): ) # check that we cannot create branch based on garbage collected data - with env.pageserver.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - # call gc to advace latest_gc_cutoff_lsn - pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) + with env.pageserver.http_client() as pageserver_http: + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) - with pytest.raises(Exception, match="invalid branch start lsn"): + with pytest.raises(Exception, match="invalid branch start lsn: .*"): # this gced_lsn is pretty random, so if gc is disabled this woudln't fail env.neon_cli.create_branch( "test_branch_create_fail", "test_branch_behind", ancestor_start_lsn=gced_lsn diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index fd81981b2b..7baa67935d 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -113,13 +113,14 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http = env.pageserver.http_client() tenant_id, _ = env.neon_cli.create_tenant() old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) # Introduce failpoint when creating a new timeline - env.pageserver.safe_psql("failpoints before-checkpoint-new-timeline=return") + pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return")) with pytest.raises(Exception, match="before-checkpoint-new-timeline"): _ = env.neon_cli.create_timeline("test_fix_broken_timelines", tenant_id) diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 88d4ad8a6e..332bef225f 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ 
b/test_runner/regress/test_gc_aggressive.py @@ -1,4 +1,5 @@ import asyncio +import concurrent.futures import random from fixtures.log_helper import log @@ -30,10 +31,15 @@ async def update_table(pg: Postgres): # Perform aggressive GC with 0 horizon async def gc(env: NeonEnv, timeline: TimelineId): - psconn = await env.pageserver.connect_async() + pageserver_http = env.pageserver.http_client() - while updates_performed < updates_to_perform: - await psconn.execute(f"do_gc {env.initial_tenant} {timeline} 0") + loop = asyncio.get_running_loop() + + with concurrent.futures.ThreadPoolExecutor() as pool: + while updates_performed < updates_to_perform: + await loop.run_in_executor( + pool, lambda: pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + ) # At the same time, run UPDATEs and GC diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 7b61b03b97..885a0dc26f 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -270,8 +270,7 @@ def _import( assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file) # Check that gc works - psconn = env.pageserver.connect() - pscur = psconn.cursor() - pscur.execute(f"do_gc {tenant} {timeline} 0") + pageserver_http = env.pageserver.http_client() + pageserver_http.timeline_gc(tenant, timeline, 0) return tar_output_file diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index c99e13f45f..3e387bb6cc 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,4 +1,3 @@ -import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.types import TimelineId @@ -29,8 +28,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Get the timeline ID of our branch. We need it for the 'do_gc' command timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) - psconn = env.pageserver.connect() - pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) + pageserver_http = env.pageserver.http_client() # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers. @@ -61,9 +59,8 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. for i in range(10): - pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") - gcrow = pscur.fetchone() - print_gc_result(gcrow) + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) for j in range(100): cur.execute("UPDATE foo SET val = val + 1 WHERE id = 1;") diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 57b2ee1c04..d8b7256577 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -1,6 +1,3 @@ -from contextlib import closing - -import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.types import TimelineId @@ -54,13 +51,11 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 10000 rows: {debug_lsn} xid {debug_xid}") # run GC - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - pscur.execute(f"compact {env.initial_tenant} {timeline}") - # perform aggressive GC. 
Data still should be kept because of the PITR setting. - pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) + with env.pageserver.http_client() as pageserver_http: + pageserver_http.timeline_compact(env.initial_tenant, timeline) + # perform aggressive GC. Data still should be kept because of the PITR setting. + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) # Branch at the point where only 100 rows were inserted # It must have been preserved by PITR setting diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 3be64e077f..dfa57aec25 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -106,6 +106,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Similar test, but with more data, and we force checkpoints def test_timetravel(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() env.neon_cli.create_branch("test_timetravel", "empty") pg = env.postgres.create_start("test_timetravel") @@ -136,7 +137,7 @@ def test_timetravel(neon_simple_env: NeonEnv): wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to force a new layer file - env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http_client.timeline_checkpoint(tenant_id, timeline_id) ##### Restart pageserver env.postgres.stop_all() diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 08c15d8f09..d0ba96e8e0 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -1,7 +1,6 @@ import time from contextlib import closing -import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -19,8 +18,8 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): f = env.neon_cli.pageserver_enabled_features() assert ( - "failpoints" in f["features"] - ), "Build pageserver with --features=failpoints option to run this test" + "testing" in f["features"] + ), "Build pageserver with --features=testing option to run this test" neon_env_builder.start() # Create a branch for us @@ -31,26 +30,28 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): with closing(pg.connect()) as conn: with conn.cursor() as cur: - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - # Create and initialize test table - cur.execute("CREATE TABLE foo(x bigint)") - cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))") + with env.pageserver.http_client() as pageserver_http: + # Create and initialize test table + cur.execute("CREATE TABLE foo(x bigint)") + cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))") - # Sleep for some time to let checkpoint create image layers - time.sleep(2) + # Sleep for some time to let checkpoint create image layers + time.sleep(2) - # Configure failpoints - pscur.execute( - "failpoints flush-frozen-before-sync=sleep(2000);checkpoint-after-sync=exit" - ) + # Configure failpoints + pageserver_http.configure_failpoints( + [ + ("flush-frozen-before-sync", "sleep(2000)"), + ("checkpoint-after-sync", "exit"), + ] + ) - # Do some updates until pageserver is crashed - try: - while True: - cur.execute("update foo set x=x+1") - except Exception as err: - log.info(f"Expected server crash 
{err}") + # Do some updates until pageserver is crashed + try: + while True: + cur.execute("update foo set x=x+1") + except Exception as err: + log.info(f"Expected server crash {err}") log.info("Wait before server restart") env.pageserver.stop() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index cbe74cad5c..3e775b10b0 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -57,6 +57,7 @@ def test_remote_storage_backup_and_restore( ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") client = env.pageserver.http_client() @@ -80,7 +81,7 @@ def test_remote_storage_backup_and_restore( wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to be sure that data landed in remote storage - env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) log.info(f"waiting for checkpoint {checkpoint_number} upload") # wait until pageserver successfully uploaded a checkpoint to remote storage @@ -99,7 +100,7 @@ def test_remote_storage_backup_and_restore( env.pageserver.start() # Introduce failpoint in download - env.pageserver.safe_psql("failpoints remote-storage-download-pre-rename=return") + pageserver_http.configure_failpoints(("remote-storage-download-pre-rename", "return")) client.tenant_attach(tenant_id) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index e3c9a091f9..f18e6867a9 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -1,16 +1,21 @@ from threading import Thread -import psycopg2 import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + NeonPageserverApiException, + NeonPageserverHttpClient, +) from fixtures.types import TenantId, TimelineId -def do_gc_target(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): +def do_gc_target( + pageserver_http: NeonPageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId +): """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: - env.pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0") + pageserver_http.timeline_gc(tenant_id, timeline_id, 0) except Exception as e: log.error("do_gc failed: %s", e) @@ -44,13 +49,13 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # gc should not try to even start with pytest.raises( - expected_exception=psycopg2.DatabaseError, match="gc target timeline does not exist" + expected_exception=NeonPageserverApiException, match="gc target timeline does not exist" ): bogus_timeline_id = TimelineId.generate() - env.pageserver.safe_psql(f"do_gc {tenant_id} {bogus_timeline_id} 0") + pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) # try to concurrently run gc and detach - gc_thread = Thread(target=lambda: do_gc_target(env, tenant_id, timeline_id)) + gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id)) gc_thread.start() last_error = None @@ -73,6 +78,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() with pytest.raises( - 
expected_exception=psycopg2.DatabaseError, match=f"Tenant {tenant_id} not found" + expected_exception=NeonPageserverApiException, match=f"Tenant {tenant_id} not found" ): - env.pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0") + pageserver_http.timeline_gc(tenant_id, timeline_id, 0) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index aa7d92f1fd..2b01546198 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -147,14 +147,13 @@ def populate_branch( def ensure_checkpoint( - pageserver_cur, pageserver_http: NeonPageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, current_lsn: Lsn, ): # run checkpoint manually to be sure that data landed in remote storage - pageserver_cur.execute(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) # wait until pageserver successfully uploaded a checkpoint to remote storage wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) @@ -324,22 +323,19 @@ def test_tenant_relocation( # this requirement introduces a problem # if user creates a branch during migration # it wont appear on the new pageserver - with pg_cur(env.pageserver) as cur: - ensure_checkpoint( - cur, - pageserver_http=pageserver_http, - tenant_id=tenant_id, - timeline_id=timeline_id_main, - current_lsn=current_lsn_main, - ) + ensure_checkpoint( + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_main, + current_lsn=current_lsn_main, + ) - ensure_checkpoint( - cur, - pageserver_http=pageserver_http, - tenant_id=tenant_id, - timeline_id=timeline_id_second, - current_lsn=current_lsn_second, - ) + ensure_checkpoint( + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_second, + current_lsn=current_lsn_second, + ) log.info("inititalizing new pageserver") # bootstrap second pageserver diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 4500395c8f..52b9e6369c 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -19,7 +19,8 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ) initial_tenant_dirs = set([d for d in tenants_dir.iterdir()]) - neon_simple_env.pageserver.safe_psql("failpoints tenant-creation-before-tmp-rename=return") + pageserver_http = neon_simple_env.pageserver.http_client() + pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return")) with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"): _ = neon_simple_env.neon_cli.create_tenant() diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 85f371c845..83affac062 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -91,5 +91,5 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) # run final checkpoint manually to flush all the data to remote storage - env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 83018f46f5..979d1a107f 100644 
--- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -238,6 +238,7 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv): def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_checkpoint") pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint") @@ -251,7 +252,7 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -264,6 +265,7 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder ) env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") pg = env.postgres.create_start("test_timeline_physical_size_post_compaction") @@ -278,8 +280,8 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") - env.pageserver.safe_psql(f"compact {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) + pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id) assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -290,6 +292,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='10m', gc_period='10m', pitr_interval='1s'}" env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") pg = env.postgres.create_start("test_timeline_physical_size_post_gc") @@ -304,7 +307,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pg.safe_psql( """ @@ -315,9 +318,9 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"do_gc {env.initial_tenant} {new_timeline_id} 0") + pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -326,6 +329,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): # Test the metrics. 
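The metrics assertions below reduce to scraping the Prometheus text format returned by `get_metrics`. A sketch; the metric name here is a stand-in for illustration, since the patch does not spell it out (`log` is the fixtures' logger):

    metrics = env.pageserver.http_client().get_metrics()
    for line in metrics.splitlines():
        # Hypothetical gauge name; substitute the real physical-size metric.
        if line.startswith("pageserver_current_physical_size"):
            log.info(f"physical size sample: {line}")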
def test_timeline_size_metrics(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_size_metrics") pg = env.postgres.create_start("test_timeline_size_metrics") @@ -340,7 +344,7 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) # get the metrics and parse the metric for the current timeline's physical size metrics = env.pageserver.http_client().get_metrics() @@ -382,6 +386,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): random.seed(100) env = neon_simple_env + pageserver_http = env.pageserver.http_client() client = env.pageserver.http_client() tenant, timeline = env.neon_cli.create_tenant() @@ -405,7 +410,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, tenant, timeline) - env.pageserver.safe_psql(f"checkpoint {tenant} {timeline}") + pageserver_http.timeline_checkpoint(tenant, timeline) timeline_total_size += get_timeline_physical_size(timeline) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 089ed91c98..931de0f1e3 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -59,9 +59,7 @@ def wait_lsn_force_checkpoint( ) # force checkpoint to advance remote_consistent_lsn - with closing(ps.connect(**pageserver_conn_options)) as psconn: - with psconn.cursor() as pscur: - pscur.execute(f"checkpoint {tenant_id} {timeline_id}") + ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) # ensure that remote_consistent_lsn is advanced wait_for_upload( From 6fc719db13a1feec1fef4bd227147ea19e56cf0f Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 07:52:39 +0300 Subject: [PATCH 24/90] Merge timelines.rs with tenant.rs --- pageserver/src/http/routes.rs | 7 +- pageserver/src/lib.rs | 1 - pageserver/src/tenant.rs | 324 ++++++++++++++++++++++++---------- pageserver/src/timelines.rs | 168 ------------------ 4 files changed, 233 insertions(+), 267 deletions(-) delete mode 100644 pageserver/src/timelines.rs diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bfc9e4462b..0c6f7927fa 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -15,7 +15,7 @@ use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant::{TenantState, Timeline}; use crate::tenant_config::TenantConfOpt; -use crate::{config::PageServerConf, tenant_mgr, timelines}; +use crate::{config::PageServerConf, tenant_mgr}; use utils::{ auth::JwtAuth, http::{ @@ -166,10 +166,9 @@ async fn timeline_create_handler(mut request: Request) -> Result TenantId { + self.tenant_id + } + /// Get Timeline handle for given Neon timeline ID. /// This function is idempotent. It doesn't change internal state in any way. pub fn get_timeline(&self, timeline_id: TimelineId) -> anyhow::Result> { @@ -142,8 +148,7 @@ impl Tenant { .with_context(|| { format!( "Timeline {} was not found for tenant {}", - timeline_id, - self.tenant_id() + timeline_id, self.tenant_id ) }) .map(Arc::clone) @@ -204,98 +209,67 @@ impl Tenant { Ok(new_timeline) } - /// Branch a timeline - pub fn branch_timeline( + /// Create a new timeline. 
+    ///
+    /// Returns the new timeline ID and reference to its Timeline object.
+    ///
+    /// If the caller specified the timeline ID to use (`new_timeline_id`) and a timeline
+    /// with the same ID already exists, returns None. If `new_timeline_id` is not given,
+    /// a new unique ID is generated.
+    pub async fn create_timeline(
         &self,
-        src: TimelineId,
-        dst: TimelineId,
-        start_lsn: Option<Lsn>,
-    ) -> Result<Arc<Timeline>> {
-        // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn
-        // about timelines, so otherwise a race condition is possible, where we create new timeline and GC
-        // concurrently removes data that is needed by the new timeline.
-        let _gc_cs = self.gc_cs.lock().unwrap();
+        new_timeline_id: Option<TimelineId>,
+        ancestor_timeline_id: Option<TimelineId>,
+        mut ancestor_start_lsn: Option<Lsn>,
+    ) -> Result<Option<Arc<Timeline>>> {
+        let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate);
 
-        // In order for the branch creation task to not wait for GC/compaction,
-        // we need to make sure that the starting LSN of the child branch is not out of scope midway by
-        //
-        // 1. holding the GC lock to prevent overwritting timeline's GC data
-        // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline
-        //
-        // Step 2 is to avoid initializing the new branch using data removed by past GC iterations
-        // or in-queue GC iterations.
-
-        // XXX: keep the lock to avoid races during timeline creation
-        let mut timelines = self.timelines.lock().unwrap();
-        let src_timeline = timelines
-            .get(&src)
-            // message about timeline being remote is one .context up in the stack
-            .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?;
-
-        let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
-
-        // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
-        let start_lsn = start_lsn.unwrap_or_else(|| {
-            let lsn = src_timeline.get_last_record_lsn();
-            info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}");
-            lsn
-        });
-
-        // Check if the starting LSN is out of scope because it is less than
-        // 1. the latest GC cutoff LSN or
-        // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration.
-        src_timeline
-            .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
-            .context(format!(
-                "invalid branch start lsn: less than latest GC cutoff {}",
-                *latest_gc_cutoff_lsn,
-            ))?;
+        if self
+            .conf
+            .timeline_path(&new_timeline_id, &self.tenant_id)
+            .exists()
         {
-            let gc_info = src_timeline.gc_info.read().unwrap();
-            let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff);
-            if start_lsn < cutoff {
-                bail!(format!(
-                    "invalid branch start lsn: less than planned GC cutoff {cutoff}"
-                ));
-            }
+            debug!("timeline {new_timeline_id} already exists");
+            return Ok(None);
         }
 
-        // Determine prev-LSN for the new timeline. We can only determine it if
-        // the timeline was branched at the current end of the source timeline.
- let RecordLsn { - last: src_last, - prev: src_prev, - } = src_timeline.get_last_record_rlsn(); - let dst_prev = if src_last == start_lsn { - Some(src_prev) - } else { - None + let loaded_timeline = match ancestor_timeline_id { + Some(ancestor_timeline_id) => { + let ancestor_timeline = self + .get_timeline(ancestor_timeline_id) + .context("Cannot branch off the timeline that's not present in pageserver")?; + + if let Some(lsn) = ancestor_start_lsn.as_mut() { + // Wait for the WAL to arrive and be processed on the parent branch up + // to the requested branch point. The repository code itself doesn't + // require it, but if we start to receive WAL on the new timeline, + // decoding the new WAL might need to look up previous pages, relation + // sizes etc. and that would get confused if the previous page versions + // are not in the repository yet. + *lsn = lsn.align(); + ancestor_timeline.wait_lsn(*lsn).await?; + + let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); + if ancestor_ancestor_lsn > *lsn { + // can we safely just branch from the ancestor instead? + anyhow::bail!( + "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", + lsn, + ancestor_timeline_id, + ancestor_ancestor_lsn, + ); + } + } + + self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? + } + None => self.bootstrap_timeline(new_timeline_id)?, }; - // create a new timeline directory - let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id); - crashsafe_dir::create_dir(&timelinedir)?; + // Have added new timeline into the tenant, now its background tasks are needed. + self.activate(true); - // Create the metadata file, noting the ancestor of the new timeline. - // There is initially no data in it, but all the read-calls know to look - // into the ancestor. - let metadata = TimelineMetadata::new( - start_lsn, - dst_prev, - Some(src), - start_lsn, - *src_timeline.latest_gc_cutoff_lsn.read(), - src_timeline.initdb_lsn, - ); - crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; - save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; - - let new_timeline = self.initialize_new_timeline(dst, metadata, &mut timelines)?; - timelines.insert(dst, Arc::clone(&new_timeline)); - - info!("branched timeline {dst} from {src} at {start_lsn}"); - - Ok(new_timeline) + Ok(Some(loaded_timeline)) } /// perform one garbage collection iteration, removing old data files from disk. @@ -948,9 +922,171 @@ impl Tenant { Ok(totals) } - pub fn tenant_id(&self) -> TenantId { - self.tenant_id + fn branch_timeline( + &self, + src: TimelineId, + dst: TimelineId, + start_lsn: Option, + ) -> Result> { + // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn + // about timelines, so otherwise a race condition is possible, where we create new timeline and GC + // concurrently removes data that is needed by the new timeline. + let _gc_cs = self.gc_cs.lock().unwrap(); + + // In order for the branch creation task to not wait for GC/compaction, + // we need to make sure that the starting LSN of the child branch is not out of scope midway by + // + // 1. holding the GC lock to prevent overwritting timeline's GC data + // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline + // + // Step 2 is to avoid initializing the new branch using data removed by past GC iterations + // or in-queue GC iterations. 
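From the test side, these two checks are exactly what the `invalid branch start lsn` assertions in test_branch_and_gc.py and test_branch_behind.py exercise; the failing call is simply a branch creation at a reclaimed LSN. A sketch, with `env`, `tenant` and a garbage-collected `lsn` assumed in scope:

    import pytest

    with pytest.raises(Exception, match="invalid branch start lsn: .*"):
        env.neon_cli.create_branch("b1", "b0", tenant_id=tenant, ancestor_start_lsn=lsn)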
+
+        // XXX: keep the lock to avoid races during timeline creation
+        let mut timelines = self.timelines.lock().unwrap();
+        let src_timeline = timelines
+            .get(&src)
+            // message about timeline being remote is one .context up in the stack
+            .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?;
+
+        let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
+
+        // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
+        let start_lsn = start_lsn.unwrap_or_else(|| {
+            let lsn = src_timeline.get_last_record_lsn();
+            info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}");
+            lsn
+        });
+
+        // Check if the starting LSN is out of scope because it is less than
+        // 1. the latest GC cutoff LSN or
+        // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration.
+        src_timeline
+            .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
+            .context(format!(
+                "invalid branch start lsn: less than latest GC cutoff {}",
+                *latest_gc_cutoff_lsn,
+            ))?;
+        {
+            let gc_info = src_timeline.gc_info.read().unwrap();
+            let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff);
+            if start_lsn < cutoff {
+                bail!(format!(
+                    "invalid branch start lsn: less than planned GC cutoff {cutoff}"
+                ));
+            }
+        }
+
+        // Determine prev-LSN for the new timeline. We can only determine it if
+        // the timeline was branched at the current end of the source timeline.
+        let RecordLsn {
+            last: src_last,
+            prev: src_prev,
+        } = src_timeline.get_last_record_rlsn();
+        let dst_prev = if src_last == start_lsn {
+            Some(src_prev)
+        } else {
+            None
+        };
+
+        // create a new timeline directory
+        let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id);
+        crashsafe_dir::create_dir(&timelinedir)?;
+
+        // Create the metadata file, noting the ancestor of the new timeline.
+        // There is initially no data in it, but all the read-calls know to look
+        // into the ancestor.
+        let metadata = TimelineMetadata::new(
+            start_lsn,
+            dst_prev,
+            Some(src),
+            start_lsn,
+            *src_timeline.latest_gc_cutoff_lsn.read(),
+            src_timeline.initdb_lsn,
+        );
+        crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?;
+        save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?;
+
+        let new_timeline = self.initialize_new_timeline(dst, metadata, &mut timelines)?;
+        timelines.insert(dst, Arc::clone(&new_timeline));
+
+        info!("branched timeline {dst} from {src} at {start_lsn}");
+
+        Ok(new_timeline)
+    }
+
+    /// - run initdb to init temporary instance and get bootstrap data
+    /// - after initialization complete, remove the temp dir.
+    fn bootstrap_timeline(&self, timeline_id: TimelineId) -> Result<Arc<Timeline>> {
+        // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
+        // temporary directory for basebackup files for the given timeline.
+        let initdb_path = path_with_suffix_extension(
+            self.conf
+                .timelines_path(&self.tenant_id)
+                .join(format!("basebackup-{timeline_id}")),
+            TEMP_FILE_SUFFIX,
+        );
+
+        // Init a temporary repo to get bootstrap data
+        run_initdb(self.conf, &initdb_path)?;
+        let pgdata_path = initdb_path;
+
+        let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
+
+        // Import the contents of the data directory at the initial checkpoint
+        // LSN, and any WAL after that.
+        // Initdb lsn will be equal to last_record_lsn, which will be set after import.
+        // Because we know it upfront, we avoid an Option or a dummy zero value by passing it to create_empty_timeline.
+ let timeline = self.create_empty_timeline(timeline_id, lsn)?; + import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + bail!("failpoint before-checkpoint-new-timeline"); + }); + + timeline.checkpoint(CheckpointConfig::Forced)?; + + info!( + "created root timeline {} timeline.lsn {}", + timeline_id, + timeline.get_last_record_lsn() + ); + + // Remove temp dir. We don't need it anymore + fs::remove_dir_all(pgdata_path)?; + + Ok(timeline) + } +} + +/// Create the cluster temporarily in 'initdbpath' directory inside the repository +/// to get bootstrap data for timeline initialization. +fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { + info!("running initdb in {}... ", initdbpath.display()); + + let initdb_path = conf.pg_bin_dir().join("initdb"); + let initdb_output = Command::new(initdb_path) + .args(&["-D", &initdbpath.to_string_lossy()]) + .args(&["-U", &conf.superuser]) + .args(&["-E", "utf8"]) + .arg("--no-instructions") + // This is only used for a temporary installation that is deleted shortly after, + // so no need to fsync it + .arg("--no-sync") + .env_clear() + .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .stdout(Stdio::null()) + .output() + .context("failed to execute initdb")?; + if !initdb_output.status.success() { + bail!( + "initdb failed: '{}'", + String::from_utf8_lossy(&initdb_output.stderr) + ); + } + + Ok(()) } impl Drop for Tenant { diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs deleted file mode 100644 index 88b26e18f4..0000000000 --- a/pageserver/src/timelines.rs +++ /dev/null @@ -1,168 +0,0 @@ -//! -//! Timeline management code -// - -use std::{ - fs, - path::Path, - process::{Command, Stdio}, - sync::Arc, -}; - -use anyhow::{bail, Context, Result}; -use tracing::*; - -use remote_storage::path_with_suffix_extension; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use crate::config::PageServerConf; -use crate::tenant::{Tenant, Timeline}; -use crate::tenant_mgr; -use crate::CheckpointConfig; -use crate::{import_datadir, TEMP_FILE_SUFFIX}; - -// Create the cluster temporarily in 'initdbpath' directory inside the repository -// to get bootstrap data for timeline initialization. -// -fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { - info!("running initdb in {}... ", initdbpath.display()); - - let initdb_path = conf.pg_bin_dir().join("initdb"); - let initdb_output = Command::new(initdb_path) - .args(&["-D", &initdbpath.to_string_lossy()]) - .args(&["-U", &conf.superuser]) - .args(&["-E", "utf8"]) - .arg("--no-instructions") - // This is only used for a temporary installation that is deleted shortly after, - // so no need to fsync it - .arg("--no-sync") - .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) - .stdout(Stdio::null()) - .output() - .context("failed to execute initdb")?; - if !initdb_output.status.success() { - bail!( - "initdb failed: '{}'", - String::from_utf8_lossy(&initdb_output.stderr) - ); - } - - Ok(()) -} - -// -// - run initdb to init temporary instance and get bootstrap data -// - after initialization complete, remove the temp dir. 
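For orientation, the command line that `run_initdb` assembles corresponds roughly to the following; every path and the superuser name depend on the pageserver configuration and are placeholders here:

    import subprocess

    pg_install = "/usr/local/pg_install"  # placeholder for the Postgres install dir
    subprocess.run(
        [
            f"{pg_install}/bin/initdb",   # conf.pg_bin_dir()/initdb
            "-D", "/tmp/basebackup-tmp",  # placeholder for the temp timeline dir
            "-U", "postgres",             # placeholder for conf.superuser
            "-E", "utf8",
            "--no-instructions",
            "--no-sync",                  # throwaway install, fsync not needed
        ],
        # Mirrors env_clear() plus LD_LIBRARY_PATH/DYLD_LIBRARY_PATH in the Rust code.
        env={"LD_LIBRARY_PATH": f"{pg_install}/lib"},
        check=True,                       # mirrors the status check on initdb_output
        stdout=subprocess.DEVNULL,
    )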
-// -fn bootstrap_timeline( - conf: &'static PageServerConf, - tenant_id: TenantId, - timeline_id: TimelineId, - tenant: &Tenant, -) -> Result> { - // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` - // temporary directory for basebackup files for the given timeline. - let initdb_path = path_with_suffix_extension( - conf.timelines_path(&tenant_id) - .join(format!("basebackup-{timeline_id}")), - TEMP_FILE_SUFFIX, - ); - - // Init temporarily repo to get bootstrap data - run_initdb(conf, &initdb_path)?; - let pgdata_path = initdb_path; - - let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); - - // Import the contents of the data directory at the initial checkpoint - // LSN, and any WAL after that. - // Initdb lsn will be equal to last_record_lsn which will be set after import. - // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. - let timeline = tenant.create_empty_timeline(timeline_id, lsn)?; - import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; - - fail::fail_point!("before-checkpoint-new-timeline", |_| { - bail!("failpoint before-checkpoint-new-timeline"); - }); - - timeline.checkpoint(CheckpointConfig::Forced)?; - - info!( - "created root timeline {} timeline.lsn {}", - timeline_id, - timeline.get_last_record_lsn() - ); - - // Remove temp dir. We don't need it anymore - fs::remove_dir_all(pgdata_path)?; - - Ok(timeline) -} - -/// -/// Create a new timeline. -/// -/// Returns the new timeline ID and reference to its Timeline object. -/// -/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with -/// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, -/// a new unique ID is generated. -/// -pub(crate) async fn create_timeline( - conf: &'static PageServerConf, - tenant_id: TenantId, - new_timeline_id: Option, - ancestor_timeline_id: Option, - mut ancestor_start_lsn: Option, -) -> Result>> { - let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); - let tenant = tenant_mgr::get_tenant(tenant_id, true)?; - - if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { - debug!("timeline {new_timeline_id} already exists"); - return Ok(None); - } - - let loaded_timeline = match ancestor_timeline_id { - Some(ancestor_timeline_id) => { - let ancestor_timeline = tenant - .get_timeline(ancestor_timeline_id) - .context("Cannot branch off the timeline that's not present in pageserver")?; - - if let Some(lsn) = ancestor_start_lsn.as_mut() { - // Wait for the WAL to arrive and be processed on the parent branch up - // to the requested branch point. The repository code itself doesn't - // require it, but if we start to receive WAL on the new timeline, - // decoding the new WAL might need to look up previous pages, relation - // sizes etc. and that would get confused if the previous page versions - // are not in the repository yet. - *lsn = lsn.align(); - ancestor_timeline.wait_lsn(*lsn).await?; - - let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); - if ancestor_ancestor_lsn > *lsn { - // can we safely just branch from the ancestor instead? - anyhow::bail!( - "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", - lsn, - ancestor_timeline_id, - ancestor_ancestor_lsn, - ); - } - } - - tenant.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? 
- } - None => bootstrap_timeline(conf, tenant_id, new_timeline_id, &tenant)?, - }; - - // Have added new timeline into the tenant, now its background tasks are needed. - tenant.activate(true); - - Ok(Some(loaded_timeline)) -} From 310c507303d642c97a778f9850b57e1593ba5717 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 07:58:06 +0300 Subject: [PATCH 25/90] Merge path retrieval methods in config.rs --- pageserver/src/config.rs | 17 +++++++++++++++++ pageserver/src/storage_sync.rs | 13 ++++--------- pageserver/src/storage_sync/download.rs | 15 +++++++-------- pageserver/src/storage_sync/upload.rs | 5 +++-- pageserver/src/tenant.rs | 9 ++++----- pageserver/src/tenant/metadata.rs | 17 +---------------- pageserver/src/tenant/timeline.rs | 4 ++-- pageserver/src/tenant_config.rs | 11 ----------- pageserver/src/tenant_mgr.rs | 12 +++++------- 9 files changed, 43 insertions(+), 60 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 75c71b09d2..945ee098ea 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -22,6 +22,10 @@ use utils::{ use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant_config::{TenantConf, TenantConfOpt}; +/// The name of the metadata file pageserver creates per timeline. +pub const METADATA_FILE_NAME: &str = "metadata"; +const TENANT_CONFIG_NAME: &str = "config"; + pub mod defaults { use crate::tenant_config::defaults::*; use const_format::formatcp; @@ -346,6 +350,12 @@ impl PageServerConf { self.tenants_path().join(tenant_id.to_string()) } + /// Points to a place in pageserver's local directory, + /// where certain tenant's tenantconf file should be located. + pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf { + self.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME) + } + pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf { self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME) } @@ -354,6 +364,13 @@ impl PageServerConf { self.timelines_path(tenant_id).join(timeline_id.to_string()) } + /// Points to a place in pageserver's local directory, + /// where certain timeline's metadata file should be located. 
+ pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf { + self.timeline_path(&timeline_id, &tenant_id) + .join(METADATA_FILE_NAME) + } + // // Postgres distribution paths // diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 64e0f9a9e3..489d0ad4ed 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -169,13 +169,8 @@ use self::{ upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, }; use crate::{ - config::PageServerConf, - exponential_backoff, - storage_sync::index::RemoteIndex, - task_mgr, - task_mgr::TaskKind, - task_mgr::BACKGROUND_RUNTIME, - tenant::metadata::{metadata_path, TimelineMetadata}, + config::PageServerConf, exponential_backoff, storage_sync::index::RemoteIndex, task_mgr, + task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata, tenant_mgr::attach_local_tenants, }; use crate::{ @@ -1012,7 +1007,7 @@ async fn update_local_metadata( }; let remote_lsn = remote_metadata.disk_consistent_lsn(); - let local_metadata_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id); + let local_metadata_path = conf.metadata_path(sync_id.timeline_id, sync_id.tenant_id); let local_lsn = if local_metadata_path.exists() { let local_metadata = read_metadata_file(&local_metadata_path) .await @@ -1433,7 +1428,7 @@ mod test_utils { } fs::write( - metadata_path(harness.conf, timeline_id, harness.tenant_id), + harness.conf.metadata_path(timeline_id, harness.tenant_id), metadata.to_bytes()?, ) .await?; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 80d5ca5994..980001f95d 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -16,10 +16,7 @@ use tokio::{ }; use tracing::{debug, error, info, warn}; -use crate::{ - config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path, - TEMP_FILE_SUFFIX, -}; +use crate::{config::PageServerConf, storage_sync::SyncTask, TEMP_FILE_SUFFIX}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use super::{ @@ -137,7 +134,8 @@ async fn download_index_part( storage: &GenericRemoteStorage, sync_id: TenantTimelineId, ) -> Result { - let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) + let index_part_path = conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); let mut index_part_download = storage .download_storage_object(None, &index_part_path) @@ -620,9 +618,10 @@ mod tests { metadata.to_bytes()?, ); - let local_index_part_path = - metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) - .with_file_name(IndexPart::FILE_NAME); + let local_index_part_path = harness + .conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) + .with_file_name(IndexPart::FILE_NAME); let index_part_remote_id = local_storage.remote_object_id(&local_index_part_path)?; let index_part_local_path = PathBuf::from(index_part_remote_id.to_string()); fs::create_dir_all(index_part_local_path.parent().unwrap()).await?; diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index aa5a2232cf..75657915c0 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -15,7 +15,7 @@ use super::{ LayersUpload, SyncData, SyncQueue, }; use crate::metrics::NO_LAYERS_UPLOAD; -use crate::{config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path}; +use 
crate::{config::PageServerConf, storage_sync::SyncTask}; /// Serializes and uploads the given index part data to the remote storage. pub(super) async fn upload_index_part( @@ -29,7 +29,8 @@ pub(super) async fn upload_index_part( let index_part_size = index_part_bytes.len(); let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes)); - let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) + let index_part_path = conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); storage .upload_storage_object( diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cf236a0a9c..b753c1979c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -41,7 +41,7 @@ use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; use crate::repository::GcResult; use crate::storage_sync::index::RemoteIndex; use crate::task_mgr; -use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::tenant_config::TenantConfOpt; use crate::virtual_file::VirtualFile; use crate::walredo::WalRedoManager; use crate::{CheckpointConfig, TEMP_FILE_SUFFIX}; @@ -676,7 +676,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_id: TenantId, ) -> anyhow::Result { - let target_config_path = TenantConf::path(conf, tenant_id); + let target_config_path = conf.tenant_config_path(tenant_id); let target_config_display = target_config_path.display(); info!("loading tenantconf from {target_config_display}"); @@ -1134,7 +1134,6 @@ pub mod harness { walredo::{WalRedoError, WalRedoManager}, }; - use super::metadata::metadata_path; use super::*; use crate::tenant_config::{TenantConf, TenantConfOpt}; use hex_literal::hex; @@ -1270,7 +1269,7 @@ pub mod harness { timeline_id: TimelineId, tenant_id: TenantId, ) -> anyhow::Result { - let metadata_path = metadata_path(conf, timeline_id, tenant_id); + let metadata_path = conf.metadata_path(timeline_id, tenant_id); let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { format!( "Failed to read metadata bytes from path {}", @@ -1316,8 +1315,8 @@ pub mod harness { #[cfg(test)] mod tests { - use super::metadata::METADATA_FILE_NAME; use super::*; + use crate::config::METADATA_FILE_NAME; use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; use crate::tenant::harness::*; diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index ace4dc91e9..606acbf2f1 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -8,7 +8,6 @@ use std::fs::{File, OpenOptions}; use std::io::Write; -use std::path::PathBuf; use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; @@ -29,9 +28,6 @@ use crate::STORAGE_FORMAT_VERSION; /// see PG_CONTROL_MAX_SAFE_SIZE const METADATA_MAX_SIZE: usize = 512; -/// The name of the metadata file pageserver creates per timeline. -pub const METADATA_FILE_NAME: &str = "metadata"; - /// Metadata stored on disk for each timeline /// /// The fields correspond to the values we hold in memory, in Timeline. @@ -166,17 +162,6 @@ impl TimelineMetadata { } } -/// Points to a place in pageserver's local directory, -/// where certain timeline's metadata file should be located. 
-pub fn metadata_path( - conf: &'static PageServerConf, - timeline_id: TimelineId, - tenant_id: TenantId, -) -> PathBuf { - conf.timeline_path(&timeline_id, &tenant_id) - .join(METADATA_FILE_NAME) -} - /// Save timeline metadata to file pub fn save_metadata( conf: &'static PageServerConf, @@ -186,7 +171,7 @@ pub fn save_metadata( first_save: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving metadata").entered(); - let path = metadata_path(conf, timeline_id, tenant_id); + let path = conf.metadata_path(timeline_id, tenant_id); // use OpenOptions to ensure file presence is consistent with first_save let mut file = VirtualFile::open_with_options( &path, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8670e979ee..b80d023c7f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -24,12 +24,12 @@ use crate::tenant::{ image_layer::{ImageLayer, ImageLayerWriter}, inmemory_layer::InMemoryLayer, layer_map::{LayerMap, SearchResult}, - metadata::{save_metadata, TimelineMetadata, METADATA_FILE_NAME}, + metadata::{save_metadata, TimelineMetadata}, par_fsync, storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}, }; -use crate::config::PageServerConf; +use crate::config::{PageServerConf, METADATA_FILE_NAME}; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::BlockNumber; diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 4448ffc456..4c5d5cc3f3 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -8,14 +8,9 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! -use crate::config::PageServerConf; use serde::{Deserialize, Serialize}; use std::num::NonZeroU64; -use std::path::PathBuf; use std::time::Duration; -use utils::id::TenantId; - -pub const TENANT_CONFIG_NAME: &str = "config"; pub mod defaults { // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB @@ -215,12 +210,6 @@ impl TenantConf { } } - /// Points to a place in pageserver's local directory, - /// where certain tenant's tenantconf file should be located. 
- pub fn path(conf: &'static PageServerConf, tenant_id: TenantId) -> PathBuf { - conf.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME) - } - #[cfg(test)] pub fn dummy_conf() -> Self { TenantConf { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index d6fa843305..2c6f5fa863 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -12,17 +12,15 @@ use tracing::*; use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; -use crate::config::PageServerConf; +use crate::config::{PageServerConf, METADATA_FILE_NAME}; use crate::http::models::TenantInfo; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::{ - ephemeral_file::is_ephemeral_file, - metadata::{TimelineMetadata, METADATA_FILE_NAME}, - Tenant, TenantState, + ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState, }; -use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::tenant_config::TenantConfOpt; use crate::walredo::PostgresRedoManager; use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; @@ -246,7 +244,7 @@ fn create_tenant_files( &temporary_tenant_dir, )?; let temporary_tenant_config_path = rebase_directory( - &TenantConf::path(conf, tenant_id), + &conf.tenant_config_path(tenant_id), &target_tenant_directory, &temporary_tenant_dir, )?; @@ -343,7 +341,7 @@ pub fn update_tenant_config( ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf); - Tenant::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?; + Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?; Ok(()) } From 6b8dcad1bbc02b0f045c0ee192629ef129dd5755 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 08:13:25 +0300 Subject: [PATCH 26/90] Unify timeline creation steps --- pageserver/src/tenant.rs | 73 ++++++++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b753c1979c..40c9f1e9ad 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -185,27 +185,12 @@ impl Tenant { bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.") } - // Create the timeline directory, and write initial metadata to file. 
- crashsafe_dir::create_dir_all(timeline_path)?; - let new_metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - save_metadata( - self.conf, - new_timeline_id, - self.tenant_id, - &new_metadata, - true, - )?; - let new_timeline = - self.initialize_new_timeline(new_timeline_id, new_metadata, &mut timelines)?; + self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?; new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); - if let hash_map::Entry::Vacant(v) = timelines.entry(new_timeline_id) { - v.insert(Arc::clone(&new_timeline)); - } - Ok(new_timeline) } @@ -1004,12 +989,7 @@ impl Tenant { *src_timeline.latest_gc_cutoff_lsn.read(), src_timeline.initdb_lsn, ); - crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; - save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; - - let new_timeline = self.initialize_new_timeline(dst, metadata, &mut timelines)?; - timelines.insert(dst, Arc::clone(&new_timeline)); - + let new_timeline = self.create_initialized_timeline(dst, metadata, &mut timelines)?; info!("branched timeline {dst} from {src} at {start_lsn}"); Ok(new_timeline) @@ -1057,6 +1037,55 @@ impl Tenant { Ok(timeline) } + + fn create_initialized_timeline( + &self, + new_timeline_id: TimelineId, + new_metadata: TimelineMetadata, + timelines: &mut MutexGuard>>, + ) -> Result> { + crashsafe_dir::create_dir_all(self.conf.timeline_path(&new_timeline_id, &self.tenant_id)) + .with_context(|| { + format!( + "Failed to create timeline {}/{} directory", + new_timeline_id, self.tenant_id + ) + })?; + save_metadata( + self.conf, + new_timeline_id, + self.tenant_id, + &new_metadata, + true, + ) + .with_context(|| { + format!( + "Failed to create timeline {}/{} metadata", + new_timeline_id, self.tenant_id + ) + })?; + + let new_timeline = self + .initialize_new_timeline(new_timeline_id, new_metadata, timelines) + .with_context(|| { + format!( + "Failed to initialize timeline {}/{}", + new_timeline_id, self.tenant_id + ) + })?; + + match timelines.entry(new_timeline_id) { + hash_map::Entry::Occupied(_) => anyhow::bail!( + "Found freshly initialized timeline {} in the tenant map", + new_timeline_id + ), + hash_map::Entry::Vacant(v) => { + v.insert(Arc::clone(&new_timeline)); + } + } + + Ok(new_timeline) + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository From 8d7024a8c26d9f143202d28665ec2ae8a8e32ea1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 08:24:18 +0300 Subject: [PATCH 27/90] Move path manipulation function to utils --- Cargo.lock | 7 +-- libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/lib.rs | 47 ------------------ libs/remote_storage/src/local_fs.rs | 3 +- libs/utils/src/crashsafe_dir.rs | 49 ++++++++++++++++++- pageserver/src/storage_sync/download.rs | 7 ++- pageserver/src/tenant.rs | 6 +-- pageserver/src/tenant_mgr.rs | 4 +- .../src/walreceiver/connection_manager.rs | 2 +- pageserver/src/walredo.rs | 2 +- workspace_hack/Cargo.toml | 6 --- 11 files changed, 62 insertions(+), 72 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3ce0ce465f..fc4ef90b8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2444,6 +2444,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "utils", "workspace_hack", ] @@ -3929,13 +3930,7 @@ dependencies = [ "chrono", "either", "fail", - "futures-channel", - "futures-task", - "futures-util", - "generic-array", "hashbrown", - "hex", - "hyper", "indexmap", "itoa 0.4.8", "libc", diff 
--git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index b3485f274a..cec344a4ad 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" anyhow = { version = "1.0", features = ["backtrace"] } async-trait = "0.1" metrics = { version = "0.1", path = "../metrics" } +utils = { version = "0.1", path = "../utils" } once_cell = "1.13.0" rusoto_core = "0.48" rusoto_s3 = "0.48" diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 6b3fd29a0e..4bdd2b9608 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -9,9 +9,7 @@ mod local_fs; mod s3_bucket; use std::{ - borrow::Cow, collections::HashMap, - ffi::OsStr, fmt::{Debug, Display}, num::{NonZeroU32, NonZeroUsize}, ops::Deref, @@ -344,22 +342,6 @@ impl Debug for S3Config { } } -/// Adds a suffix to the file(directory) name, either appending the suffux to the end of its extension, -/// or if there's no extension, creates one and puts a suffix there. -pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { - let new_extension = match original_path - .as_ref() - .extension() - .map(OsStr::to_string_lossy) - { - Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), - None => Cow::Borrowed(suffix), - }; - original_path - .as_ref() - .with_extension(new_extension.as_ref()) -} - impl RemoteStorageConfig { pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { let local_path = toml.get("local_path"); @@ -448,35 +430,6 @@ fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result { mod tests { use super::*; - #[test] - fn test_path_with_suffix_extension() { - let p = PathBuf::from("/foo/bar"); - assert_eq!( - &path_with_suffix_extension(&p, "temp").to_string_lossy(), - "/foo/bar.temp" - ); - let p = PathBuf::from("/foo/bar"); - assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), - "/foo/bar.temp.temp" - ); - let p = PathBuf::from("/foo/bar.baz"); - assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), - "/foo/bar.baz.temp.temp" - ); - let p = PathBuf::from("/foo/bar.baz"); - assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), - "/foo/bar.baz..temp" - ); - let p = PathBuf::from("/foo/bar/dir/"); - assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), - "/foo/bar/dir..temp" - ); - } - #[test] fn object_name() { let k = RemoteObjectId("a/b/c".to_owned()); diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 3ffbf3cb39..5723a512f6 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -16,8 +16,9 @@ use tokio::{ io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; use tracing::*; +use utils::crashsafe_dir::path_with_suffix_extension; -use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectId}; +use crate::{Download, DownloadError, RemoteObjectId}; use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; diff --git a/libs/utils/src/crashsafe_dir.rs b/libs/utils/src/crashsafe_dir.rs index a7eab73a43..032ab0a916 100644 --- a/libs/utils/src/crashsafe_dir.rs +++ b/libs/utils/src/crashsafe_dir.rs @@ -1,7 +1,9 @@ use std::{ + borrow::Cow, + ffi::OsStr, fs::{self, File}, io, - path::Path, + path::{Path, PathBuf}, }; /// Similar to [`std::fs::create_dir`], except we fsync the @@ -74,6 +76,22 @@ pub fn create_dir_all(path: impl AsRef) -> io::Result<()> { Ok(()) } +/// Adds a suffix to 
the file(directory) name, either appending the suffix to the end of its extension, +/// or if there's no extension, creates one and puts a suffix there. +pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { + let new_extension = match original_path + .as_ref() + .extension() + .map(OsStr::to_string_lossy) + { + Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), + None => Cow::Borrowed(suffix), + }; + original_path + .as_ref() + .with_extension(new_extension.as_ref()) +} + #[cfg(test)] mod tests { use tempfile::tempdir; @@ -122,4 +140,33 @@ mod tests { let invalid_dir_path = file_path.join("folder"); create_dir_all(&invalid_dir_path).unwrap_err(); } + + #[test] + fn test_path_with_suffix_extension() { + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp").to_string_lossy(), + "/foo/bar.temp" + ); + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.baz.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar.baz..temp" + ); + let p = PathBuf::from("/foo/bar/dir/"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar/dir..temp" + ); + } } diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 980001f95d..3e850443d8 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -9,7 +9,7 @@ use std::{ use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use remote_storage::{path_with_suffix_extension, DownloadError, GenericRemoteStorage}; +use remote_storage::{DownloadError, GenericRemoteStorage}; use tokio::{ fs, io::{self, AsyncWriteExt}, @@ -17,7 +17,10 @@ use tokio::{ use tracing::{debug, error, info, warn}; use crate::{config::PageServerConf, storage_sync::SyncTask, TEMP_FILE_SUFFIX}; -use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::{ + crashsafe_dir::path_with_suffix_extension, + id::{TenantId, TenantTimelineId, TimelineId}, +}; use super::{ index::{IndexPart, RemoteTimeline}, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 40c9f1e9ad..ca97796870 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -14,6 +14,7 @@ use anyhow::{bail, ensure, Context, Result}; use tokio::sync::watch; use tracing::*; +use utils::crashsafe_dir::path_with_suffix_extension; use std::cmp::min; use std::collections::hash_map; @@ -45,7 +46,6 @@ use crate::tenant_config::TenantConfOpt; use crate::virtual_file::VirtualFile; use crate::walredo::WalRedoManager; use crate::{CheckpointConfig, TEMP_FILE_SUFFIX}; -use remote_storage::path_with_suffix_extension; use toml_edit; use utils::{ @@ -974,10 +974,6 @@ impl Tenant { None }; - // create a new timeline directory - let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id); - crashsafe_dir::create_dir(&timelinedir)?; - // Create the metadata file, noting the ancestor of the new timeline. // There is initially no data in it, but all the read-calls know to look // into the ancestor. 
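An aside on the relocated helper, since several call sites in this series depend on its exact behavior: the suffix is appended after any existing extension rather than replacing it, which keeps temporary files recognizable next to their final names. A self-contained sketch mirroring the function and tests above; the paths and the `temp` literal are illustrative stand-ins (the real callers use `TEMP_FILE_SUFFIX`):

```rust
use std::borrow::Cow;
use std::ffi::OsStr;
use std::path::{Path, PathBuf};

/// Same logic as the relocated helper, inlined so this sketch compiles alone.
fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
    let new_extension = match original_path
        .as_ref()
        .extension()
        .map(OsStr::to_string_lossy)
    {
        Some(extension) => Cow::Owned(format!("{extension}.{suffix}")),
        None => Cow::Borrowed(suffix),
    };
    original_path.as_ref().with_extension(new_extension.as_ref())
}

fn main() {
    // The suffix lands *after* an existing extension, so a temporary copy of
    // "metadata.json" stays recognizable instead of becoming "metadata.temp".
    assert_eq!(
        path_with_suffix_extension("/tenants/t1/metadata.json", "temp"),
        PathBuf::from("/tenants/t1/metadata.json.temp")
    );
    // A name without an extension simply gains one, as with the
    // "basebackup-<timeline_id>" directory in bootstrap_timeline.
    assert_eq!(
        path_with_suffix_extension("/tenants/t1/timelines/basebackup-abc", "temp"),
        PathBuf::from("/tenants/t1/timelines/basebackup-abc.temp")
    );
}
```

Cleanup code can then match stray artifacts with a single suffix glob, regardless of whether the original name had an extension.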
diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 2c6f5fa863..fcb2c18b79 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -10,7 +10,7 @@ use std::sync::Arc; use anyhow::Context; use tracing::*; -use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; +use remote_storage::GenericRemoteStorage; use crate::config::{PageServerConf, METADATA_FILE_NAME}; use crate::http::models::TenantInfo; @@ -24,7 +24,7 @@ use crate::tenant_config::TenantConfOpt; use crate::walredo::PostgresRedoManager; use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; -use utils::crashsafe_dir; +use utils::crashsafe_dir::{self, path_with_suffix_extension}; use utils::id::{TenantId, TimelineId}; mod tenants_state { diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 799062e935..148372c9d0 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -1358,7 +1358,7 @@ mod tests { const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr"; - fn dummy_state(harness: &TenantHarness) -> WalreceiverState { + fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 9faabfebda..79c2edc96e 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -21,7 +21,6 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; -use remote_storage::path_with_suffix_extension; use serde::Serialize; use std::fs; use std::fs::OpenOptions; @@ -36,6 +35,7 @@ use std::sync::Mutex; use std::time::Duration; use std::time::Instant; use tracing::*; +use utils::crashsafe_dir::path_with_suffix_extension; use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; use crate::metrics::{ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 96594bbf96..dc4cbb5284 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -21,13 +21,7 @@ bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } either = { version = "1", features = ["use_std"] } fail = { version = "0.5", default-features = false, features = ["failpoints"] } -futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } -futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] } -futures-util = { version = "0.3", default-features = false, features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] } -generic-array = { version = "0.14", default-features = false, features = ["more_lengths"] } hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } -hex = { version = "0.4", features = ["alloc", "serde", "std"] } -hyper = { version = "0.14", features = ["client", "full", "h2", "http1", "http2", "runtime", "server", "socket2", "stream", "tcp"] } indexmap = { version = "1", default-features = false, features = ["std"] } itoa = { version = "0.4", features = ["i128", "std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } From 6f949e15563280cc791b02940c711a5641813891 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Tue, 20 Sep 2022 17:02:10 
-0700
Subject: [PATCH 28/90] Improve pageserver/safekeeper HTTP API errors (#2461)

Part of the general work on improving pageserver logs.

Brief summary of changes:

* Remove `ApiError::from_err`
* Remove `impl From for ApiError`
* Convert `ApiError::{BadRequest, NotFound}` to use `anyhow::Error`
  * Note: `NotFound` has more verbose formatting because it's more likely
    to have useful information for the receiving "user"
* Explicitly convert from `tokio::task::JoinError`s into
  `InternalServerError`s where appropriate

Also note: many of the places where errors were implicitly converted to
500s have now been updated to return a more appropriate error. Some places
where it's not yet possible to distinguish the error types have been left
as 500s.
---
 libs/utils/src/http/error.rs   |  17 +--
 libs/utils/src/http/json.rs    |  13 +-
 libs/utils/src/http/request.rs |  13 +-
 pageserver/src/http/routes.rs  | 220 +++++++++++++++++++++------------
 safekeeper/src/http/routes.rs  |  39 ++++--
 5 files changed, 195 insertions(+), 107 deletions(-)

diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index b3bbec0f1c..b0ecb746d9 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -1,12 +1,11 @@
-use anyhow::anyhow;
 use hyper::{header, Body, Response, StatusCode};
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
 
 #[derive(Debug, Error)]
 pub enum ApiError {
-    #[error("Bad request: {0}")]
-    BadRequest(String),
+    #[error("Bad request: {0:#?}")]
+    BadRequest(anyhow::Error),
 
     #[error("Forbidden: {0}")]
     Forbidden(String),
@@ -15,24 +14,20 @@ pub enum ApiError {
     Unauthorized(String),
 
     #[error("NotFound: {0}")]
-    NotFound(String),
+    NotFound(anyhow::Error),
 
     #[error("Conflict: {0}")]
     Conflict(String),
 
     #[error(transparent)]
-    InternalServerError(#[from] anyhow::Error),
+    InternalServerError(anyhow::Error),
 }
 
 impl ApiError {
-    pub fn from_err>(err: E) -> Self {
-        Self::InternalServerError(anyhow!(err))
-    }
-
    pub fn into_response(self) -> Response {
         match self {
-            ApiError::BadRequest(_) => HttpErrorBody::response_from_msg_and_status(
-                self.to_string(),
+            ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status(
+                format!("{err:#?}"), // use debug printing so that we give the cause
                 StatusCode::BAD_REQUEST,
             ),
             ApiError::Forbidden(_) => {
diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs
index 08f2ac4205..8981fdd1dd 100644
--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -1,3 +1,4 @@
+use anyhow::Context;
 use bytes::Buf;
 use hyper::{header, Body, Request, Response, StatusCode};
 use serde::{Deserialize, Serialize};
@@ -9,20 +10,24 @@ pub async fn json_request Deserialize<'de>>(
 ) -> Result {
     let whole_body = hyper::body::aggregate(request.body_mut())
         .await
-        .map_err(ApiError::from_err)?;
+        .context("Failed to read request body")
+        .map_err(ApiError::BadRequest)?;
     serde_json::from_reader(whole_body.reader())
-        .map_err(|err| ApiError::BadRequest(format!("Failed to parse json request {}", err)))
+        .context("Failed to parse json request")
+        .map_err(ApiError::BadRequest)
 }
 
 pub fn json_response(
     status: StatusCode,
     data: T,
 ) -> Result, ApiError> {
-    let json = serde_json::to_string(&data).map_err(ApiError::from_err)?;
+    let json = serde_json::to_string(&data)
+        .context("Failed to serialize JSON response")
+        .map_err(ApiError::InternalServerError)?;
     let response = Response::builder()
         .status(status)
         .header(header::CONTENT_TYPE, "application/json")
         .body(Body::from(json))
-        .map_err(ApiError::from_err)?;
+        .map_err(|e| 
ApiError::InternalServerError(e.into()))?; Ok(response) } diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 4984d695fd..7b96ccd584 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -1,6 +1,7 @@ use std::str::FromStr; use super::error::ApiError; +use anyhow::anyhow; use hyper::{body::HttpBody, Body, Request}; use routerify::ext::RequestExt; @@ -10,9 +11,8 @@ pub fn get_request_param<'a>( ) -> Result<&'a str, ApiError> { match request.param(param_name) { Some(arg) => Ok(arg), - None => Err(ApiError::BadRequest(format!( - "no {} specified in path param", - param_name + None => Err(ApiError::BadRequest(anyhow!( + "no {param_name} specified in path param", ))), } } @@ -23,16 +23,15 @@ pub fn parse_request_param( ) -> Result { match get_request_param(request, param_name)?.parse() { Ok(v) => Ok(v), - Err(_) => Err(ApiError::BadRequest(format!( - "failed to parse {}", - param_name + Err(_) => Err(ApiError::BadRequest(anyhow!( + "failed to parse {param_name}", ))), } } pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> { match request.body_mut().data().await { - Some(_) => Err(ApiError::BadRequest("Unexpected request body".into())), + Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))), None => Ok(()), } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0c6f7927fa..c676dfacd2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,9 +1,10 @@ use std::sync::Arc; -use anyhow::{Context, Result}; +use anyhow::{anyhow, Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use remote_storage::GenericRemoteStorage; +use tokio::task::JoinError; use tracing::*; use super::models::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; @@ -166,7 +167,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result { // Created. Construct a TimelineInfo for it. 
- let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)?; + let local_info = local_timeline_info_from_timeline(&new_timeline, false, false) + .map_err(ApiError::InternalServerError)?; Ok(Some(TimelineInfo { tenant_id, timeline_id: new_timeline.timeline_id, @@ -184,12 +186,11 @@ async fn timeline_create_handler(mut request: Request) -> Result Ok(None), // timeline already exists - Err(err) => Err(err), + Err(err) => Err(ApiError::InternalServerError(err)), } } .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn)) - .await - .map_err(ApiError::from_err)?; + .await?; Ok(match new_timeline_info { Some(info) => json_response(StatusCode::CREATED, info)?, @@ -207,10 +208,11 @@ async fn timeline_list_handler(request: Request) -> Result, let timelines = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); - Ok::<_, anyhow::Error>(tenant_mgr::get_tenant(tenant_id, true)?.list_timelines()) + let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; + Ok(tenant.list_timelines()) }) .await - .map_err(ApiError::from_err)??; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; let mut response_data = Vec::with_capacity(timelines.len()); for (timeline_id, timeline) in timelines { @@ -275,7 +277,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result((local_timeline_info, remote_timeline_info)) + Ok::<_, ApiError>((local_timeline_info, remote_timeline_info)) } .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id)) .await?; if local_timeline_info.is_none() && remote_timeline_info.is_none() { - Err(ApiError::NotFound(format!( + Err(ApiError::NotFound(anyhow!( "Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely" ))) } else { @@ -332,14 +334,14 @@ async fn tenant_attach_handler(request: Request) -> Result, info!("Handling tenant attach {tenant_id}"); - tokio::task::spawn_blocking(move || { - if tenant_mgr::get_tenant(tenant_id, false).is_ok() { - anyhow::bail!("Tenant is already present locally") - }; - Ok(()) + tokio::task::spawn_blocking(move || match tenant_mgr::get_tenant(tenant_id, false) { + Ok(_) => Err(ApiError::Conflict( + "Tenant is already present locally".to_owned(), + )), + Err(_) => Ok(()), }) .await - .map_err(ApiError::from_err)??; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; let state = get_state(&request); let remote_index = &state.remote_index; @@ -364,12 +366,12 @@ async fn tenant_attach_handler(request: Request) -> Result, // download index parts for every tenant timeline let remote_timelines = match gather_tenant_timelines_index_parts(state, tenant_id).await { Ok(Some(remote_timelines)) => remote_timelines, - Ok(None) => return Err(ApiError::NotFound("Unknown remote tenant".to_string())), + Ok(None) => return Err(ApiError::NotFound(anyhow!("Unknown remote tenant"))), Err(e) => { error!("Failed to retrieve remote tenant data: {:?}", e); - return Err(ApiError::NotFound( - "Failed to retrieve remote tenant".to_string(), - )); + return Err(ApiError::NotFound(anyhow!( + "Failed to retrieve remote tenant" + ))); } }; @@ -392,7 +394,8 @@ async fn tenant_attach_handler(request: Request) -> Result, for (timeline_id, mut remote_timeline) in remote_timelines { tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id)) .await - .context("Failed to create new 
timeline directory")?;
+            .context("Failed to create new timeline directory")
+            .map_err(ApiError::InternalServerError)?;
 
         remote_timeline.awaits_download = true;
         tenant_entry.insert(timeline_id, remote_timeline);
@@ -438,7 +441,10 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result,
     tenant_mgr::detach_tenant(conf, tenant_id)
         .instrument(info_span!("tenant_detach", tenant = %tenant_id))
         .await
-        .map_err(ApiError::from_err)?;
+        // FIXME: Errors from `detach_tenant` can be caused by both user and internal errors.
+        // Replace this with better handling once the error type permits it.
+        .map_err(ApiError::InternalServerError)?;
 
     let mut remote_index = state.remote_index.write().await;
     remote_index.remove_tenant_entry(&tenant_id);
@@ -478,7 +486,7 @@ async fn tenant_list_handler(request: Request) -> Result, A
         crate::tenant_mgr::list_tenant_info(&remote_index)
     })
     .await
-    .map_err(ApiError::from_err)?;
+    .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
 
     json_response(StatusCode::OK, response_data)
 }
@@ -490,7 +498,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro
     // if tenant is in progress of downloading it can be absent in global tenant map
     let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false))
         .await
-        .map_err(ApiError::from_err)?;
+        .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
 
     let state = get_state(&request);
     let remote_index = &state.remote_index;
@@ -519,7 +527,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro
     let current_physical_size =
         match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false))
             .await
-            .map_err(ApiError::from_err)?
+            .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?
        {
             Err(err) => {
                 // Getting local timelines can fail when no local tenant directory is on disk (e.g, when tenant data is being downloaded).
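The hunks above and below repeat one idiom, so it is worth isolating: attach context with `anyhow::Context`, then choose the `ApiError` variant (and thus the status code) explicitly via `map_err`, instead of letting a blanket `From` impl collapse everything into a 500. A minimal sketch of that pattern, assuming only the `anyhow` crate; the enum and handler here are simplified stand-ins, not the real pageserver types:

```rust
use anyhow::{anyhow, Context};

// Stand-in for utils::http::error::ApiError after this patch: user-facing
// variants carry an anyhow::Error so responses can show the cause chain.
#[derive(Debug)]
#[allow(dead_code)]
enum ApiError {
    BadRequest(anyhow::Error),
    NotFound(anyhow::Error),
    InternalServerError(anyhow::Error),
}

fn lookup_tenant(raw_id: &str) -> anyhow::Result<u32> {
    raw_id.parse().context("tenant id is not a number")
}

// Each failure site picks its status explicitly instead of relying on a
// `From<anyhow::Error>` impl that always produced a 500.
fn handler(raw_id: &str) -> Result<u32, ApiError> {
    let id = lookup_tenant(raw_id).map_err(ApiError::BadRequest)?;
    if id == 0 {
        return Err(ApiError::NotFound(anyhow!("tenant {id} is not present")));
    }
    Ok(id)
}

fn main() {
    assert!(matches!(handler("not-a-number"), Err(ApiError::BadRequest(_))));
    assert!(matches!(handler("0"), Err(ApiError::NotFound(_))));
    assert_eq!(handler("7").unwrap(), 7);
}
```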
@@ -545,6 +553,16 @@ async fn tenant_status(request: Request) -> Result, ApiErro ) } +// Helper function to standardize the error messages we produce on bad durations +// +// Intended to be used with anyhow's `with_context`, e.g.: +// +// let value = result.with_context(bad_duration("name", &value))?; +// +fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String { + move || format!("Cannot parse `{field_name}` duration {value:?}") +} + async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -553,25 +571,39 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result) -> Result json_response(StatusCode::CREATED, TenantCreateResponse(id))?, @@ -618,24 +659,38 @@ async fn tenant_config_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result) -> Result, ApiError> { if !fail::has_failpoints() { - return Err(ApiError::BadRequest( + return Err(ApiError::BadRequest(anyhow!( "Cannot manage failpoints because pageserver was compiled without failpoints support" - .to_owned(), - )); + ))); } let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; @@ -691,7 +754,7 @@ async fn failpoints_handler(mut request: Request) -> Result }; if let Err(err_msg) = cfg_result { - return Err(ApiError::BadRequest(format!( + return Err(ApiError::BadRequest(anyhow!( "Failed to configure failpoints: {err_msg}" ))); } @@ -713,7 +776,7 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result {{ #[cfg(not(feature = "testing"))] async fn cfg_disabled(_req: Request) -> Result, ApiError> { - Err(ApiError::BadRequest( - concat!( - "Cannot ", - $handler_desc, - " because pageserver was compiled without testing APIs", - ) - .to_owned(), - )) + Err(ApiError::BadRequest(anyhow!(concat!( + "Cannot ", + $handler_desc, + " because pageserver was compiled without testing APIs", + )))) } #[cfg(feature = "testing")] diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 244325368b..43c0a17f84 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,12 +1,14 @@ use anyhow::anyhow; use hyper::{Body, Request, Response, StatusCode, Uri}; +use anyhow::Context; use once_cell::sync::Lazy; use serde::Serialize; use serde::Serializer; use std::collections::{HashMap, HashSet}; use std::fmt::Display; use std::sync::Arc; +use tokio::task::JoinError; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; @@ -99,7 +101,12 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result Date: Wed, 21 Sep 2022 13:13:11 +0300 Subject: [PATCH 29/90] Use prebuilt image with Hakari for CI style checks (#2488) --- .github/workflows/codestyle.yml | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 5220258ef0..641943199e 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -108,20 +108,32 @@ jobs: target key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust - # https://github.com/facebookincubator/cargo-guppy/tree/main/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check every project module is covered by Hakari - run: | - cargo install cargo-hakari - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - 
shell: bash -euxo pipefail {0} - - name: Run cargo clippy run: ./run_clippy.sh - name: Ensure all project builds run: cargo build --locked --all --all-targets + check-rust-dependencies: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + fetch-depth: 1 + + # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check every project module is covered by Hakari + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + shell: bash -euxo pipefail {0} + check-codestyle-python: runs-on: [ self-hosted, Linux, k8s-runner ] steps: From b82e2e3f18cbeb08c45074015cbe4606d36c51c5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 21 Sep 2022 11:08:12 +0300 Subject: [PATCH 30/90] Bump postgres submodules and update docs/core_changes.md. The old change to downgrade a WARNING in postgres vacuumlazy.c was reverted. --- docs/core_changes.md | 25 ------------------------- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 3 files changed, 2 insertions(+), 27 deletions(-) diff --git a/docs/core_changes.md b/docs/core_changes.md index 8f29dd9121..ea219adae9 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -148,31 +148,6 @@ relcache? (I think we do cache nblocks in relcache already, check why that's not Neon) -## Misc change in vacuumlazy.c - -``` -index 8aab6e324e..c684c4fbee 100644 ---- a/src/backend/access/heap/vacuumlazy.c -+++ b/src/backend/access/heap/vacuumlazy.c -@@ -1487,7 +1487,10 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) - else if (all_visible_according_to_vm && !PageIsAllVisible(page) - && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) - { -- elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", -+ /* ZENITH-XXX: all visible hint is not wal-logged -+ * FIXME: Replay visibilitymap changes in pageserver -+ */ -+ elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - vacrel->relname, blkno); - visibilitymap_clear(vacrel->rel, blkno, vmbuffer, - VISIBILITYMAP_VALID_BITS); -``` - - -Is this still needed? If that WARNING happens, it looks like potential corruption that we should -fix! 
- - ## Use buffer manager when extending VM or FSM ``` diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 796770565f..19d948fd47 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 796770565ff668b585e80733b8d679961ad50e93 +Subproject commit 19d948fd47f45d83367062d9a54709cf2d9c8921 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 34c47d6c99..5b8b3eeef5 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 34c47d6c99415c94296d5e599ec5590d0001d6c2 +Subproject commit 5b8b3eeef5ec34c0cad9377833906a1387841d04 From 19fa410ff84ad41ce39fcbdedf1e8e7c158ef1b4 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 21 Sep 2022 12:50:37 +0100 Subject: [PATCH 31/90] NeonCompare: switch to new pageserver HTTP API --- test_runner/fixtures/compare_fixtures.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index ceeeffc785..78a12c6c45 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -89,16 +89,13 @@ class NeonCompare(PgCompare): self.env = neon_simple_env self._zenbenchmark = zenbenchmark self._pg_bin = pg_bin + self.pageserver_http_client = self.env.pageserver.http_client() # We only use one branch and one timeline self.env.neon_cli.create_branch(branch_name, "empty") self._pg = self.env.postgres.create_start(branch_name) self.timeline = self.pg.safe_psql("SHOW neon.timeline_id")[0][0] - # Long-lived cursor, useful for flushing - self.psconn = self.env.pageserver.connect() - self.pscur = self.psconn.cursor() - @property def pg(self): return self._pg @@ -112,10 +109,10 @@ class NeonCompare(PgCompare): return self._pg_bin def flush(self): - self.pscur.execute(f"do_gc {self.env.initial_tenant} {self.timeline} 0") + self.pageserver_http_client.timeline_gc(self.env.initial_tenant, self.timeline, 0) def compact(self): - self.pscur.execute(f"compact {self.env.initial_tenant} {self.timeline}") + self.pageserver_http_client.timeline_compact(self.env.initial_tenant, self.timeline) def report_peak_memory_use(self) -> None: self.zenbenchmark.record( From 7eebb45ea6635404d494563af3d58790a44a68eb Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 21 Sep 2022 18:13:30 +0200 Subject: [PATCH 32/90] Reduce metrics footprint in safekeeper (#2491) Fixes bugs with metrics in control_file and wal_storage, where we haven't deleted metrics for inactive timelines. 
--- safekeeper/src/control_file.rs | 24 +----- safekeeper/src/metrics.rs | 138 ++++++++++++++++++++++++++++++++- safekeeper/src/safekeeper.rs | 4 + safekeeper/src/timeline.rs | 1 + safekeeper/src/wal_storage.rs | 92 +++++----------------- 5 files changed, 162 insertions(+), 97 deletions(-) diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 22ed34cc00..6be3f9abb2 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -2,7 +2,6 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use once_cell::sync::Lazy; use std::fs::{self, File, OpenOptions}; use std::io::{Read, Write}; @@ -10,8 +9,8 @@ use std::ops::Deref; use std::path::{Path, PathBuf}; use crate::control_file_upgrade::upgrade_control_file; +use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::safekeeper::{SafeKeeperState, SK_FORMAT_VERSION, SK_MAGIC}; -use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; @@ -24,16 +23,6 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control"; const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); -static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_persist_control_file_seconds", - "Seconds to persist and sync control file, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") -}); - /// Storage should keep actual state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. pub trait Storage: Deref { @@ -46,7 +35,6 @@ pub struct FileStorage { // save timeline dir to avoid reconstructing it every time timeline_dir: PathBuf, conf: SafeKeeperConf, - persist_control_file_seconds: Histogram, /// Last state persisted to disk. state: SafeKeeperState, @@ -56,16 +44,12 @@ impl FileStorage { /// Initialize storage by loading state from disk. 
    pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result {
         let timeline_dir = conf.timeline_dir(ttid);
-        let tenant_id = ttid.tenant_id.to_string();
-        let timeline_id = ttid.timeline_id.to_string();
 
         let state = Self::load_control_file_conf(conf, ttid)?;
 
         Ok(FileStorage {
             timeline_dir,
             conf: conf.clone(),
-            persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS
-                .with_label_values(&[&tenant_id, &timeline_id]),
             state,
         })
     }
@@ -77,14 +61,10 @@ impl FileStorage {
         state: SafeKeeperState,
     ) -> Result {
         let timeline_dir = conf.timeline_dir(ttid);
-        let tenant_id = ttid.tenant_id.to_string();
-        let timeline_id = ttid.timeline_id.to_string();
 
         let store = FileStorage {
             timeline_dir,
             conf: conf.clone(),
-            persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS
-                .with_label_values(&[&tenant_id, &timeline_id]),
             state,
         };
 
@@ -175,7 +155,7 @@ impl Storage for FileStorage {
     /// persists state durably to underlying storage
     /// for description see https://lwn.net/Articles/457667/
     fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
-        let _timer = &self.persist_control_file_seconds.start_timer();
+        let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer();
 
         // write data to safekeeper.control.partial
         let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL);
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index 851a568aec..51138df776 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -1,12 +1,15 @@
-//! This module exports metrics for all active timelines.
+//! Global safekeeper metrics and per-timeline safekeeper metrics.
 
 use std::time::{Instant, SystemTime};
 
+use ::metrics::{register_histogram, GaugeVec, Histogram, DISK_WRITE_SECONDS_BUCKETS};
+use anyhow::Result;
 use metrics::{
     core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts},
     proto::MetricFamily,
     Gauge, IntGaugeVec,
 };
+use once_cell::sync::Lazy;
 use postgres_ffi::XLogSegNo;
 use utils::{id::TenantTimelineId, lsn::Lsn};
 
@@ -16,6 +19,85 @@ use crate::{
     GlobalTimelines,
 };
 
+// Global metrics across all timelines.
+pub static WRITE_WAL_BYTES: Lazy = Lazy::new(|| {
+    register_histogram!(
+        "safekeeper_write_wal_bytes",
+        "Bytes written to WAL in a single request",
+        vec![
+            1.0,
+            10.0,
+            100.0,
+            1024.0,
+            8192.0,
+            128.0 * 1024.0,
+            1024.0 * 1024.0,
+            10.0 * 1024.0 * 1024.0
+        ]
+    )
+    .expect("Failed to register safekeeper_write_wal_bytes histogram")
+});
+pub static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| {
+    register_histogram!(
+        "safekeeper_write_wal_seconds",
+        "Seconds spent writing and syncing WAL to a disk in a single request",
+        DISK_WRITE_SECONDS_BUCKETS.to_vec()
+    )
+    .expect("Failed to register safekeeper_write_wal_seconds histogram")
+});
+pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| {
+    register_histogram!(
+        "safekeeper_flush_wal_seconds",
+        "Seconds spent syncing WAL to a disk",
+        DISK_WRITE_SECONDS_BUCKETS.to_vec()
+    )
+    .expect("Failed to register safekeeper_flush_wal_seconds histogram")
+});
+pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| {
+    register_histogram!(
+        "safekeeper_persist_control_file_seconds",
+        "Seconds to persist and sync control file",
+        DISK_WRITE_SECONDS_BUCKETS.to_vec()
+    )
+    .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec")
+});
+
+/// Metrics for WalStorage in a single timeline.
+#[derive(Clone, Default)]
+pub struct WalStorageMetrics {
+    /// How many bytes were written in total. 
+ write_wal_bytes: u64, + /// How much time spent writing WAL to disk, waiting for write(2). + write_wal_seconds: f64, + /// How much time spent syncing WAL to disk, waiting for fsync(2). + flush_wal_seconds: f64, +} + +impl WalStorageMetrics { + pub fn observe_write_bytes(&mut self, bytes: usize) { + self.write_wal_bytes += bytes as u64; + WRITE_WAL_BYTES.observe(bytes as f64); + } + + pub fn observe_write_seconds(&mut self, seconds: f64) { + self.write_wal_seconds += seconds; + WRITE_WAL_SECONDS.observe(seconds); + } + + pub fn observe_flush_seconds(&mut self, seconds: f64) { + self.flush_wal_seconds += seconds; + FLUSH_WAL_SECONDS.observe(seconds); + } +} + +/// Accepts a closure that returns a result, and returns the duration of the closure. +pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result { + let start = std::time::Instant::now(); + closure()?; + Ok(start.elapsed().as_secs_f64()) +} + +/// Metrics for a single timeline. pub struct FullTimelineInfo { pub ttid: TenantTimelineId, pub replicas: Vec, @@ -29,8 +111,11 @@ pub struct FullTimelineInfo { pub persisted_state: SafeKeeperState, pub flush_lsn: Lsn, + + pub wal_storage: WalStorageMetrics, } +/// Collects metrics for all active timelines. pub struct TimelineCollector { descs: Vec, commit_lsn: GenericGaugeVec, @@ -46,6 +131,9 @@ pub struct TimelineCollector { connected_computes: IntGaugeVec, disk_usage: GenericGaugeVec, acceptor_term: GenericGaugeVec, + written_wal_bytes: GenericGaugeVec, + written_wal_seconds: GaugeVec, + flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, } @@ -186,6 +274,36 @@ impl TimelineCollector { .unwrap(); descs.extend(acceptor_term.desc().into_iter().cloned()); + let written_wal_bytes = GenericGaugeVec::new( + Opts::new( + "safekeeper_written_wal_bytes_total", + "Number of WAL bytes written to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(written_wal_bytes.desc().into_iter().cloned()); + + let written_wal_seconds = GaugeVec::new( + Opts::new( + "safekeeper_written_wal_seconds_total", + "Total time spent in write(2) writing WAL to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(written_wal_seconds.desc().into_iter().cloned()); + + let flushed_wal_seconds = GaugeVec::new( + Opts::new( + "safekeeper_flushed_wal_seconds_total", + "Total time spent in fsync(2) flushing WAL to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(flushed_wal_seconds.desc().into_iter().cloned()); + let collect_timeline_metrics = Gauge::new( "safekeeper_collect_timeline_metrics_seconds", "Time spent collecting timeline metrics, including obtaining mutex lock for all timelines", @@ -208,6 +326,9 @@ impl TimelineCollector { connected_computes, disk_usage, acceptor_term, + written_wal_bytes, + written_wal_seconds, + flushed_wal_seconds, collect_timeline_metrics, } } @@ -235,6 +356,9 @@ impl Collector for TimelineCollector { self.connected_computes.reset(); self.disk_usage.reset(); self.acceptor_term.reset(); + self.written_wal_bytes.reset(); + self.written_wal_seconds.reset(); + self.flushed_wal_seconds.reset(); let timelines = GlobalTimelines::get_all(); @@ -292,6 +416,15 @@ impl Collector for TimelineCollector { self.acceptor_term .with_label_values(labels) .set(tli.persisted_state.acceptor_state.term as u64); + self.written_wal_bytes + .with_label_values(labels) + .set(tli.wal_storage.write_wal_bytes); + self.written_wal_seconds + 
.with_label_values(labels) + .set(tli.wal_storage.write_wal_seconds); + self.flushed_wal_seconds + .with_label_values(labels) + .set(tli.wal_storage.flush_wal_seconds); if let Some(feedback) = most_advanced { self.feedback_ps_write_lsn @@ -332,6 +465,9 @@ impl Collector for TimelineCollector { mfs.extend(self.connected_computes.collect()); mfs.extend(self.disk_usage.collect()); mfs.extend(self.acceptor_term.collect()); + mfs.extend(self.written_wal_bytes.collect()); + mfs.extend(self.written_wal_seconds.collect()); + mfs.extend(self.flushed_wal_seconds.collect()); // report time it took to collect all info let elapsed = start_collecting.elapsed().as_secs_f64(); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index d34a77e02b..65340ac0ed 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -998,6 +998,10 @@ mod tests { fn remove_up_to(&self) -> Box Result<()>> { Box::new(move |_segno_up_to: XLogSegNo| Ok(())) } + + fn get_metrics(&self) -> crate::metrics::WalStorageMetrics { + crate::metrics::WalStorageMetrics::default() + } } #[test] diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 4000815857..ec29e13931 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -534,6 +534,7 @@ impl Timeline { mem_state: state.sk.inmem.clone(), persisted_state: state.sk.state.clone(), flush_lsn: state.sk.wal_store.flush_lsn(), + wal_storage: state.sk.wal_store.get_metrics(), }) } else { None diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index ea613dd0f1..692bd18342 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -8,11 +8,11 @@ //! Note that last file has `.partial` suffix, that's different from postgres. use anyhow::{bail, Context, Result}; + use std::io::{self, Seek, SeekFrom}; use std::pin::Pin; use tokio::io::AsyncRead; -use once_cell::sync::Lazy; use postgres_ffi::v14::xlog_utils::{ find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, }; @@ -27,6 +27,7 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; +use crate::metrics::{time_io_closure, WalStorageMetrics}; use crate::safekeeper::SafeKeeperState; use crate::wal_backup::read_object; @@ -36,67 +37,8 @@ use postgres_ffi::XLOG_BLCKSZ; use postgres_ffi::v14::waldecoder::WalStreamDecoder; -use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; - use tokio::io::{AsyncReadExt, AsyncSeekExt}; -// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). -// i64 is faster than f64, so update to u64 when available. 
-static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_write_wal_bytes", - "Bytes written to WAL in a single request, grouped by timeline", - &["tenant_id", "timeline_id"], - vec![ - 1.0, - 10.0, - 100.0, - 1024.0, - 8192.0, - 128.0 * 1024.0, - 1024.0 * 1024.0, - 10.0 * 1024.0 * 1024.0 - ] - ) - .expect("Failed to register safekeeper_write_wal_bytes histogram vec") -}); -static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_write_wal_seconds", - "Seconds spent writing and syncing WAL to a disk in a single request, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register safekeeper_write_wal_seconds histogram vec") -}); -static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_flush_wal_seconds", - "Seconds spent syncing WAL to a disk, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register safekeeper_flush_wal_seconds histogram vec") -}); - -struct WalStorageMetrics { - write_wal_bytes: Histogram, - write_wal_seconds: Histogram, - flush_wal_seconds: Histogram, -} - -impl WalStorageMetrics { - fn new(ttid: &TenantTimelineId) -> Self { - let tenant_id = ttid.tenant_id.to_string(); - let timeline_id = ttid.timeline_id.to_string(); - Self { - write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&tenant_id, &timeline_id]), - write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), - flush_wal_seconds: FLUSH_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), - } - } -} - pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; @@ -113,6 +55,9 @@ pub trait Storage { /// Remove all segments <= given segno. Returns closure as we want to do /// that without timeline lock. fn remove_up_to(&self) -> Box Result<()>>; + + /// Get metrics for this timeline. + fn get_metrics(&self) -> WalStorageMetrics; } /// PhysicalStorage is a storage that stores WAL on disk. Writes are separated from flushes @@ -187,7 +132,7 @@ impl PhysicalStorage { } Ok(PhysicalStorage { - metrics: WalStorageMetrics::new(ttid), + metrics: WalStorageMetrics::default(), timeline_dir, conf: conf.clone(), wal_seg_size, @@ -200,28 +145,26 @@ impl PhysicalStorage { } /// Call fdatasync if config requires so. - fn fdatasync_file(&self, file: &mut File) -> Result<()> { + fn fdatasync_file(&mut self, file: &mut File) -> Result<()> { if !self.conf.no_sync { self.metrics - .flush_wal_seconds - .observe_closure_duration(|| file.sync_data())?; + .observe_flush_seconds(time_io_closure(|| Ok(file.sync_data()?))?); } Ok(()) } /// Call fsync if config requires so. - fn fsync_file(&self, file: &mut File) -> Result<()> { + fn fsync_file(&mut self, file: &mut File) -> Result<()> { if !self.conf.no_sync { self.metrics - .flush_wal_seconds - .observe_closure_duration(|| file.sync_all())?; + .observe_flush_seconds(time_io_closure(|| Ok(file.sync_all()?))?); } Ok(()) } /// Open or create WAL segment file. Caller must call seek to the wanted position. /// Returns `file` and `is_partial`. 
- fn open_or_create(&self, segno: XLogSegNo) -> Result<(File, bool)> { + fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; @@ -335,13 +278,10 @@ impl Storage for PhysicalStorage { ); } - { - let _timer = self.metrics.write_wal_seconds.start_timer(); - self.write_exact(startpos, buf)?; - } - + let write_seconds = time_io_closure(|| self.write_exact(startpos, buf))?; // WAL is written, updating write metrics - self.metrics.write_wal_bytes.observe(buf.len() as f64); + self.metrics.observe_write_seconds(write_seconds); + self.metrics.observe_write_bytes(buf.len()); // figure out last record's end lsn for reporting (if we got the // whole record) @@ -444,6 +384,10 @@ impl Storage for PhysicalStorage { remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to) }) } + + fn get_metrics(&self) -> WalStorageMetrics { + self.metrics.clone() + } } /// Remove all WAL segments in timeline_dir that match the given predicate. From e9a103c09f4e24a70697a3187419b4a51b024209 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 21 Sep 2022 21:42:47 +0300 Subject: [PATCH 33/90] [proxy] Pass extra parameters to the console (#2467) With this change we now pass additional params to the console's auth methods. --- Cargo.lock | 6 ++ proxy/Cargo.toml | 8 +- proxy/src/auth.rs | 2 +- proxy/src/auth/backend.rs | 132 +++++++++++++++--------------- proxy/src/auth/backend/console.rs | 57 +++++++++---- proxy/src/auth/backend/link.rs | 6 +- proxy/src/config.rs | 10 +-- proxy/src/http.rs | 92 ++++++++++++++++----- proxy/src/http/server.rs | 27 ++++++ proxy/src/main.rs | 48 +++++------ proxy/src/proxy.rs | 24 +++--- proxy/src/url.rs | 12 +-- workspace_hack/Cargo.toml | 1 + 13 files changed, 259 insertions(+), 166 deletions(-) create mode 100644 proxy/src/http/server.rs diff --git a/Cargo.lock b/Cargo.lock index fc4ef90b8b..0579d381cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2283,6 +2283,7 @@ dependencies = [ "tokio-rustls", "url", "utils", + "uuid", "workspace_hack", "x509-parser", ] @@ -3663,6 +3664,10 @@ name = "uuid" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" +dependencies = [ + "getrandom", + "serde", +] [[package]] name = "valuable" @@ -3953,6 +3958,7 @@ dependencies = [ "tokio-util", "tracing", "tracing-core", + "uuid", ] [[package]] diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5417f4f2b3..7d0449cd1a 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -11,13 +11,14 @@ bstr = "0.2.17" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" futures = "0.3.13" +git-version = "0.3.5" hashbrown = "0.12" hex = "0.4.3" hmac = "0.12.1" hyper = "0.14" itertools = "0.10.3" -once_cell = "1.13.0" md5 = "0.7.0" +once_cell = "1.13.0" parking_lot = "0.12" pin-project-lite = "0.2.7" rand = "0.8.3" @@ -35,14 +36,13 @@ tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" url = "2.2.2" -git-version = "0.3.5" +uuid = { version = "0.8.2", features = ["v4", "serde"]} +x509-parser = "0.13.2" utils = { path = "../libs/utils" } metrics = { path = "../libs/metrics" } workspace_hack = { version = "0.1", path = "../workspace_hack" } -x509-parser = "0.13.2" - [dev-dependencies] rcgen = "0.8.14" rstest = "0.12" 
diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index a50d23e351..2df4f9d920 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,7 +1,7 @@ //! Client authentication mechanisms. pub mod backend; -pub use backend::{BackendType, DatabaseInfo}; +pub use backend::{BackendType, ConsoleReqExtra, DatabaseInfo}; mod credentials; pub use credentials::ClientCredentials; diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index de0719a196..7e93a32950 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -8,13 +8,12 @@ pub use console::{GetAuthInfoError, WakeComputeError}; use crate::{ auth::{self, AuthFlow, ClientCredentials}, - compute, config, mgmt, - stream::PqStream, + compute, http, mgmt, stream, url, waiters::{self, Waiter, Waiters}, }; - use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; +use std::borrow::Cow; use tokio::io::{AsyncRead, AsyncWrite}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); @@ -75,6 +74,14 @@ impl From for tokio_postgres::Config { } } +/// Extra query params we'd like to pass to the console. +pub struct ConsoleReqExtra<'a> { + /// A unique identifier for a connection. + pub session_id: uuid::Uuid, + /// Name of client application, if set. + pub application_name: Option<&'a str>, +} + /// This type serves two purposes: /// /// * When `T` is `()`, it's just a regular auth backend selector @@ -83,53 +90,83 @@ impl From for tokio_postgres::Config { /// * However, when we substitute `T` with [`ClientCredentials`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum BackendType { +#[derive(Debug)] +pub enum BackendType<'a, T> { /// Current Cloud API (V2). - Console(T), + Console(Cow<'a, http::Endpoint>, T), /// Local mock of Cloud API (V2). - Postgres(T), + Postgres(Cow<'a, url::ApiUrl>, T), /// Authentication via a web browser. - Link, + Link(Cow<'a, url::ApiUrl>), } -impl BackendType { +impl std::fmt::Display for BackendType<'_, ()> { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use BackendType::*; + match self { + Console(endpoint, _) => fmt + .debug_tuple("Console") + .field(&endpoint.url().as_str()) + .finish(), + Postgres(endpoint, _) => fmt + .debug_tuple("Postgres") + .field(&endpoint.as_str()) + .finish(), + Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + } + } +} + +impl BackendType<'_, T> { + /// Very similar to [`std::option::Option::as_ref`]. + /// This helps us pass structured config to async tasks. + pub fn as_ref(&self) -> BackendType<'_, &T> { + use BackendType::*; + match self { + Console(c, x) => Console(Cow::Borrowed(c), x), + Postgres(c, x) => Postgres(Cow::Borrowed(c), x), + Link(c) => Link(Cow::Borrowed(c)), + } + } +} + +impl<'a, T> BackendType<'a, T> { /// Very similar to [`std::option::Option::map`]. /// Maps [`BackendType`] to [`BackendType`] by applying /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType { + pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R> { use BackendType::*; match self { - Console(x) => Console(f(x)), - Postgres(x) => Postgres(f(x)), - Link => Link, + Console(c, x) => Console(c, f(x)), + Postgres(c, x) => Postgres(c, f(x)), + Link(c) => Link(c), } } } -impl BackendType> { +impl<'a, T, E> BackendType<'a, Result> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. 
- pub fn transpose(self) -> Result, E> { + pub fn transpose(self) -> Result, E> { use BackendType::*; match self { - Console(x) => x.map(Console), - Postgres(x) => x.map(Postgres), - Link => Ok(Link), + Console(c, x) => x.map(|x| Console(c, x)), + Postgres(c, x) => x.map(|x| Postgres(c, x)), + Link(c) => Ok(Link(c)), } } } -impl BackendType> { +impl BackendType<'_, ClientCredentials<'_>> { /// Authenticate the client via the requested backend, possibly using credentials. pub async fn authenticate( mut self, - urls: &config::AuthUrls, - client: &mut PqStream, + extra: &ConsoleReqExtra<'_>, + client: &mut stream::PqStream, ) -> super::Result { use BackendType::*; - if let Console(creds) | Postgres(creds) = &mut self { + if let Console(_, creds) | Postgres(_, creds) = &mut self { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the project name. // We now expect to see a very specific payload in the place of password. @@ -145,15 +182,13 @@ impl BackendType> { creds.project = Some(payload.project.into()); let mut config = match &self { - Console(creds) => { - console::Api::new(&urls.auth_endpoint, creds) + Console(endpoint, creds) => { + console::Api::new(endpoint, extra, creds) .wake_compute() .await? } - Postgres(creds) => { - postgres::Api::new(&urls.auth_endpoint, creds) - .wake_compute() - .await? + Postgres(endpoint, creds) => { + postgres::Api::new(endpoint, creds).wake_compute().await? } _ => unreachable!("see the patterns above"), }; @@ -169,49 +204,18 @@ impl BackendType> { } match self { - Console(creds) => { - console::Api::new(&urls.auth_endpoint, &creds) + Console(endpoint, creds) => { + console::Api::new(&endpoint, extra, &creds) .handle_user(client) .await } - Postgres(creds) => { - postgres::Api::new(&urls.auth_endpoint, &creds) + Postgres(endpoint, creds) => { + postgres::Api::new(&endpoint, &creds) .handle_user(client) .await } // NOTE: this auth backend doesn't use client credentials. - Link => link::handle_user(&urls.auth_link_uri, client).await, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_backend_type_map() { - let values = [ - BackendType::Console(0), - BackendType::Postgres(0), - BackendType::Link, - ]; - - for value in values { - assert_eq!(value.map(|x| x), value); - } - } - - #[test] - fn test_backend_type_transpose() { - let values = [ - BackendType::Console(Ok::<_, ()>(0)), - BackendType::Postgres(Ok(0)), - BackendType::Link, - ]; - - for value in values { - assert_eq!(value.map(Result::unwrap), value.transpose().unwrap()); + Link(url) => link::handle_user(&url, client).await, } } } diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index e239320e9b..e5ee07813c 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -1,12 +1,12 @@ //! Cloud API V2. +use super::ConsoleReqExtra; use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute::{self, ComputeConnCfg}, error::{io_error, UserFacingError}, - scram, + http, scram, stream::PqStream, - url::ApiUrl, }; use serde::{Deserialize, Serialize}; use std::future::Future; @@ -120,14 +120,23 @@ pub enum AuthInfo { #[must_use] pub(super) struct Api<'a> { - endpoint: &'a ApiUrl, + endpoint: &'a http::Endpoint, + extra: &'a ConsoleReqExtra<'a>, creds: &'a ClientCredentials<'a>, } impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. 
- pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self { - Self { endpoint, creds } + pub(super) fn new( + endpoint: &'a http::Endpoint, + extra: &'a ConsoleReqExtra<'a>, + creds: &'a ClientCredentials, + ) -> Self { + Self { + endpoint, + extra, + creds, + } } /// Authenticate the existing user or throw an error. @@ -139,16 +148,22 @@ impl<'a> Api<'a> { } async fn get_auth_info(&self) -> Result { - let mut url = self.endpoint.clone(); - url.path_segments_mut().push("proxy_get_role_secret"); - url.query_pairs_mut() - .append_pair("project", self.creds.project().expect("impossible")) - .append_pair("role", self.creds.user); + let req = self + .endpoint + .get("proxy_get_role_secret") + .header("X-Request-ID", uuid::Uuid::new_v4().to_string()) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ("role", Some(self.creds.user)), + ]) + .build()?; // TODO: use a proper logger - println!("cplane request: {url}"); + println!("cplane request: {}", req.url()); - let resp = reqwest::get(url.into_inner()).await?; + let resp = self.endpoint.execute(req).await?; if !resp.status().is_success() { return Err(TransportError::HttpStatus(resp.status()).into()); } @@ -162,15 +177,21 @@ impl<'a> Api<'a> { /// Wake up the compute node and return the corresponding connection info. pub(super) async fn wake_compute(&self) -> Result { - let mut url = self.endpoint.clone(); - url.path_segments_mut().push("proxy_wake_compute"); - url.query_pairs_mut() - .append_pair("project", self.creds.project().expect("impossible")); + let req = self + .endpoint + .get("proxy_wake_compute") + .header("X-Request-ID", uuid::Uuid::new_v4().to_string()) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ]) + .build()?; // TODO: use a proper logger - println!("cplane request: {url}"); + println!("cplane request: {}", req.url()); - let resp = reqwest::get(url.into_inner()).await?; + let resp = self.endpoint.execute(req).await?; if !resp.status().is_success() { return Err(TransportError::HttpStatus(resp.status()).into()); } diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index d740a4c5c4..eefa246eba 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -29,7 +29,7 @@ impl UserFacingError for LinkAuthError { } } -fn hello_message(redirect_uri: &str, session_id: &str) -> String { +fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String { format!( concat![ "Welcome to Neon!\n", @@ -46,11 +46,11 @@ pub fn new_psql_session_id() -> String { } pub async fn handle_user( - redirect_uri: &reqwest::Url, + link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { let psql_session_id = new_psql_session_id(); - let greeting = hello_message(redirect_uri.as_str(), &psql_session_id); + let greeting = hello_message(link_uri, &psql_session_id); let db_info = super::with_waiter(psql_session_id, |waiter| async { // Give user a URL to spawn a new database diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 8835d660d5..031fa84509 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,16 +1,10 @@ -use crate::{auth, url::ApiUrl}; +use crate::auth; use anyhow::{ensure, Context}; use std::sync::Arc; pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: 
auth::BackendType<()>, - pub auth_urls: AuthUrls, -} - -pub struct AuthUrls { - pub auth_endpoint: ApiUrl, - pub auth_link_uri: ApiUrl, + pub auth_backend: auth::BackendType<'static, ()>, } pub struct TlsConfig { diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 5a75718742..dbeb3dc784 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -1,27 +1,81 @@ -use anyhow::anyhow; -use hyper::{Body, Request, Response, StatusCode}; -use std::net::TcpListener; -use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; +pub mod server; -async fn status_handler(_: Request) -> Result, ApiError> { - json_response(StatusCode::OK, "") +use crate::url::ApiUrl; + +/// Thin convenience wrapper for an API provided by an http endpoint. +#[derive(Debug, Clone)] +pub struct Endpoint { + /// API's base URL. + endpoint: ApiUrl, + /// Connection manager with built-in pooling. + client: reqwest::Client, } -fn make_router() -> RouterBuilder { - let router = endpoint::make_router(); - router.get("/v1/status", status_handler) -} - -pub async fn thread_main(http_listener: TcpListener) -> anyhow::Result<()> { - scopeguard::defer! { - println!("http has shut down"); +impl Endpoint { + /// Construct a new HTTP endpoint wrapper. + pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self { + Self { endpoint, client } } - let service = || RouterService::new(make_router().build()?); + pub fn url(&self) -> &ApiUrl { + &self.endpoint + } - hyper::Server::from_tcp(http_listener)? - .serve(service().map_err(|e| anyhow!(e))?) - .await?; + /// Return a [builder](reqwest::RequestBuilder) for a `GET` request, + /// appending a single `path` segment to the base endpoint URL. + pub fn get(&self, path: &str) -> reqwest::RequestBuilder { + let mut url = self.endpoint.clone(); + url.path_segments_mut().push(path); + self.client.get(url.into_inner()) + } - Ok(()) + /// Execute a [request](reqwest::Request). + pub async fn execute( + &self, + request: reqwest::Request, + ) -> Result { + self.client.execute(request).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn optional_query_params() -> anyhow::Result<()> { + let url = "http://example.com".parse()?; + let endpoint = Endpoint::new(url, reqwest::Client::new()); + + // Validate that this pattern makes sense. 
+ let req = endpoint + .get("frobnicate") + .query(&[ + ("foo", Some("10")), // should be just `foo=10` + ("bar", None), // shouldn't be passed at all + ]) + .build()?; + + assert_eq!(req.url().as_str(), "http://example.com/frobnicate?foo=10"); + + Ok(()) + } + + #[test] + fn uuid_params() -> anyhow::Result<()> { + let url = "http://example.com".parse()?; + let endpoint = Endpoint::new(url, reqwest::Client::new()); + + let req = endpoint + .get("frobnicate") + .query(&[("session_id", uuid::Uuid::nil())]) + .build()?; + + assert_eq!( + req.url().as_str(), + "http://example.com/frobnicate?session_id=00000000-0000-0000-0000-000000000000" + ); + + Ok(()) + } } diff --git a/proxy/src/http/server.rs b/proxy/src/http/server.rs new file mode 100644 index 0000000000..5a75718742 --- /dev/null +++ b/proxy/src/http/server.rs @@ -0,0 +1,27 @@ +use anyhow::anyhow; +use hyper::{Body, Request, Response, StatusCode}; +use std::net::TcpListener; +use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; + +async fn status_handler(_: Request) -> Result, ApiError> { + json_response(StatusCode::OK, "") +} + +fn make_router() -> RouterBuilder { + let router = endpoint::make_router(); + router.get("/v1/status", status_handler) +} + +pub async fn thread_main(http_listener: TcpListener) -> anyhow::Result<()> { + scopeguard::defer! { + println!("http has shut down"); + } + + let service = || RouterService::new(make_router().build()?); + + hyper::Server::from_tcp(http_listener)? + .serve(service().map_err(|e| anyhow!(e))?) + .await?; + + Ok(()) +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index efe45f6386..f2dc7425ba 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -23,7 +23,7 @@ use anyhow::{bail, Context}; use clap::{self, Arg}; use config::ProxyConfig; use futures::FutureExt; -use std::{future::Future, net::SocketAddr}; +use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use utils::project_git_version; @@ -36,23 +36,6 @@ async fn flatten_err( f.map(|r| r.context("join error").and_then(|x| x)).await } -/// A proper parser for auth backend parameter. -impl clap::ValueEnum for auth::BackendType<()> { - fn value_variants<'a>() -> &'a [Self] { - use auth::BackendType::*; - &[Console(()), Postgres(()), Link] - } - - fn to_possible_value<'a>(&self) -> Option> { - use auth::BackendType::*; - Some(clap::PossibleValue::new(match self { - Console(_) => "console", - Postgres(_) => "postgres", - Link => "link", - })) - } -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let arg_matches = clap::App::new("Neon proxy/router") @@ -69,7 +52,7 @@ async fn main() -> anyhow::Result<()> { Arg::new("auth-backend") .long("auth-backend") .takes_value(true) - .value_parser(clap::builder::EnumValueParser::>::new()) + .possible_values(["console", "postgres", "link"]) .default_value("link"), ) .arg( @@ -135,23 +118,30 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?; - let auth_backend = *arg_matches - .try_get_one::>("auth-backend")? 
- .unwrap(); - - let auth_urls = config::AuthUrls { - auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, - auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, + let auth_backend = match arg_matches.value_of("auth-backend").unwrap() { + "console" => { + let url = arg_matches.value_of("auth-endpoint").unwrap().parse()?; + let endpoint = http::Endpoint::new(url, reqwest::Client::new()); + auth::BackendType::Console(Cow::Owned(endpoint), ()) + } + "postgres" => { + let url = arg_matches.value_of("auth-endpoint").unwrap().parse()?; + auth::BackendType::Postgres(Cow::Owned(url), ()) + } + "link" => { + let url = arg_matches.value_of("uri").unwrap().parse()?; + auth::BackendType::Link(Cow::Owned(url)) + } + other => bail!("unsupported auth backend: {other}"), }; let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, - auth_urls, })); println!("Version: {GIT_VERSION}"); - println!("Authentication backend: {:?}", config.auth_backend); + println!("Authentication backend: {}", config.auth_backend); // Check that we can bind to address before further initialization println!("Starting http on {}", http_address); @@ -164,7 +154,7 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = TcpListener::bind(proxy_address).await?; let tasks = [ - tokio::spawn(http::thread_main(http_listener)), + tokio::spawn(http::server::thread_main(http_listener)), tokio::spawn(proxy::thread_main(config, proxy_listener)), tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)), ] diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 72cb822910..efb1b6f358 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,6 +1,6 @@ use crate::auth; use crate::cancellation::{self, CancelMap}; -use crate::config::{AuthUrls, ProxyConfig, TlsConfig}; +use crate::config::{ProxyConfig, TlsConfig}; use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; use futures::TryFutureExt; @@ -99,6 +99,7 @@ async fn handle_client( let common_name = tls.and_then(|tls| tls.common_name.as_deref()); let result = config .auth_backend + .as_ref() .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name)) .transpose(); @@ -107,7 +108,7 @@ async fn handle_client( let client = Client::new(stream, creds, ¶ms); cancel_map - .with_session(|session| client.connect_to_db(&config.auth_urls, session)) + .with_session(|session| client.connect_to_db(session)) .await } @@ -179,7 +180,7 @@ struct Client<'a, S> { /// The underlying libpq protocol stream. stream: PqStream, /// Client credentials that we care about. - creds: auth::BackendType>, + creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, /// KV-dictionary with PostgreSQL connection params. params: &'a StartupMessageParams, } @@ -188,7 +189,7 @@ impl<'a, S> Client<'a, S> { /// Construct a new connection context. fn new( stream: PqStream, - creds: auth::BackendType>, + creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, params: &'a StartupMessageParams, ) -> Self { Self { @@ -201,19 +202,22 @@ impl<'a, S> Client<'a, S> { impl Client<'_, S> { /// Let the client authenticate and connect to the designated compute node. 
-    async fn connect_to_db(
-        self,
-        urls: &AuthUrls,
-        session: cancellation::Session<'_>,
-    ) -> anyhow::Result<()> {
+    async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> {
         let Self {
             mut stream,
             creds,
             params,
         } = self;

+        let extra = auth::ConsoleReqExtra {
+            // Currently it's OK to generate a new UUID **here**, but
+            // it might be better to move this to `cancellation::Session`.
+            session_id: uuid::Uuid::new_v4(),
+            application_name: params.get("application_name"),
+        };
+
         // Authenticate and connect to a compute node.
-        let auth = creds.authenticate(urls, &mut stream).await;
+        let auth = creds.authenticate(&extra, &mut stream).await;
         let node = async { auth }.or_else(|e| stream.throw_error(e)).await?;

         let reported_auth_ok = node.reported_auth_ok;
diff --git a/proxy/src/url.rs b/proxy/src/url.rs
index 76d6ad0e66..92c64bb8ad 100644
--- a/proxy/src/url.rs
+++ b/proxy/src/url.rs
@@ -1,8 +1,8 @@
 use anyhow::bail;
-use url::form_urlencoded::Serializer;

 /// A [url](url::Url) type with additional guarantees.
-#[derive(Debug, Clone)]
+#[repr(transparent)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ApiUrl(url::Url);

 impl ApiUrl {
@@ -11,11 +11,6 @@ impl ApiUrl {
         self.0
     }

-    /// See [`url::Url::query_pairs_mut`].
-    pub fn query_pairs_mut(&mut self) -> Serializer<'_, url::UrlQuery<'_>> {
-        self.0.query_pairs_mut()
-    }
-
     /// See [`url::Url::path_segments_mut`].
     pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut {
         // We've already verified that it works during construction.
@@ -72,10 +67,7 @@ mod tests {
         let mut b = url.parse::<ApiUrl>().expect("unexpected parsing failure");

         a.path_segments_mut().unwrap().push("method");
-        a.query_pairs_mut().append_pair("key", "value");
-
         b.path_segments_mut().push("method");
-        b.query_pairs_mut().append_pair("key", "value");

         assert_eq!(a, b.into_inner());
     }
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index dc4cbb5284..3670ca5fea 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -43,6 +43,7 @@ tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
 tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] }
 tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] }
 tracing-core = { version = "0.1", features = ["once_cell", "std"] }
+uuid = { version = "0.8", features = ["getrandom", "serde", "std", "v4"] }

 [build-dependencies]
 ahash = { version = "0.7", features = ["std"] }

From f3073a4db93e2d4e39e2bbef03ed6b742ef3afa0 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Thu, 22 Sep 2022 08:35:06 +0300
Subject: [PATCH 34/90] R-Tree layer map (#2317)

Replace the layer array and linear search with an R-tree.

So far, the in-memory layer map that holds information about the layer
files that exist has used a simple Vec, in no particular order. That
obviously doesn't scale very well; with thousands of layer files, the
linear search was consuming a lot of CPU.

Replace it with a two-dimensional R-tree, with Key and LSN ranges as
the dimensions. For the R-tree, use the 'rstar' crate. To be able to
use that, we convert the Keys and LSNs into 256-bit integers. 64 bits
would be enough to represent LSNs, and 128 bits would be enough to
represent Keys.
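To make the overflow hazard concrete before the explanation continues: an R-tree computes rectangle areas as products of side lengths, and two spans that each fit comfortably in an `i128` can have a product that does not. A minimal sketch with hypothetical span values, `checked_mul` standing in for the multiplication rstar performs internally:

```rust
fn main() {
    // Each span fits easily in i128 (keys are 128-bit, LSNs 64-bit)...
    let key_span: i128 = 1 << 100;
    let lsn_span: i128 = 1 << 40;

    // ...but their product needs 140 bits, so 128-bit area math overflows.
    // With overflow-checks enabled, a plain `*` here would panic instead.
    assert_eq!(key_span.checked_mul(lsn_span), None);
}
```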
However, we use 256 bits, because rstar internally performs multiplication to calculate the area of rectangles, and the result of multiplying two 128 bit integers doesn't necessarily fit in 128 bits, causing integer overflow and, if overflow-checks are enabled, panic. To avoid that, we use 256 bit integers. Add a performance test that creates a lot of layer files, to demonstrate the benefit. --- Cargo.lock | 222 +++++++++++++- pageserver/Cargo.toml | 3 + pageserver/src/repository.rs | 13 + pageserver/src/tenant/delta_layer.rs | 2 +- pageserver/src/tenant/layer_map.rs | 347 +++++++++++++++++----- pageserver/src/tenant/timeline.rs | 2 +- test_runner/performance/test_layer_map.py | 39 +++ workspace_hack/Cargo.toml | 3 +- 8 files changed, 548 insertions(+), 83 deletions(-) create mode 100644 test_runner/performance/test_layer_map.py diff --git a/Cargo.lock b/Cargo.lock index 0579d381cc..ddb10352b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -37,6 +37,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "amplify_num" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27d3d00d3d115395a7a8a4dc045feb7aa82b641e485f7e15f4e67ac16f4f56d" + [[package]] name = "ansi_term" version = "0.12.1" @@ -135,6 +141,15 @@ dependencies = [ "syn", ] +[[package]] +name = "atomic-polyfill" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c041a8d9751a520ee19656232a18971f18946a7900f1520ee4400002244dd89" +dependencies = [ + "critical-section", +] + [[package]] name = "atty" version = "0.2.14" @@ -212,6 +227,21 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "bare-metal" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5deb64efa5bd81e31fcd1938615a6d98c82eafcbcd787162b6f63b91d6bac5b3" +dependencies = [ + "rustc_version 0.2.3", +] + +[[package]] +name = "bare-metal" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fe8f5a8a398345e52358e18ff07cc17a568fbca5c6f73873d3a62056309603" + [[package]] name = "base64" version = "0.13.0" @@ -250,6 +280,18 @@ dependencies = [ "which", ] +[[package]] +name = "bit_field" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb6dd1c2376d2e096796e234a70e17e94cc2d5d54ff8ce42b28cef1d0d359a4" + +[[package]] +name = "bitfield" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719" + [[package]] name = "bitflags" version = "1.3.2" @@ -528,6 +570,18 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +[[package]] +name = "cortex-m" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70858629a458fdfd39f9675c4dc309411f2a3f83bede76988d81bf1a0ecee9e0" +dependencies = [ + "bare-metal 0.2.5", + "bitfield", + "embedded-hal", + "volatile-register", +] + [[package]] name = "cpp_demangle" version = "0.3.5" @@ -552,7 +606,7 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" dependencies = [ - "rustc_version", + "rustc_version 0.4.0", ] [[package]] @@ -600,6 +654,18 @@ dependencies = [ "itertools", ] +[[package]] +name = "critical-section" +version = "0.2.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "95da181745b56d4bd339530ec393508910c909c784e8962d15d722bacf0bcbcd" +dependencies = [ + "bare-metal 1.0.0", + "cfg-if", + "cortex-m", + "riscv", +] + [[package]] name = "crossbeam-channel" version = "0.5.6" @@ -844,6 +910,16 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be" +[[package]] +name = "embedded-hal" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35949884794ad573cf46071e41c9b60efb0cb311e3ca01f7af807af1debc66ff" +dependencies = [ + "nb 0.1.3", + "void", +] + [[package]] name = "encoding_rs" version = "0.8.31" @@ -1165,6 +1241,15 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "hash32" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" +dependencies = [ + "byteorder", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -1174,6 +1259,19 @@ dependencies = [ "ahash", ] +[[package]] +name = "heapless" +version = "0.7.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" +dependencies = [ + "atomic-polyfill", + "hash32", + "rustc_version 0.4.0", + "spin 0.9.4", + "stable_deref_trait", +] + [[package]] name = "heck" version = "0.3.3" @@ -1491,6 +1589,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "libm" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" + [[package]] name = "lock_api" version = "0.4.7" @@ -1649,6 +1753,21 @@ dependencies = [ "tempfile", ] +[[package]] +name = "nb" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "801d31da0513b6ec5214e9bf433a77966320625a37860f910be265be6e18d06f" +dependencies = [ + "nb 1.0.0", +] + +[[package]] +name = "nb" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "546c37ac5d9e56f55e73b677106873d9d9f5190605e41a856503623648488cae" + [[package]] name = "nix" version = "0.23.1" @@ -1716,6 +1835,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -1828,6 +1948,7 @@ checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4" name = "pageserver" version = "0.1.0" dependencies = [ + "amplify_num", "anyhow", "async-stream", "async-trait", @@ -1852,6 +1973,7 @@ dependencies = [ "itertools", "metrics", "nix", + "num-traits", "once_cell", "postgres", "postgres-protocol", @@ -1861,6 +1983,7 @@ dependencies = [ "rand", "regex", "remote_storage", + "rstar", "scopeguard", "serde", "serde_json", @@ -2515,12 +2638,33 @@ dependencies = [ "cc", "libc", "once_cell", - "spin", + "spin 0.5.2", "untrusted", "web-sys", "winapi", ] +[[package]] +name = "riscv" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6907ccdd7a31012b70faf2af85cd9e5ba97657cc3987c4f13f8e4d2c2a088aba" +dependencies = [ + "bare-metal 1.0.0", + "bit_field", + "riscv-target", +] + +[[package]] +name = 
"riscv-target" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88aa938cda42a0cf62a20cfe8d139ff1af20c2e681212b5b34adb5a58333f222" +dependencies = [ + "lazy_static", + "regex", +] + [[package]] name = "routerify" version = "3.0.0" @@ -2534,6 +2678,17 @@ dependencies = [ "regex", ] +[[package]] +name = "rstar" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa" +dependencies = [ + "heapless", + "num-traits", + "smallvec", +] + [[package]] name = "rstest" version = "0.12.0" @@ -2543,7 +2698,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "rustc_version", + "rustc_version 0.4.0", "syn", ] @@ -2565,7 +2720,7 @@ dependencies = [ "log", "rusoto_credential", "rusoto_signature", - "rustc_version", + "rustc_version 0.4.0", "serde", "serde_json", "tokio", @@ -2623,7 +2778,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rusoto_credential", - "rustc_version", + "rustc_version 0.4.0", "serde", "sha2 0.9.9", "tokio", @@ -2641,13 +2796,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver 0.9.0", +] + [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver", + "semver 1.0.13", ] [[package]] @@ -2800,12 +2964,27 @@ dependencies = [ "libc", ] +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + [[package]] name = "semver" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93f6841e709003d68bb2deee8c343572bf446003ec20a583e76f7b15cebf3711" +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + [[package]] name = "serde" version = "1.0.142" @@ -2999,6 +3178,15 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +[[package]] +name = "spin" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" +dependencies = [ + "lock_api", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -3675,6 +3863,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcell" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77439c1b53d2303b20d9459b1ade71a83c716e3f9c34f3228c00e6f185d6c002" + [[package]] name = "vcpkg" version = "0.2.15" @@ -3687,6 +3881,21 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" + +[[package]] +name = "volatile-register" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ee8f19f9d74293faf70901bc20ad067dc1ad390d2cbf1e3f75f721ffee908b6" +dependencies = [ + "vcell", +] + [[package]] name = "wal_craft" version = "0.1.0" @@ -3952,6 +4161,7 @@ dependencies = [ "regex-syntax", "scopeguard", "serde", + "stable_deref_trait", "syn", "time 0.3.12", "tokio", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 85ece97d9b..1ec7ec4f98 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -54,6 +54,9 @@ once_cell = "1.13.0" crossbeam-utils = "0.8.5" fail = "0.5.0" git-version = "0.3.5" +rstar = "0.9.3" +num-traits = "0.2.15" +amplify_num = "0.4.1" postgres_ffi = { path = "../libs/postgres_ffi" } etcd_broker = { path = "../libs/etcd_broker" } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index cfcc87a2ed..0c2fedd7d5 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -24,6 +24,19 @@ pub struct Key { pub const KEY_SIZE: usize = 18; impl Key { + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. + /// As long as Neon does not support tablespace (because of lack of access to local file system), + /// we can assume that only some predefined namespace OIDs are used which can fit in u16 + pub fn to_i128(&self) -> i128 { + assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + (((self.field1 & 0xf) as i128) << 120) + | (((self.field2 & 0xFFFF) as i128) << 104) + | ((self.field3 as i128) << 72) + | ((self.field4 as i128) << 40) + | ((self.field5 as i128) << 32) + | self.field6 as i128 + } + pub fn next(&self) -> Key { self.add(1) } diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index 892000c20b..57c5be91a4 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -713,7 +713,7 @@ impl DeltaLayerWriter { for buf in block_buf.blocks { file.write_all(buf.as_ref())?; } - + assert!(self.lsn_range.start < self.lsn_range.end); // Fill in the summary on blk 0 let summary = Summary { magic: DELTA_FILE_MAGIC, diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 8abeebf54c..495833e3ae 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -15,9 +15,15 @@ use crate::repository::Key; use crate::tenant::inmemory_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; use crate::tenant::storage_layer::{range_eq, range_overlaps}; +use amplify_num::i256; use anyhow::Result; +use num_traits::identities::{One, Zero}; +use num_traits::{Bounded, Num, Signed}; +use rstar::{RTree, RTreeObject, AABB}; +use std::cmp::Ordering; use std::collections::VecDeque; use std::ops::Range; +use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; use std::sync::Arc; use tracing::*; use utils::lsn::Lsn; @@ -47,14 +53,163 @@ pub struct LayerMap { pub frozen_layers: VecDeque>, /// All the historic layers are kept here + historic_layers: RTree, - /// TODO: This is a placeholder implementation of a data structure - /// to hold information about all the layer files on disk and in - /// S3. 
Currently, it's just a vector and all operations perform a
-    /// linear scan over it. That obviously becomes slow as the
-    /// number of layers grows. I'm imagining that an R-tree or some
-    /// other 2D data structure would be the long-term solution here.
-    historic_layers: Vec<Arc<dyn Layer>>,
+    /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
+    /// So L0 layers are held in the l0_delta_layers vector, in addition to the R-tree.
+    l0_delta_layers: Vec<Arc<dyn Layer>>,
+}
+
+struct LayerRTreeObject {
+    layer: Arc<dyn Layer>,
+}
+
+// Representation of Key as a numeric type.
+// We cannot use the native i128 implementation, because rstar::RTree
+// does not handle integer overflow properly during area calculation: sum(Xi*Yi).
+// Overflow causes a panic in debug mode and an incorrect area calculation in release
+// mode, which leads to a non-optimally balanced R-Tree (but does not affect the
+// correctness of R-Tree operations).
+// By using i256 as the type, even though all the actual values would fit in i128, we can be
+// sure that the multiplication doesn't overflow.
+//
+
+#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)]
+struct IntKey(i256);
+
+impl Copy for IntKey {}
+
+impl IntKey {
+    fn from(i: i128) -> Self {
+        IntKey(i256::from(i))
+    }
+}
+
+impl Bounded for IntKey {
+    fn min_value() -> Self {
+        IntKey(i256::MIN)
+    }
+    fn max_value() -> Self {
+        IntKey(i256::MAX)
+    }
+}
+
+impl Signed for IntKey {
+    fn is_positive(&self) -> bool {
+        self.0 > i256::ZERO
+    }
+    fn is_negative(&self) -> bool {
+        self.0 < i256::ZERO
+    }
+    fn signum(&self) -> Self {
+        match self.0.cmp(&i256::ZERO) {
+            Ordering::Greater => IntKey(i256::ONE),
+            Ordering::Less => IntKey(-i256::ONE),
+            Ordering::Equal => IntKey(i256::ZERO),
+        }
+    }
+    fn abs(&self) -> Self {
+        IntKey(self.0.abs())
+    }
+    fn abs_sub(&self, other: &Self) -> Self {
+        if self.0 <= other.0 {
+            IntKey(i256::ZERO)
+        } else {
+            IntKey(self.0 - other.0)
+        }
+    }
+}
+
+impl Neg for IntKey {
+    type Output = Self;
+    fn neg(self) -> Self::Output {
+        IntKey(-self.0)
+    }
+}
+
+impl Rem for IntKey {
+    type Output = Self;
+    fn rem(self, rhs: Self) -> Self::Output {
+        IntKey(self.0 % rhs.0)
+    }
+}
+
+impl Div for IntKey {
+    type Output = Self;
+    fn div(self, rhs: Self) -> Self::Output {
+        IntKey(self.0 / rhs.0)
+    }
+}
+
+impl Add for IntKey {
+    type Output = Self;
+    fn add(self, rhs: Self) -> Self::Output {
+        IntKey(self.0 + rhs.0)
+    }
+}
+
+impl Sub for IntKey {
+    type Output = Self;
+    fn sub(self, rhs: Self) -> Self::Output {
+        IntKey(self.0 - rhs.0)
+    }
+}
+
+impl Mul for IntKey {
+    type Output = Self;
+    fn mul(self, rhs: Self) -> Self::Output {
+        IntKey(self.0 * rhs.0)
+    }
+}
+
+impl One for IntKey {
+    fn one() -> Self {
+        IntKey(i256::ONE)
+    }
+}
+
+impl Zero for IntKey {
+    fn zero() -> Self {
+        IntKey(i256::ZERO)
+    }
+    fn is_zero(&self) -> bool {
+        self.0 == i256::ZERO
+    }
+}
+
+impl Num for IntKey {
+    type FromStrRadixErr = <i128 as Num>::FromStrRadixErr;
+    fn from_str_radix(str: &str, radix: u32) -> Result<Self, Self::FromStrRadixErr> {
+        Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?)))
+    }
+}
+
+impl PartialEq for LayerRTreeObject {
+    fn eq(&self, other: &Self) -> bool {
+        // FIXME: ptr_eq might fail to return true for 'dyn'
+        // references. Clippy complains about this. In practice it
+        // seems to work; the assertion below would be triggered
+        // otherwise, but this ought to be fixed.
+ #[allow(clippy::vtable_address_comparisons)] + Arc::ptr_eq(&self.layer, &other.layer) + } +} + +impl RTreeObject for LayerRTreeObject { + type Envelope = AABB<[IntKey; 2]>; + fn envelope(&self) -> Self::Envelope { + let key_range = self.layer.get_key_range(); + let lsn_range = self.layer.get_lsn_range(); + AABB::from_corners( + [ + IntKey::from(key_range.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(key_range.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive + ) + } } /// Return value of LayerMap::search @@ -80,19 +235,24 @@ impl LayerMap { // Find the latest image layer that covers the given key let mut latest_img: Option> = None; let mut latest_img_lsn: Option = None; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [IntKey::from(key.to_i128()), IntKey::from(0i128)], + [ + IntKey::from(key.to_i128()), + IntKey::from(end_lsn.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } + assert!(l.get_key_range().contains(&key)); let img_lsn = l.get_lsn_range().start; - - if img_lsn >= end_lsn { - // too new - continue; - } + assert!(img_lsn < end_lsn); if Lsn(img_lsn.0 + 1) == end_lsn { // found exact match return Ok(Some(SearchResult { @@ -108,19 +268,24 @@ impl LayerMap { // Search the delta layers let mut latest_delta: Option> = None; - for l in self.historic_layers.iter() { + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if !l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } - + assert!(l.get_key_range().contains(&key)); if l.get_lsn_range().start >= end_lsn { - // too new - continue; + info!( + "Candidate delta layer {}..{} is too new for lsn {}", + l.get_lsn_range().start, + l.get_lsn_range().end, + end_lsn + ); } - + assert!(l.get_lsn_range().start < end_lsn); if l.get_lsn_range().end >= end_lsn { // this layer contains the requested point in the key/lsn space. // No need to search any further @@ -170,7 +335,10 @@ impl LayerMap { /// Insert an on-disk layer /// pub fn insert_historic(&mut self, layer: Arc) { - self.historic_layers.push(layer); + if layer.get_key_range() == (Key::MIN..Key::MAX) { + self.l0_delta_layers.push(layer.clone()); + } + self.historic_layers.insert(LayerRTreeObject { layer }); NUM_ONDISK_LAYERS.inc(); } @@ -180,17 +348,22 @@ impl LayerMap { /// This should be called when the corresponding file on disk has been deleted. /// pub fn remove_historic(&mut self, layer: Arc) { - let len_before = self.historic_layers.len(); + if layer.get_key_range() == (Key::MIN..Key::MAX) { + let len_before = self.l0_delta_layers.len(); - // FIXME: ptr_eq might fail to return true for 'dyn' - // references. Clippy complains about this. In practice it - // seems to work, the assertion below would be triggered - // otherwise but this ought to be fixed. - #[allow(clippy::vtable_address_comparisons)] - self.historic_layers - .retain(|other| !Arc::ptr_eq(other, &layer)); - - assert_eq!(self.historic_layers.len(), len_before - 1); + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. 
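The envelope above is the heart of the scheme: every layer is indexed by the rectangle spanning [key_start, key_end - 1] x [lsn_start, lsn_end - 1], and lookups become rectangle-intersection queries. A self-contained sketch of the same rstar pattern, shrunk to plain `i64` coordinates purely for brevity (the real code needs the 256-bit `IntKey` for the overflow reasons given in the commit message):

```rust
use rstar::{RTree, RTreeObject, AABB};

// A toy "layer": a rectangle in (key, lsn) space, with inclusive corners.
struct Rect {
    key: (i64, i64),
    lsn: (i64, i64),
}

impl RTreeObject for Rect {
    type Envelope = AABB<[i64; 2]>;

    fn envelope(&self) -> Self::Envelope {
        AABB::from_corners([self.key.0, self.lsn.0], [self.key.1, self.lsn.1])
    }
}

fn main() {
    let mut tree = RTree::new();
    tree.insert(Rect { key: (0, 9), lsn: (0, 99) });
    tree.insert(Rect { key: (100, 109), lsn: (0, 99) });

    // "Which layers can contain key 5 at LSNs up to 50?"
    let query = AABB::from_corners([5, 0], [5, 50]);
    assert_eq!(tree.locate_in_envelope_intersecting(&query).count(), 1);
}
```

`locate_in_envelope_intersecting` is exactly the call the rewritten `search`, `image_coverage`, and `count_deltas` use above and below.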
+ #[allow(clippy::vtable_address_comparisons)] + self.l0_delta_layers + .retain(|other| !Arc::ptr_eq(other, &layer)); + assert_eq!(self.l0_delta_layers.len(), len_before - 1); + } + assert!(self + .historic_layers + .remove(&LayerRTreeObject { layer }) + .is_some()); NUM_ONDISK_LAYERS.dec(); } @@ -207,15 +380,26 @@ impl LayerMap { loop { let mut made_progress = false; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [ + IntKey::from(range_remain.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(range_remain.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } let img_lsn = l.get_lsn_range().start; - if !l.is_incremental() - && l.get_key_range().contains(&range_remain.start) - && lsn_range.contains(&img_lsn) - { + if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) { made_progress = true; let img_key_end = l.get_key_range().end; @@ -232,8 +416,8 @@ impl LayerMap { } } - pub fn iter_historic_layers(&self) -> impl Iterator> { - self.historic_layers.iter() + pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { + self.historic_layers.iter().map(|e| e.layer.clone()) } /// Find the last image layer that covers 'key', ignoring any image layers @@ -241,19 +425,22 @@ impl LayerMap { fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { let mut candidate_lsn = Lsn(0); let mut candidate = None; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [IntKey::from(key.to_i128()), IntKey::from(0)], + [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } - + assert!(l.get_key_range().contains(&key)); let this_lsn = l.get_lsn_range().start; - if this_lsn > lsn { - continue; - } + assert!(this_lsn <= lsn); if this_lsn < candidate_lsn { // our previous candidate was better continue; @@ -279,10 +466,19 @@ impl LayerMap { lsn: Lsn, ) -> Result, Option>)>> { let mut points = vec![key_range.start]; - for l in self.historic_layers.iter() { - if l.get_lsn_range().start > lsn { - continue; - } + let envelope = AABB::from_corners( + [IntKey::from(key_range.start.to_i128()), IntKey::from(0)], + [ + IntKey::from(key_range.end.to_i128()), + IntKey::from(lsn.0 as i128), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; + assert!(l.get_lsn_range().start <= lsn); let range = l.get_key_range(); if key_range.contains(&range.start) { points.push(l.get_key_range().start); @@ -315,16 +511,29 @@ impl LayerMap { /// given key and LSN range. 
pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { let mut result = 0; - for l in self.historic_layers.iter() { + if lsn_range.start >= lsn_range.end { + return Ok(0); + } + let envelope = AABB::from_corners( + [ + IntKey::from(key_range.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(key_range.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if !l.is_incremental() { continue; } - if !range_overlaps(&l.get_lsn_range(), lsn_range) { - continue; - } - if !range_overlaps(&l.get_key_range(), key_range) { - continue; - } + assert!(range_overlaps(&l.get_lsn_range(), lsn_range)); + assert!(range_overlaps(&l.get_key_range(), key_range)); // We ignore level0 delta layers. Unless the whole keyspace fits // into one partition @@ -341,17 +550,7 @@ impl LayerMap { /// Return all L0 delta layers pub fn get_level0_deltas(&self) -> Result>> { - let mut deltas = Vec::new(); - for l in self.historic_layers.iter() { - if !l.is_incremental() { - continue; - } - if l.get_key_range() != (Key::MIN..Key::MAX) { - continue; - } - deltas.push(Arc::clone(l)); - } - Ok(deltas) + Ok(self.l0_delta_layers.clone()) } /// debugging function to print out the contents of the layer map @@ -370,8 +569,8 @@ impl LayerMap { } println!("historic_layers:"); - for layer in self.historic_layers.iter() { - layer.dump(verbose)?; + for e in self.historic_layers.iter() { + e.layer.dump(verbose)?; } println!("End dump LayerMap"); Ok(()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b80d023c7f..6de1d44876 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2050,7 +2050,7 @@ impl Timeline { l.filename().display(), l.is_incremental(), ); - layers_to_remove.push(Arc::clone(l)); + layers_to_remove.push(Arc::clone(&l)); } // Actually delete the layers from disk and remove them from the map. diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py new file mode 100644 index 0000000000..d71fb6d12c --- /dev/null +++ b/test_runner/performance/test_layer_map.py @@ -0,0 +1,39 @@ +import time + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# +# Benchmark searching the layer map, when there are a lot of small layer files. +# +def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): + + env = neon_env_builder.init_start() + n_iters = 10 + n_records = 100000 + + # We want to have a lot of lot of layer files to exercise the layer map. Make + # gc_horizon and checkpoint_distance very small, so that we get a lot of small layer files. 
+ tenant, _ = env.neon_cli.create_tenant( + conf={ + "gc_period": "100 m", + "gc_horizon": "1048576", + "checkpoint_distance": "8192", + "compaction_period": "1 s", + "compaction_threshold": "1", + "compaction_target_size": "8192", + } + ) + + env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant) + pg = env.postgres.create_start("test_layer_map", tenant_id=tenant) + cur = pg.connect().cursor() + cur.execute("create table t(x integer)") + for i in range(n_iters): + cur.execute(f"insert into t values (generate_series(1,{n_records}))") + time.sleep(1) + + cur.execute("vacuum t") + with zenbenchmark.record_duration("test_query"): + cur.execute("SELECT count(*) from t") + assert cur.fetchone() == (n_iters * n_records,) diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 3670ca5fea..f37a42945e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -30,7 +30,7 @@ memchr = { version = "2", features = ["std", "use_std"] } nom = { version = "7", features = ["alloc", "std"] } num-bigint = { version = "0.4", features = ["std"] } num-integer = { version = "0.1", default-features = false, features = ["i128", "std"] } -num-traits = { version = "0.2", features = ["i128", "std"] } +num-traits = { version = "0.2", features = ["i128", "libm", "std"] } prost = { version = "0.10", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } @@ -38,6 +38,7 @@ regex-automata = { version = "0.1", features = ["regex-syntax", "std"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } +stable_deref_trait = { version = "1", features = ["alloc", "std"] } time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } From e764c1e60fd8e7afaf346bc70f0b9269097e8a1a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 22 Sep 2022 01:02:53 +0300 Subject: [PATCH 35/90] remove self argument from several spans --- pageserver/src/page_service.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 9e159f7391..7de6403b83 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -663,7 +663,7 @@ impl PageServerHandler { Ok(lsn) } - #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_rel_exists_request( &self, timeline: &Timeline, @@ -680,7 +680,7 @@ impl PageServerHandler { })) } - #[instrument(skip(timeline, req), fields(rel = %req.rel, 
req_lsn = %req.lsn))]
+    #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))]
     async fn handle_get_nblocks_request(
         &self,
         timeline: &Timeline,
@@ -697,7 +697,7 @@ impl PageServerHandler {
         }))
     }
 
-    #[instrument(skip(timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))]
+    #[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))]
     async fn handle_db_size_request(
         &self,
         timeline: &Timeline,
@@ -717,7 +717,7 @@ impl PageServerHandler {
         }))
     }
 
-    #[instrument(skip(timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))]
+    #[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))]
     async fn handle_get_page_at_lsn_request(
         &self,
         timeline: &Timeline,

From 86bf4919817d34a2e56590596eb5f8270ce8b79e Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova
Date: Wed, 14 Sep 2022 17:09:28 +0300
Subject: [PATCH 36/90] Support pg 15

- Split postgres_ffi into two version-specific files.
- Preserve pg_version in timeline metadata.
- Use pg_version in safekeeper code. Check for postgres major version
  mismatch.
- Clean up the code to use the DEFAULT_PG_VERSION constant everywhere,
  instead of hardcoding.
- Parameterize Python tests: use the DEFAULT_PG_VERSION env variable and
  the pg_version fixture.

To run tests using a specific PostgreSQL version, pass the
DEFAULT_PG_VERSION environment variable:
'DEFAULT_PG_VERSION='15' ./scripts/pytest test_runner/regress'

Currently not all tests pass, because the Rust code relies on the default
version of PostgreSQL in a few places.
---
 control_plane/src/bin/neon_local.rs           |  95 +++++++++++--
 control_plane/src/compute.rs                  |  49 +++++--
 control_plane/src/local_env.rs                |  48 +++++--
 control_plane/src/storage.rs                  |  22 ++-
 libs/postgres_ffi/src/lib.rs                  | 129 +++++++++++++++++-
 libs/postgres_ffi/src/nonrelfile_utils.rs     |   2 +-
 libs/postgres_ffi/src/pg_constants.rs         |  19 +--
 libs/postgres_ffi/src/pg_constants_v14.rs     |   5 +
 libs/postgres_ffi/src/pg_constants_v15.rs     |  10 ++
 libs/postgres_ffi/src/relfile_utils.rs        |  25 ++--
 libs/postgres_ffi/src/waldecoder.rs           |  49 +------
 libs/postgres_ffi/src/xlog_utils.rs           |  38 +++++-
 pageserver/src/basebackup.rs                  |  82 +++++------
 pageserver/src/bin/update_metadata.rs         |   2 +
 pageserver/src/config.rs                      |  45 ++++--
 pageserver/src/http/models.rs                 |   1 +
 pageserver/src/http/routes.rs                 |   1 +
 pageserver/src/import_datadir.rs              |  20 +--
 pageserver/src/lib.rs                         |   2 +
 pageserver/src/page_service.rs                |  31 ++++-
 pageserver/src/pgdatadir_mapping.rs           |  10 +-
 pageserver/src/reltag.rs                      |   6 +-
 pageserver/src/storage_sync.rs                |  12 +-
 pageserver/src/storage_sync/index.rs          |  23 +++-
 pageserver/src/tenant.rs                      |  49 ++++---
 pageserver/src/tenant/metadata.rs             |   9 ++
 pageserver/src/tenant/timeline.rs             |  17 ++-
 pageserver/src/walingest.rs                   |  83 +++++++----
 .../src/walreceiver/connection_manager.rs     |   2 +-
 .../src/walreceiver/walreceiver_connection.rs |   4 +-
 pageserver/src/walrecord.rs                   |  38 ++++--
 pageserver/src/walredo.rs                     |  30 ++--
 safekeeper/src/json_ctrl.rs                   |  11 +-
 safekeeper/src/safekeeper.rs                  |  19 ++-
 safekeeper/src/send_wal.rs                    |   2 +-
 safekeeper/src/wal_backup.rs                  |   3 +-
 safekeeper/src/wal_storage.rs                 |  10 +-
 test_runner/fixtures/neon_fixtures.py         |  30 +++-
 test_runner/regress/test_import.py            |   5 +
 test_runner/regress/test_pg_regress.py        |  18 ++-
 test_runner/regress/test_wal_acceptor.py      |   9 +-
 vendor/postgres-v14                           |   2 +-
 vendor/postgres-v15                           |   2 +-
 43 files changed, 777 insertions(+), 292 deletions(-)
 create mode 100644 libs/postgres_ffi/src/pg_constants_v14.rs
 create mode 100644
libs/postgres_ffi/src/pg_constants_v15.rs diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index e16fd8764a..92782ea235 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -39,6 +39,8 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); +const DEFAULT_PG_VERSION: &str = "14"; + fn default_conf(etcd_binary_path: &Path) -> String { format!( r#" @@ -105,6 +107,13 @@ fn main() -> Result<()> { .takes_value(true) .required(false); + let pg_version_arg = Arg::new("pg-version") + .long("pg-version") + .help("Postgres version to use for the initial tenant") + .required(false) + .takes_value(true) + .default_value(DEFAULT_PG_VERSION); + let port_arg = Arg::new("port") .long("port") .required(false) @@ -146,6 +155,7 @@ fn main() -> Result<()> { .required(false) .value_name("config"), ) + .arg(pg_version_arg.clone()) ) .subcommand( App::new("timeline") @@ -164,7 +174,9 @@ fn main() -> Result<()> { .subcommand(App::new("create") .about("Create a new blank timeline") .arg(tenant_id_arg.clone()) - .arg(branch_name_arg.clone())) + .arg(branch_name_arg.clone()) + .arg(pg_version_arg.clone()) + ) .subcommand(App::new("import") .about("Import timeline from basebackup directory") .arg(tenant_id_arg.clone()) @@ -178,7 +190,9 @@ fn main() -> Result<()> { .arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true) .help("Wal to add after base")) .arg(Arg::new("end-lsn").long("end-lsn").takes_value(true) - .help("Lsn the basebackup ends at"))) + .help("Lsn the basebackup ends at")) + .arg(pg_version_arg.clone()) + ) ).subcommand( App::new("tenant") .setting(AppSettings::ArgRequiredElseHelp) @@ -188,6 +202,7 @@ fn main() -> Result<()> { .arg(tenant_id_arg.clone()) .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) + .arg(pg_version_arg.clone()) ) .subcommand(App::new("config") .arg(tenant_id_arg.clone()) @@ -239,8 +254,9 @@ fn main() -> Result<()> { Arg::new("config-only") .help("Don't do basebackup, create compute node with only config files") .long("config-only") - .required(false) - )) + .required(false)) + .arg(pg_version_arg.clone()) + ) .subcommand(App::new("start") .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") .arg(pg_node_arg.clone()) @@ -248,7 +264,9 @@ fn main() -> Result<()> { .arg(branch_name_arg.clone()) .arg(timeline_id_arg.clone()) .arg(lsn_arg.clone()) - .arg(port_arg.clone())) + .arg(port_arg.clone()) + .arg(pg_version_arg.clone()) + ) .subcommand( App::new("stop") .arg(pg_node_arg.clone()) @@ -501,9 +519,16 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { default_conf(&EtcdBroker::locate_etcd()?) 
};
 
+    let pg_version = init_match
+        .value_of("pg-version")
+        .unwrap()
+        .parse::<u32>()
+        .context("Failed to parse postgres version from the argument string")?;
+
     let mut env =
         LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
-    env.init().context("Failed to initialize neon repository")?;
+    env.init(pg_version)
+        .context("Failed to initialize neon repository")?;
     let initial_tenant_id = env
         .default_tenant_id
         .expect("default_tenant_id should be generated by the `env.init()` call above");
@@ -515,6 +540,7 @@
         Some(initial_tenant_id),
         initial_timeline_id_arg,
         &pageserver_config_overrides(init_match),
+        pg_version,
     )
     .unwrap_or_else(|e| {
         eprintln!("pageserver init failed: {e}");
@@ -557,8 +583,19 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
             // Create an initial timeline for the new tenant
             let new_timeline_id = parse_timeline_id(create_match)?;
-            let timeline_info =
-                pageserver.timeline_create(new_tenant_id, new_timeline_id, None, None)?;
+            let pg_version = create_match
+                .value_of("pg-version")
+                .unwrap()
+                .parse::<u32>()
+                .context("Failed to parse postgres version from the argument string")?;
+
+            let timeline_info = pageserver.timeline_create(
+                new_tenant_id,
+                new_timeline_id,
+                None,
+                None,
+                Some(pg_version),
+            )?;
             let new_timeline_id = timeline_info.timeline_id;
             let last_record_lsn = timeline_info
                 .local
@@ -607,7 +644,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
             let new_branch_name = create_match
                 .value_of("branch-name")
                 .ok_or_else(|| anyhow!("No branch name provided"))?;
-            let timeline_info = pageserver.timeline_create(tenant_id, None, None, None)?;
+
+            let pg_version = create_match
+                .value_of("pg-version")
+                .unwrap()
+                .parse::<u32>()
+                .context("Failed to parse postgres version from the argument string")?;
+
+            let timeline_info =
+                pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
             let new_timeline_id = timeline_info.timeline_id;
 
             let last_record_lsn = timeline_info
@@ -655,7 +700,14 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
             pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal)?;
             println!("Creating node for imported timeline ...");
             env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
-            cplane.new_node(tenant_id, name, timeline_id, None, None)?;
+
+            let pg_version = import_match
+                .value_of("pg-version")
+                .unwrap()
+                .parse::<u32>()
+                .context("Failed to parse postgres version from the argument string")?;
+
+            cplane.new_node(tenant_id, name, timeline_id, None, None, pg_version)?;
             println!("Done");
         }
         Some(("branch", branch_match)) => {
@@ -682,6 +734,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                 None,
                 start_lsn,
                 Some(ancestor_timeline_id),
+                None,
             )?;
             let new_timeline_id = timeline_info.timeline_id;
 
@@ -797,7 +850,14 @@ fn handle_pg(pg_match: &ArgMatches, env:
&local_env::LocalEnv) -> Result<()> { .map(Lsn::from_str) .transpose() .context("Failed to parse Lsn from the request")?; + let pg_version = sub_args + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; // when used with custom port this results in non obvious behaviour // port is remembered from first start command, i e // start --port X // stop // start <-- will also use port X even without explicit port argument println!( - "Starting new postgres {} on timeline {} ...", - node_name, timeline_id + "Starting new postgres (v{}) {} on timeline {} ...", + pg_version, node_name, timeline_id ); - let node = cplane.new_node(tenant_id, node_name, timeline_id, lsn, port)?; + + let node = + cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?; node.start(&auth_token)?; } } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index b678d620df..89994c5647 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -18,7 +18,7 @@ use utils::{ postgres_backend::AuthType, }; -use crate::local_env::LocalEnv; +use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION}; use crate::postgresql_conf::PostgresConf; use crate::storage::PageServerNode; @@ -81,6 +81,7 @@ impl ComputeControlPlane { timeline_id: TimelineId, lsn: Option, port: Option, + pg_version: u32, ) -> Result> { let port = port.unwrap_or_else(|| self.get_port()); let node = Arc::new(PostgresNode { @@ -93,6 +94,7 @@ impl ComputeControlPlane { lsn, tenant_id, uses_wal_proposer: false, + pg_version, }); node.create_pgdata()?; @@ -118,6 +120,7 @@ pub struct PostgresNode { pub lsn: Option, // if it's a read-only node. None for primary pub tenant_id: TenantId, uses_wal_proposer: bool, + pg_version: u32, } impl PostgresNode { @@ -152,6 +155,14 @@ impl PostgresNode { let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); + // Read postgres version from PG_VERSION file to determine which postgres version binary to use. + // If it doesn't exist, assume broken data directory and use default pg version. 
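        // A minimal sketch of the probe implemented below (illustrative only:
        // `pgdata` stands in for entry.path(), and trimming a trailing
        // newline is an assumption about how PG_VERSION files are written):
        //
        //     let pg_version_str = fs::read_to_string(pgdata.join("PG_VERSION"))
        //         .unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string());
        //     let pg_version = u32::from_str(pg_version_str.trim())?;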
+ let pg_version_path = entry.path().join("PG_VERSION"); + + let pg_version_str = + fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string()); + let pg_version = u32::from_str(&pg_version_str)?; + // parse recovery_target_lsn, if any let recovery_target_lsn: Option = conf.parse_field_optional("recovery_target_lsn", &context)?; @@ -167,17 +178,24 @@ impl PostgresNode { lsn: recovery_target_lsn, tenant_id, uses_wal_proposer, + pg_version, }) } - fn sync_safekeepers(&self, auth_token: &Option) -> Result { - let pg_path = self.env.pg_bin_dir().join("postgres"); + fn sync_safekeepers(&self, auth_token: &Option, pg_version: u32) -> Result { + let pg_path = self.env.pg_bin_dir(pg_version).join("postgres"); let mut cmd = Command::new(&pg_path); cmd.arg("--sync-safekeepers") .env_clear() - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) + .env( + "LD_LIBRARY_PATH", + self.env.pg_lib_dir(pg_version).to_str().unwrap(), + ) + .env( + "DYLD_LIBRARY_PATH", + self.env.pg_lib_dir(pg_version).to_str().unwrap(), + ) .env("PGDATA", self.pgdata().to_str().unwrap()) .stdout(Stdio::piped()) // Comment this to avoid capturing stderr (useful if command hangs) @@ -259,8 +277,8 @@ impl PostgresNode { }) } - // Connect to a page server, get base backup, and untar it to initialize a - // new data directory + // Write postgresql.conf with default configuration + // and PG_VERSION file to the data directory of a new node. fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> { let mut conf = PostgresConf::new(); conf.append("max_wal_senders", "10"); @@ -357,6 +375,9 @@ impl PostgresNode { let mut file = File::create(self.pgdata().join("postgresql.conf"))?; file.write_all(conf.to_string().as_bytes())?; + let mut file = File::create(self.pgdata().join("PG_VERSION"))?; + file.write_all(self.pg_version.to_string().as_bytes())?; + Ok(()) } @@ -368,7 +389,7 @@ impl PostgresNode { // latest data from the pageserver. That is a bit clumsy but whole bootstrap // procedure evolves quite actively right now, so let's think about it again // when things would be more stable (TODO). 
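    // For orientation, the version-aware invocation built by sync_safekeepers()
    // below looks roughly like this (a sketch; `env`, `pgdata`, and `pg_version`
    // are assumed inputs, and the paths come from the LocalEnv helpers):
    //
    //     let pg = env.pg_bin_dir(pg_version).join("postgres");
    //     let output = Command::new(pg)
    //         .arg("--sync-safekeepers")
    //         .env_clear()
    //         .env("LD_LIBRARY_PATH", env.pg_lib_dir(pg_version))
    //         .env("PGDATA", pgdata)
    //         .output()?;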
- let lsn = self.sync_safekeepers(auth_token)?; + let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; if lsn == Lsn(0) { None } else { @@ -401,7 +422,7 @@ impl PostgresNode { } fn pg_ctl(&self, args: &[&str], auth_token: &Option) -> Result<()> { - let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl"); + let pg_ctl_path = self.env.pg_bin_dir(self.pg_version).join("pg_ctl"); let mut cmd = Command::new(pg_ctl_path); cmd.args( [ @@ -417,8 +438,14 @@ impl PostgresNode { .concat(), ) .env_clear() - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()); + .env( + "LD_LIBRARY_PATH", + self.env.pg_lib_dir(self.pg_version).to_str().unwrap(), + ) + .env( + "DYLD_LIBRARY_PATH", + self.env.pg_lib_dir(self.pg_version).to_str().unwrap(), + ); if let Some(token) = auth_token { cmd.env("ZENITH_AUTH_TOKEN", token); } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 7afaad26dc..14bb4cf346 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -20,6 +20,8 @@ use utils::{ use crate::safekeeper::SafekeeperNode; +pub const DEFAULT_PG_VERSION: u32 = 14; + // // This data structures represents neon_local CLI config // @@ -195,12 +197,40 @@ impl Default for SafekeeperConf { } impl LocalEnv { - // postgres installation paths - pub fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") + pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { + let mut path = self.pg_distrib_dir.clone(); + + if pg_version != DEFAULT_PG_VERSION { + // step up to the parent directory + // We assume that the pg_distrib subdirs + // for different pg versions + // are located in the same directory + // and follow the naming convention: v14, v15, etc. + path.pop(); + + match pg_version { + 14 => return path.join(format!("v{pg_version}")), + 15 => return path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), + }; + } + + path } - pub fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + + pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("bin"), + 15 => self.pg_distrib_dir(pg_version).join("bin"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } + } + pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("lib"), + 15 => self.pg_distrib_dir(pg_version).join("lib"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } } pub fn pageserver_bin(&self) -> anyhow::Result { @@ -290,6 +320,8 @@ impl LocalEnv { // Find postgres binaries. // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install/v14". + // Note that later in the code we assume, that distrib dirs follow the same pattern + // for all postgres versions. if env.pg_distrib_dir == Path::new("") { if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { env.pg_distrib_dir = postgres_bin.into(); @@ -384,7 +416,7 @@ impl LocalEnv { // // Initialize a new Neon repository // - pub fn init(&mut self) -> anyhow::Result<()> { + pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> { // check if config already exists let base_path = &self.base_data_dir; ensure!( @@ -397,10 +429,10 @@ impl LocalEnv { "directory '{}' already exists. 
Perhaps already initialized?", base_path.display() ); - if !self.pg_distrib_dir.join("bin/postgres").exists() { + if !self.pg_bin_dir(pg_version).join("postgres").exists() { bail!( "Can't find postgres binary at {}", - self.pg_distrib_dir.display() + self.pg_bin_dir(pg_version).display() ); } for binary in ["pageserver", "safekeeper"] { diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 3bbbdc5865..95ade14fbf 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -112,11 +112,15 @@ impl PageServerNode { create_tenant: Option, initial_timeline_id: Option, config_overrides: &[&str], + pg_version: u32, ) -> anyhow::Result { let id = format!("id={}", self.env.pageserver.id); // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. - let pg_distrib_dir_param = - format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display()); + let pg_distrib_dir_param = format!( + "pg_distrib_dir='{}'", + self.env.pg_distrib_dir(pg_version).display() + ); + let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); let listen_http_addr_param = format!( "listen_http_addr='{}'", @@ -159,7 +163,7 @@ impl PageServerNode { self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?; let init_result = self - .try_init_timeline(create_tenant, initial_timeline_id) + .try_init_timeline(create_tenant, initial_timeline_id, pg_version) .context("Failed to create initial tenant and timeline for pageserver"); match &init_result { Ok(initial_timeline_id) => { @@ -175,10 +179,16 @@ impl PageServerNode { &self, new_tenant_id: Option, new_timeline_id: Option, + pg_version: u32, ) -> anyhow::Result { let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; - let initial_timeline_info = - self.timeline_create(initial_tenant_id, new_timeline_id, None, None)?; + let initial_timeline_info = self.timeline_create( + initial_tenant_id, + new_timeline_id, + None, + None, + Some(pg_version), + )?; Ok(initial_timeline_info.timeline_id) } @@ -497,6 +507,7 @@ impl PageServerNode { new_timeline_id: Option, ancestor_start_lsn: Option, ancestor_timeline_id: Option, + pg_version: Option, ) -> anyhow::Result { self.http_request( Method::POST, @@ -506,6 +517,7 @@ impl PageServerNode { new_timeline_id, ancestor_start_lsn, ancestor_timeline_id, + pg_version, }) .send()? .error_from_body()? diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index f43232ed0c..25e1f6029c 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -7,6 +7,8 @@ // https://github.com/rust-lang/rust-bindgen/issues/1651 #![allow(deref_nullptr)] +use bytes::Bytes; +use utils::bin_ser::SerializeError; use utils::lsn::Lsn; macro_rules! postgres_ffi { @@ -24,11 +26,11 @@ macro_rules! postgres_ffi { stringify!($version), ".rs" )); + + include!(concat!("pg_constants_", stringify!($version), ".rs")); } pub mod controlfile_utils; pub mod nonrelfile_utils; - pub mod pg_constants; - pub mod relfile_utils; pub mod waldecoder; pub mod xlog_utils; @@ -44,6 +46,9 @@ macro_rules! 
postgres_ffi { postgres_ffi!(v14); postgres_ffi!(v15); +pub mod pg_constants; +pub mod relfile_utils; + // Export some widely used datatypes that are unlikely to change across Postgres versions pub use v14::bindings::{uint32, uint64, Oid}; pub use v14::bindings::{BlockNumber, OffsetNumber}; @@ -52,8 +57,11 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; +pub use v14::bindings::{PageHeaderData, XLogRecord}; pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +pub use v14::bindings::{CheckPoint, ControlFileData}; + // from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and // --with-segsize=SEGSIZE, but assume the defaults for now. pub const BLCKSZ: u16 = 8192; @@ -63,6 +71,50 @@ pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; +// Export some version independent functions that are used outside of this mod +pub use v14::xlog_utils::encode_logical_message; +pub use v14::xlog_utils::get_current_timestamp; +pub use v14::xlog_utils::to_pg_timestamp; +pub use v14::xlog_utils::XLogFileName; + +pub use v14::bindings::DBState_DB_SHUTDOWNED; + +pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { + if version == 14 { + bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0 + } else { + assert_eq!(version, 15); + bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0 + || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0 + || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0 + } +} + +pub fn generate_wal_segment( + segno: u64, + system_id: u64, + pg_version: u32, +) -> Result { + match pg_version { + 14 => v14::xlog_utils::generate_wal_segment(segno, system_id), + 15 => v15::xlog_utils::generate_wal_segment(segno, system_id), + _ => Err(SerializeError::BadInput), + } +} + +pub fn generate_pg_control( + pg_control_bytes: &[u8], + checkpoint_bytes: &[u8], + lsn: Lsn, + pg_version: u32, +) -> anyhow::Result<(Bytes, u64)> { + match pg_version { + 14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), + 15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), + _ => anyhow::bail!("Unknown version {}", pg_version), + } +} + // PG timeline is always 1, changing it doesn't have any useful meaning in Neon. // // NOTE: this is not to be confused with Neon timelines; different concept! 
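// Taken together, these exports give callers a single version-agnostic entry
// point per operation. A short usage sketch (`segno`, `system_id`, and
// `pg_version` are assumed inputs; names as defined above):
//
//     let seg = postgres_ffi::generate_wal_segment(segno, system_id, pg_version)?;
//     assert_eq!(seg.len(), postgres_ffi::WAL_SEGMENT_SIZE);
//
// For these wrappers an unknown pg_version surfaces as an error rather than a
// panic, so bad metadata fails the one request instead of the whole process.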
@@ -74,7 +126,7 @@ pub const PG_TLI: u32 = 1; // See TransactionIdIsNormal in transam.h pub const fn transaction_id_is_normal(id: TransactionId) -> bool { - id > v14::pg_constants::FIRST_NORMAL_TRANSACTION_ID + id > pg_constants::FIRST_NORMAL_TRANSACTION_ID } // See TransactionIdPrecedes in transam.c @@ -109,3 +161,74 @@ pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) { pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); } + +pub mod waldecoder { + + use crate::{v14, v15}; + use bytes::{Buf, Bytes, BytesMut}; + use std::num::NonZeroU32; + use thiserror::Error; + use utils::lsn::Lsn; + + pub enum State { + WaitingForRecord, + ReassemblingRecord { + recordbuf: BytesMut, + contlen: NonZeroU32, + }, + SkippingEverything { + skip_until_lsn: Lsn, + }, + } + + pub struct WalStreamDecoder { + pub lsn: Lsn, + pub pg_version: u32, + pub inputbuf: BytesMut, + pub state: State, + } + + #[derive(Error, Debug, Clone)] + #[error("{msg} at {lsn}")] + pub struct WalDecodeError { + pub msg: String, + pub lsn: Lsn, + } + + impl WalStreamDecoder { + pub fn new(lsn: Lsn, pg_version: u32) -> WalStreamDecoder { + WalStreamDecoder { + lsn, + pg_version, + inputbuf: BytesMut::new(), + state: State::WaitingForRecord, + } + } + + // The latest LSN position fed to the decoder. + pub fn available(&self) -> Lsn { + self.lsn + self.inputbuf.remaining() as u64 + } + + pub fn feed_bytes(&mut self, buf: &[u8]) { + self.inputbuf.extend_from_slice(buf); + } + + pub fn poll_decode(&mut self) -> Result, WalDecodeError> { + match self.pg_version { + 14 => { + use self::v14::waldecoder::WalStreamDecoderHandler; + self.poll_decode_internal() + } + 15 => { + use self::v15::waldecoder::WalStreamDecoderHandler; + self.poll_decode_internal() + } + _ => Err(WalDecodeError { + msg: format!("Unknown version {}", self.pg_version), + lsn: self.lsn, + }), + } + } + } +} diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs index 1de1d367e0..01e5554b8a 100644 --- a/libs/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -1,7 +1,7 @@ //! //! Common utilities for dealing with PostgreSQL non-relation files. //! -use super::pg_constants; +use crate::pg_constants; use crate::transaction_id_precedes; use bytes::BytesMut; use log::*; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 8cc9fa7af6..6aaa739a69 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -1,14 +1,16 @@ //! //! Misc constants, copied from PostgreSQL headers. //! +//! Only place version-independent constants here. +//! //! TODO: These probably should be auto-generated using bindgen, //! rather than copied by hand. Although on the other hand, it's nice //! to have them all here in one place, and have the ability to add //! comments on them. //! 
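// In practice the split means shared constants keep their old import path,
// while version-dependent bits go through the dispatch helpers in lib.rs --
// a sketch (`bimg_info` and `pg_version` are assumed inputs):
//
//     use postgres_ffi::pg_constants::XLOG_SMGR_CREATE; // version-independent
//     let compressed = postgres_ffi::bkpimage_is_compressed(bimg_info, pg_version);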
-use super::bindings::{PageHeaderData, XLogRecord}; use crate::BLCKSZ; +use crate::{PageHeaderData, XLogRecord}; // // From pg_tablespace_d.h @@ -16,14 +18,6 @@ use crate::BLCKSZ; pub const DEFAULTTABLESPACE_OID: u32 = 1663; pub const GLOBALTABLESPACE_OID: u32 = 1664; -// -// Fork numbers, from relpath.h -// -pub const MAIN_FORKNUM: u8 = 0; -pub const FSM_FORKNUM: u8 = 1; -pub const VISIBILITYMAP_FORKNUM: u8 = 2; -pub const INIT_FORKNUM: u8 = 3; - // From storage_xlog.h pub const XLOG_SMGR_CREATE: u8 = 0x10; pub const XLOG_SMGR_TRUNCATE: u8 = 0x20; @@ -114,7 +108,6 @@ pub const XLOG_NEXTOID: u8 = 0x30; pub const XLOG_SWITCH: u8 = 0x40; pub const XLOG_FPI_FOR_HINT: u8 = 0xA0; pub const XLOG_FPI: u8 = 0xB0; -pub const DB_SHUTDOWNED: u32 = 1; // From multixact.h pub const FIRST_MULTIXACT_ID: u32 = 1; @@ -169,10 +162,6 @@ pub const RM_HEAP_ID: u8 = 10; pub const XLR_INFO_MASK: u8 = 0x0F; pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; -// from dbcommands_xlog.h -pub const XLOG_DBASE_CREATE: u8 = 0x00; -pub const XLOG_DBASE_DROP: u8 = 0x10; - pub const XLOG_TBLSPC_CREATE: u8 = 0x00; pub const XLOG_TBLSPC_DROP: u8 = 0x10; @@ -197,8 +186,6 @@ pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous /* Information stored in bimg_info */ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */ -pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ -pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ /* From transam.h */ pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3; diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs new file mode 100644 index 0000000000..810898ee80 --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -0,0 +1,5 @@ +pub const XLOG_DBASE_CREATE: u8 = 0x00; +pub const XLOG_DBASE_DROP: u8 = 0x10; + +pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ +pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs new file mode 100644 index 0000000000..6fa5eb008c --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -0,0 +1,10 @@ +pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; + +pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00; +pub const XLOG_DBASE_DROP: u8 = 0x20; + +pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ +pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ diff --git a/libs/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs index f3476acc9c..1dc9f367ff 100644 --- a/libs/postgres_ffi/src/relfile_utils.rs +++ b/libs/postgres_ffi/src/relfile_utils.rs @@ -1,10 +1,17 @@ //! //! Common utilities for dealing with PostgreSQL relation files. //! 
-use super::pg_constants; use once_cell::sync::OnceCell; use regex::Regex; +// +// Fork numbers, from relpath.h +// +pub const MAIN_FORKNUM: u8 = 0; +pub const FSM_FORKNUM: u8 = 1; +pub const VISIBILITYMAP_FORKNUM: u8 = 2; +pub const INIT_FORKNUM: u8 = 3; + #[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] pub enum FilePathError { #[error("invalid relation fork name")] @@ -23,10 +30,10 @@ impl From for FilePathError { pub fn forkname_to_number(forkname: Option<&str>) -> Result { match forkname { // "main" is not in filenames, it's implicit if the fork name is not present - None => Ok(pg_constants::MAIN_FORKNUM), - Some("fsm") => Ok(pg_constants::FSM_FORKNUM), - Some("vm") => Ok(pg_constants::VISIBILITYMAP_FORKNUM), - Some("init") => Ok(pg_constants::INIT_FORKNUM), + None => Ok(MAIN_FORKNUM), + Some("fsm") => Ok(FSM_FORKNUM), + Some("vm") => Ok(VISIBILITYMAP_FORKNUM), + Some("init") => Ok(INIT_FORKNUM), Some(_) => Err(FilePathError::InvalidForkName), } } @@ -34,10 +41,10 @@ pub fn forkname_to_number(forkname: Option<&str>) -> Result { /// Convert Postgres fork number to the right suffix of the relation data file. pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> { match forknum { - pg_constants::MAIN_FORKNUM => None, - pg_constants::FSM_FORKNUM => Some("fsm"), - pg_constants::VISIBILITYMAP_FORKNUM => Some("vm"), - pg_constants::INIT_FORKNUM => Some("init"), + MAIN_FORKNUM => None, + FSM_FORKNUM => Some("fsm"), + VISIBILITYMAP_FORKNUM => Some("vm"), + INIT_FORKNUM => Some("init"), _ => Some("UNKNOWN FORKNUM"), } } diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index 4d79e4b1d1..5b46d52321 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -8,6 +8,7 @@ //! to look deeper into the WAL records to also understand which blocks they modify, the code //! for that is in pageserver/src/walrecord.rs //! +use super::super::waldecoder::{State, WalDecodeError, WalStreamDecoder}; use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; use super::xlog_utils::*; use crate::WAL_SEGMENT_SIZE; @@ -16,55 +17,19 @@ use crc32c::*; use log::*; use std::cmp::min; use std::num::NonZeroU32; -use thiserror::Error; use utils::lsn::Lsn; -enum State { - WaitingForRecord, - ReassemblingRecord { - recordbuf: BytesMut, - contlen: NonZeroU32, - }, - SkippingEverything { - skip_until_lsn: Lsn, - }, -} - -pub struct WalStreamDecoder { - lsn: Lsn, - inputbuf: BytesMut, - state: State, -} - -#[derive(Error, Debug, Clone)] -#[error("{msg} at {lsn}")] -pub struct WalDecodeError { - msg: String, - lsn: Lsn, +pub trait WalStreamDecoderHandler { + fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError>; + fn poll_decode_internal(&mut self) -> Result, WalDecodeError>; + fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError>; } // // WalRecordStream is a Stream that returns a stream of WAL records // FIXME: This isn't a proper rust stream // -impl WalStreamDecoder { - pub fn new(lsn: Lsn) -> WalStreamDecoder { - WalStreamDecoder { - lsn, - inputbuf: BytesMut::new(), - state: State::WaitingForRecord, - } - } - - // The latest LSN position fed to the decoder. 
- pub fn available(&self) -> Lsn { - self.lsn + self.inputbuf.remaining() as u64 - } - - pub fn feed_bytes(&mut self, buf: &[u8]) { - self.inputbuf.extend_from_slice(buf); - } - +impl WalStreamDecoderHandler for WalStreamDecoder { fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> { let validate_impl = || { if hdr.xlp_magic != XLOG_PAGE_MAGIC as u16 { @@ -125,7 +90,7 @@ impl WalStreamDecoder { /// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function /// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid. /// - pub fn poll_decode(&mut self) -> Result, WalDecodeError> { + fn poll_decode_internal(&mut self) -> Result, WalDecodeError> { // Run state machine that validates page headers, and reassembles records // that cross page boundaries. loop { diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index f8606b6e47..8389a6e971 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -9,12 +9,13 @@ use crc32c::crc32c_append; +use super::super::waldecoder::WalStreamDecoder; use super::bindings::{ - CheckPoint, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData, - XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, + CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz, + XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, }; -use super::pg_constants; -use super::waldecoder::WalStreamDecoder; +use super::PG_MAJORVERSION; +use crate::pg_constants; use crate::PG_TLI; use crate::{uint32, uint64, Oid}; use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; @@ -113,6 +114,30 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { } } +pub fn generate_pg_control( + pg_control_bytes: &[u8], + checkpoint_bytes: &[u8], + lsn: Lsn, +) -> anyhow::Result<(Bytes, u64)> { + let mut pg_control = ControlFileData::decode(pg_control_bytes)?; + let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?; + + // Generate new pg_control needed for bootstrap + checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; + + //reset some fields we don't want to preserve + //TODO Check this. + //We may need to determine the value from twophase data. 
+ checkpoint.oldestActiveXid = 0; + + //save new values in pg_control + pg_control.checkPoint = 0; + pg_control.checkPointCopy = checkpoint; + pg_control.state = DBState_DB_SHUTDOWNED; + + Ok((pg_control.encode(), pg_control.system_identifier)) +} + pub fn get_current_timestamp() -> TimestampTz { to_pg_timestamp(SystemTime::now()) } @@ -144,7 +169,10 @@ pub fn find_end_of_wal( let mut result = start_lsn; let mut curr_lsn = start_lsn; let mut buf = [0u8; XLOG_BLCKSZ]; - let mut decoder = WalStreamDecoder::new(start_lsn); + let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + info!("find_end_of_wal PG_VERSION: {}", pg_version); + + let mut decoder = WalStreamDecoder::new(start_lsn, pg_version); // loop over segments loop { diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index eca6a3c87f..d0a57a473b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -25,10 +25,10 @@ use tracing::*; use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; -use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName}; -use postgres_ffi::v14::{CheckPoint, ControlFileData}; +use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; +use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; use postgres_ffi::TransactionId; +use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; @@ -129,15 +129,15 @@ where // TODO include checksum // Create pgdata subdirs structure - for dir in pg_constants::PGDATA_SUBDIRS.iter() { + for dir in PGDATA_SUBDIRS.iter() { let header = new_tar_header_dir(*dir)?; self.ar.append(&header, &mut io::empty())?; } // Send empty config files. 
- for filepath in pg_constants::PGDATA_SPECIAL_FILES.iter() { + for filepath in PGDATA_SPECIAL_FILES.iter() { if *filepath == "pg_hba.conf" { - let data = pg_constants::PG_HBA.as_bytes(); + let data = PG_HBA.as_bytes(); let header = new_tar_header(filepath, data.len() as u64)?; self.ar.append(&header, data)?; } else { @@ -267,16 +267,12 @@ where None }; - // TODO pass this as a parameter - let pg_version = "14"; + if spcnode == GLOBALTABLESPACE_OID { + let pg_version_str = self.timeline.pg_version.to_string(); + let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; + self.ar.append(&header, pg_version_str.as_bytes())?; - if spcnode == pg_constants::GLOBALTABLESPACE_OID { - let version_bytes = pg_version.as_bytes(); - let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; - - let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; + info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace @@ -305,7 +301,7 @@ where return Ok(()); } // User defined tablespaces are not supported - ensure!(spcnode == pg_constants::DEFAULTTABLESPACE_OID); + ensure!(spcnode == DEFAULTTABLESPACE_OID); // Append dir path for each database let path = format!("base/{}", dbnode); @@ -314,9 +310,10 @@ where if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); - let version_bytes = pg_version.as_bytes(); - let header = new_tar_header(&dst_path, version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; + + let pg_version_str = self.timeline.pg_version.to_string(); + let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; + self.ar.append(&header, pg_version_str.as_bytes())?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; @@ -348,30 +345,6 @@ where // Also send zenith.signal file with extra bootstrap data. // fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn) - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn) - .context("failed get control bytes")?; - let mut pg_control = ControlFileData::decode(&pg_control_bytes)?; - let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - - // Generate new pg_control needed for bootstrap - checkpoint.redo = normalize_lsn(self.lsn, WAL_SEGMENT_SIZE).0; - - //reset some fields we don't want to preserve - //TODO Check this. - //We may need to determine the value from twophase data. 
- checkpoint.oldestActiveXid = 0; - - //save new values in pg_control - pg_control.checkPoint = 0; - pg_control.checkPointCopy = checkpoint; - pg_control.state = pg_constants::DB_SHUTDOWNED; - // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -388,8 +361,23 @@ where zenith_signal.as_bytes(), )?; + let checkpoint_bytes = self + .timeline + .get_checkpoint(self.lsn) + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = self + .timeline + .get_control_file(self.lsn) + .context("failed get control bytes")?; + + let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( + &pg_control_bytes, + &checkpoint_bytes, + self.lsn, + self.timeline.pg_version, + )?; + //send pg_control - let pg_control_bytes = pg_control.encode(); let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; self.ar.append(&header, &pg_control_bytes[..])?; @@ -398,8 +386,10 @@ where let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE); let wal_file_path = format!("pg_wal/{}", wal_file_name); let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?; - let wal_seg = generate_wal_segment(segno, pg_control.system_identifier) - .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; + + let wal_seg = + postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version) + .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..])?; Ok(()) diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index 16359c2532..e66049c457 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -50,6 +50,7 @@ fn main() -> Result<()> { meta.ancestor_lsn(), meta.latest_gc_cutoff_lsn(), meta.initdb_lsn(), + meta.pg_version(), ); update_meta = true; } @@ -62,6 +63,7 @@ fn main() -> Result<()> { meta.ancestor_lsn(), meta.latest_gc_cutoff_lsn(), meta.initdb_lsn(), + meta.pg_version(), ); update_meta = true; } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 945ee098ea..a4346c0190 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -21,6 +21,7 @@ use utils::{ use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::DEFAULT_PG_VERSION; /// The name of the metadata file pageserver creates per timeline. pub const METADATA_FILE_NAME: &str = "metadata"; @@ -209,7 +210,7 @@ impl Default for PageServerConfigBuilder { workdir: Set(PathBuf::new()), pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") - .join("pg_install/v14")), + .join(format!("pg_install/v{}", DEFAULT_PG_VERSION))), auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), @@ -374,13 +375,40 @@ impl PageServerConf { // // Postgres distribution paths // + pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { + let mut path = self.pg_distrib_dir.clone(); - pub fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") + if pg_version != DEFAULT_PG_VERSION { + // step up to the parent directory + // We assume that the pg_distrib subdirs + // for different pg versions + // are located in the same directory + // and follow the naming convention: v14, v15, etc. 
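        // The directory layout this assumes, sketched with example paths
        // (only the v14/v15 naming convention is mandated by the code):
        //
        //     pg_install/v14/{bin,lib}   <- self.pg_distrib_dir points here
        //     pg_install/v15/{bin,lib}   <- reached by pop() + join("v15")
        //
        //     // e.g. (hypothetical values):
        //     // conf.pg_distrib_dir(15) == PathBuf::from("pg_install/v15")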
+ path.pop(); + + match pg_version { + 14 => return path.join(format!("v{pg_version}")), + 15 => return path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), + }; + } + + path } - pub fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("bin"), + 15 => self.pg_distrib_dir(pg_version).join("bin"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } + } + pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("lib"), + 15 => self.pg_distrib_dir(pg_version).join("lib"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } } /// Parse a configuration file (pageserver.toml) into a PageServerConf struct, @@ -449,10 +477,11 @@ impl PageServerConf { ); } - if !conf.pg_distrib_dir.join("bin/postgres").exists() { + let pg_version = DEFAULT_PG_VERSION; + if !conf.pg_bin_dir(pg_version).join("postgres").exists() { bail!( "Can't find postgres binary at {}", - conf.pg_distrib_dir.display() + conf.pg_bin_dir(pg_version).display() ); } @@ -863,7 +892,7 @@ broker_endpoints = ['{broker_endpoint}'] let workdir = tempdir_path.join("workdir"); fs::create_dir_all(&workdir)?; - let pg_distrib_dir = tempdir_path.join("pg_distrib"); + let pg_distrib_dir = tempdir_path.join(format!("pg_distrib/v{DEFAULT_PG_VERSION}")); fs::create_dir_all(&pg_distrib_dir)?; let postgres_bin_dir = pg_distrib_dir.join("bin"); fs::create_dir_all(&postgres_bin_dir)?; diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 2d7d560d2a..851fa881a0 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -21,6 +21,7 @@ pub struct TimelineCreateRequest { #[serde(default)] #[serde_as(as = "Option")] pub ancestor_start_lsn: Option, + pub pg_version: Option, } #[serde_as] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c676dfacd2..6892c0b391 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -173,6 +173,7 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index c1e736d552..23c4351b4e 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -16,11 +16,13 @@ use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; -use postgres_ffi::v14::relfile_utils::*; -use postgres_ffi::v14::waldecoder::*; -use postgres_ffi::v14::xlog_utils::*; -use postgres_ffi::v14::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::*; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::ControlFileData; +use postgres_ffi::DBState_DB_SHUTDOWNED; use postgres_ffi::Oid; +use postgres_ffi::XLogFileName; use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; @@ -236,7 +238,7 @@ fn import_slru( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. 
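// The decode loop inside import_wal below follows this general shape (a
// sketch; `buf` is an assumed chunk of bytes read from a WAL segment file):
//
//     let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
//     waldecoder.feed_bytes(&buf);
//     while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
//         // each item is one complete WAL record ending at `lsn`
//     }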
fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> { - let mut waldecoder = WalStreamDecoder::new(startpoint); + let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); @@ -354,7 +356,7 @@ pub fn import_wal_from_tar( end_lsn: Lsn, ) -> Result<()> { // Set up walingest mutable state - let mut waldecoder = WalStreamDecoder::new(start_lsn); + let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version); let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; @@ -439,7 +441,7 @@ fn import_file( len: usize, ) -> Result> { if file_path.starts_with("global") { - let spcnode = pg_constants::GLOBALTABLESPACE_OID; + let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; let dbnode = 0; match file_path @@ -467,7 +469,7 @@ fn import_file( debug!("imported relmap file") } "PG_VERSION" => { - debug!("ignored"); + debug!("ignored PG_VERSION file"); } _ => { import_rel(modification, file_path, spcnode, dbnode, reader, len)?; @@ -495,7 +497,7 @@ fn import_file( debug!("imported relmap file") } "PG_VERSION" => { - debug!("ignored"); + debug!("ignored PG_VERSION file"); } _ => { import_rel(modification, file_path, spcnode, dbnode, reader, len)?; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index e918a39457..0bd5e242d3 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -36,6 +36,8 @@ use crate::task_mgr::TaskKind; /// format, bump this! pub const STORAGE_FORMAT_VERSION: u16 = 3; +pub const DEFAULT_PG_VERSION: u32 = 14; + // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; pub const DELTA_FILE_MAGIC: u16 = 0x5A61; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7de6403b83..fed5d0dcc4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -43,9 +43,9 @@ use crate::task_mgr::TaskKind; use crate::tenant::Timeline; use crate::tenant_mgr; use crate::CheckpointConfig; -use postgres_ffi::v14::xlog_utils::to_pg_timestamp; -use postgres_ffi::v14::pg_constants::DEFAULTTABLESPACE_OID; +use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; +use postgres_ffi::to_pg_timestamp; use postgres_ffi::BLCKSZ; // Wrapped in libpq CopyData @@ -498,12 +498,16 @@ impl PageServerHandler { timeline_id: TimelineId, base_lsn: Lsn, _end_lsn: Lsn, + pg_version: u32, ) -> anyhow::Result<()> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let timeline = tenant_mgr::get_tenant(tenant_id, true)? - .create_empty_timeline(timeline_id, base_lsn)?; + let timeline = tenant_mgr::get_tenant(tenant_id, true)?.create_empty_timeline( + timeline_id, + base_lsn, + pg_version, + )?; // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -958,16 +962,31 @@ impl postgres_backend_async::Handler for PageServerHandler { // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN" let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!(params.len() == 4); + ensure!(params.len() >= 4); let tenant_id = TenantId::from_str(params[0])?; let timeline_id = TimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; + let pg_version = if params.len() == 5 { + u32::from_str(params[4])? + } else { + // If version is not provided, assume default. + // TODO: this may lead to weird errors if the version is wrong. + crate::DEFAULT_PG_VERSION + }; + self.check_permission(Some(tenant_id))?; match self - .handle_import_basebackup(pgb, tenant_id, timeline_id, base_lsn, end_lsn) + .handle_import_basebackup( + pgb, + tenant_id, + timeline_id, + base_lsn, + end_lsn, + pg_version, + ) .await { Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 9d4b438dc4..fc9867dc05 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -13,7 +13,7 @@ use crate::tenant::Timeline; use crate::walrecord::NeonWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; -use postgres_ffi::v14::pg_constants; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; @@ -125,8 +125,7 @@ impl Timeline { return Ok(nblocks); } - if (tag.forknum == pg_constants::FSM_FORKNUM - || tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM) + if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) && !self.get_rel_exists(tag, lsn, latest)? 
{ // FIXME: Postgres sometimes calls smgrcreate() to create @@ -1090,6 +1089,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); // 03 misc // controlfile // checkpoint +// pg_version // // Below is a full list of the keyspace allocation: // @@ -1128,7 +1128,6 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); // // Checkpoint: // 03 00000000 00000000 00000000 00 00000001 - //-- Section 01: relation data and metadata const DBDIR_KEY: Key = Key { @@ -1402,8 +1401,9 @@ fn is_slru_block_key(key: Key) -> bool { pub fn create_test_timeline( tenant: &crate::tenant::Tenant, timeline_id: utils::id::TimelineId, + pg_version: u32, ) -> Result> { - let tline = tenant.create_empty_timeline(timeline_id, Lsn(8))?; + let tline = tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version)?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; diff --git a/pageserver/src/reltag.rs b/pageserver/src/reltag.rs index e3d08f8b3d..43d38bd986 100644 --- a/pageserver/src/reltag.rs +++ b/pageserver/src/reltag.rs @@ -2,8 +2,8 @@ use serde::{Deserialize, Serialize}; use std::cmp::Ordering; use std::fmt; -use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::relfile_utils::forknumber_to_name; +use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; +use postgres_ffi::relfile_utils::forknumber_to_name; use postgres_ffi::Oid; /// @@ -78,7 +78,7 @@ impl fmt::Display for RelTag { impl RelTag { pub fn to_segfile_name(&self, segno: u32) -> String { - let mut name = if self.spcnode == pg_constants::GLOBALTABLESPACE_OID { + let mut name = if self.spcnode == GLOBALTABLESPACE_OID { "global/".to_string() } else { format!("base/{}/", self.dbnode) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 489d0ad4ed..892a34a76f 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -1445,7 +1445,17 @@ mod test_utils { } pub(super) fn dummy_metadata(disk_consistent_lsn: Lsn) -> TimelineMetadata { - TimelineMetadata::new(disk_consistent_lsn, None, None, Lsn(0), Lsn(0), Lsn(0)) + TimelineMetadata::new( + disk_consistent_lsn, + None, + None, + Lsn(0), + Lsn(0), + Lsn(0), + // Any version will do + // but it should be consistent with the one in the tests + crate::DEFAULT_PG_VERSION, + ) } } diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 13495ffefe..db37c7b411 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -341,13 +341,21 @@ mod tests { use super::*; use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::DEFAULT_PG_VERSION; #[test] fn index_part_conversion() { let harness = TenantHarness::create("index_part_conversion").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); - let metadata = - TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); + let metadata = TimelineMetadata::new( + Lsn(5).align(), + Some(Lsn(4)), + None, + Lsn(3), + Lsn(2), + Lsn(1), + DEFAULT_PG_VERSION, + ); let remote_timeline = RemoteTimeline { timeline_layers: HashSet::from([ timeline_path.join("layer_1"), @@ -464,8 +472,15 @@ mod tests { fn index_part_conversion_negatives() { let harness = TenantHarness::create("index_part_conversion_negatives").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); - let metadata = - TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); + let metadata = TimelineMetadata::new( + Lsn(5).align(), + 
Some(Lsn(4)), + None, + Lsn(3), + Lsn(2), + Lsn(1), + DEFAULT_PG_VERSION, + ); let conversion_result = IndexPart::from_remote_timeline( &timeline_path, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ca97796870..5860e13534 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -171,6 +171,7 @@ impl Tenant { &self, new_timeline_id: TimelineId, initdb_lsn: Lsn, + pg_version: u32, ) -> Result> { // XXX: keep the lock to avoid races during timeline creation let mut timelines = self.timelines.lock().unwrap(); @@ -186,7 +187,7 @@ impl Tenant { } let new_metadata = - TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); + TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn, pg_version,); let new_timeline = self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?; new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); @@ -387,6 +388,11 @@ impl Tenant { let mut timelines_accessor = self.timelines.lock().unwrap(); for (timeline_id, metadata) in sorted_timelines { + info!( + "Attaching timeline {} pg_version {}", + timeline_id, + metadata.pg_version() + ); let timeline = self .initialize_new_timeline(timeline_id, metadata, &mut timelines_accessor) .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; @@ -613,7 +619,7 @@ impl Tenant { }; let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn(); - + let pg_version = new_metadata.pg_version(); let new_timeline = Arc::new(Timeline::new( self.conf, Arc::clone(&self.tenant_conf), @@ -623,6 +629,7 @@ impl Tenant { self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, + pg_version, )); new_timeline @@ -984,6 +991,7 @@ impl Tenant { start_lsn, *src_timeline.latest_gc_cutoff_lsn.read(), src_timeline.initdb_lsn, + src_timeline.pg_version, ); let new_timeline = self.create_initialized_timeline(dst, metadata, &mut timelines)?; info!("branched timeline {dst} from {src} at {start_lsn}"); @@ -1319,6 +1327,7 @@ pub mod harness { lsn: Lsn, base_img: Option, records: Vec<(Lsn, NeonWalRecord)>, + _pg_version: u32, ) -> Result { let s = format!( "redo for {} to get to {}, with {} and {} records", @@ -1345,6 +1354,7 @@ mod tests { use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; use crate::tenant::harness::*; + use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; use once_cell::sync::Lazy; @@ -1356,7 +1366,7 @@ mod tests { #[test] fn test_basic() -> Result<()> { let tenant = TenantHarness::create("test_basic")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1378,9 +1388,9 @@ mod tests { #[test] fn no_duplicate_timelines() -> Result<()> { let tenant = TenantHarness::create("no_duplicate_timelines")?.load(); - let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; - match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0)) { + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -1404,7 +1414,7 @@ mod tests { #[test] fn test_branch() -> Result<()> { let tenant = TenantHarness::create("test_branch")?.load(); - let tline = 
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let writer = tline.writer(); use std::str::from_utf8; @@ -1499,7 +1509,7 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? .load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -1529,7 +1539,7 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), @@ -1555,7 +1565,7 @@ mod tests { RepoHarness::create("test_prohibit_get_for_garbage_collected_data")? .load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; @@ -1573,7 +1583,7 @@ mod tests { fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1590,7 +1600,7 @@ mod tests { fn test_parent_keeps_data_forever_after_branching() -> Result<()> { let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1618,7 +1628,8 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; { let tenant = harness.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x8000))?; tline.checkpoint(CheckpointConfig::Forced)?; } @@ -1638,7 +1649,7 @@ mod tests { // create two timelines { let tenant = harness.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tline.checkpoint(CheckpointConfig::Forced)?; @@ -1674,7 +1685,7 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; let tenant = harness.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -1711,7 +1722,7 @@ mod tests { #[test] fn test_images() -> Result<()> { let tenant = 
TenantHarness::create("test_images")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1761,7 +1772,7 @@ mod tests { #[test] fn test_bulk_insert() -> Result<()> { let tenant = TenantHarness::create("test_bulk_insert")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let mut lsn = Lsn(0x10); @@ -1801,7 +1812,7 @@ mod tests { #[test] fn test_random_updates() -> Result<()> { let tenant = TenantHarness::create("test_random_updates")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; const NUM_KEYS: usize = 1000; @@ -1871,7 +1882,7 @@ mod tests { #[test] fn test_traverse_branches() -> Result<()> { let tenant = TenantHarness::create("test_traverse_branches")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; const NUM_KEYS: usize = 1000; @@ -1950,7 +1961,7 @@ mod tests { #[test] fn test_traverse_ancestors() -> Result<()> { let tenant = TenantHarness::create("test_traverse_ancestors")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 606acbf2f1..41790b4d11 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -63,6 +63,7 @@ struct TimelineMetadataBody { ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, + pg_version: u32, } impl TimelineMetadata { @@ -73,6 +74,7 @@ impl TimelineMetadata { ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, + pg_version: u32, ) -> Self { Self { hdr: TimelineMetadataHeader { @@ -87,6 +89,7 @@ impl TimelineMetadata { ancestor_lsn, latest_gc_cutoff_lsn, initdb_lsn, + pg_version, }, } } @@ -160,6 +163,10 @@ impl TimelineMetadata { pub fn initdb_lsn(&self) -> Lsn { self.body.initdb_lsn } + + pub fn pg_version(&self) -> u32 { + self.body.pg_version + } } /// Save timeline metadata to file @@ -212,6 +219,8 @@ mod tests { Lsn(0), Lsn(0), Lsn(0), + // Any version will do here, so use the default + crate::DEFAULT_PG_VERSION, ); let metadata_bytes = original_metadata diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6de1d44876..019de81d64 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -37,7 +37,7 @@ use crate::pgdatadir_mapping::LsnForTimestamp; use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; -use postgres_ffi::v14::xlog_utils::to_pg_timestamp; +use postgres_ffi::to_pg_timestamp; use utils::{ id::{TenantId, TimelineId}, lsn::{AtomicLsn, Lsn, RecordLsn}, @@ -61,6 +61,8 @@ pub struct Timeline { pub tenant_id: TenantId, pub timeline_id: TimelineId, + pub pg_version: u32, + pub layers: RwLock, last_freeze_at: AtomicLsn, @@ -533,6 +535,7 @@ impl Timeline { tenant_id: TenantId, walredo_mgr: Arc, upload_layers: bool, + pg_version: u32, ) -> Timeline { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ 
-541,6 +544,7 @@ impl Timeline { tenant_conf, timeline_id, tenant_id, + pg_version, layers: RwLock::new(LayerMap::default()), walredo_mgr, @@ -1260,6 +1264,7 @@ impl Timeline { self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), self.initdb_lsn, + self.pg_version, ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -2133,9 +2138,13 @@ impl Timeline { let last_rec_lsn = data.records.last().unwrap().0; - let img = - self.walredo_mgr - .request_redo(key, request_lsn, base_img, data.records)?; + let img = self.walredo_mgr.request_redo( + key, + request_lsn, + base_img, + data.records, + self.pg_version, + )?; if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index bede4ac13e..1d5cab38b9 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -34,8 +34,9 @@ use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walrecord::*; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; -use postgres_ffi::v14::pg_constants; use postgres_ffi::v14::xlog_utils::*; use postgres_ffi::v14::CheckPoint; use postgres_ffi::TransactionId; @@ -82,7 +83,8 @@ impl<'a> WalIngest<'a> { decoded: &mut DecodedWALRecord, ) -> Result<()> { modification.lsn = lsn; - decode_wal_record(recdata, decoded).context("failed decoding wal record")?; + decode_wal_record(recdata, decoded, self.timeline.pg_version) + .context("failed decoding wal record")?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -113,18 +115,49 @@ impl<'a> WalIngest<'a> { let truncate = XlSmgrTruncate::decode(&mut buf); self.ingest_xlog_smgr_truncate(modification, &truncate)?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { - if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == pg_constants::XLOG_DBASE_CREATE - { - let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb)?; - } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == pg_constants::XLOG_DBASE_DROP - { - let dropdb = XlDropDatabase::decode(&mut buf); - for tablespace_id in dropdb.tablespace_ids { - trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + debug!( + "handle RM_DBASE_ID for Postgres version {:?}", + self.timeline.pg_version + ); + if self.timeline.pg_version == 14 { + if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE + { + let createdb = XlCreateDatabase::decode(&mut buf); + debug!("XLOG_DBASE_CREATE v14"); + + self.ingest_xlog_dbase_create(modification, &createdb)?; + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v14::bindings::XLOG_DBASE_DROP + { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + } + } + } else if self.timeline.pg_version == 15 { + if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG + { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY + { + // The XLOG 
record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb)?; + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_DROP + { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + } } } } else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID { @@ -291,7 +324,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0 + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version) { // Extract page image from FPI record let img_len = blk.bimg_len as usize; @@ -392,7 +425,7 @@ impl<'a> WalIngest<'a> { // Clear the VM bits if required. if new_heap_blkno.is_some() || old_heap_blkno.is_some() { let vm_rel = RelTag { - forknum: pg_constants::VISIBILITYMAP_FORKNUM, + forknum: VISIBILITYMAP_FORKNUM, spcnode: decoded.blocks[0].rnode_spcnode, dbnode: decoded.blocks[0].rnode_dbnode, relnode: decoded.blocks[0].rnode_relnode, @@ -568,7 +601,7 @@ impl<'a> WalIngest<'a> { spcnode, dbnode, relnode, - forknum: pg_constants::MAIN_FORKNUM, + forknum: MAIN_FORKNUM, }; self.put_rel_truncation(modification, rel, rec.blkno)?; } @@ -577,7 +610,7 @@ impl<'a> WalIngest<'a> { spcnode, dbnode, relnode, - forknum: pg_constants::FSM_FORKNUM, + forknum: FSM_FORKNUM, }; // FIXME: 'blkno' stored in the WAL record is the new size of the @@ -600,7 +633,7 @@ impl<'a> WalIngest<'a> { spcnode, dbnode, relnode, - forknum: pg_constants::VISIBILITYMAP_FORKNUM, + forknum: VISIBILITYMAP_FORKNUM, }; // FIXME: Like with the FSM above, the logic to truncate the VM @@ -672,7 +705,7 @@ impl<'a> WalIngest<'a> { )?; for xnode in &parsed.xnodes { - for forknum in pg_constants::MAIN_FORKNUM..=pg_constants::VISIBILITYMAP_FORKNUM { + for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM { let rel = RelTag { forknum, spcnode: xnode.spcnode, @@ -1032,6 +1065,8 @@ mod tests { use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; + use crate::DEFAULT_PG_VERSION; + /// Arbitrary relation tag, for testing. 
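
The RM_DBASE_ID branch above is the first place where ingest logic forks on the timeline's Postgres major version: v15 renumbered the DBASE resource manager's info bits and split database creation into FILE_COPY and WAL_LOG strategies, and only the former needs special handling on the pageserver side. Reduced to a self-contained sketch, the dispatch looks roughly like this; the constants are stand-ins for the generated postgres_ffi::{v14,v15} bindings, and the values shown here are assumptions for illustration, not copied from those bindings:

    // Stand-in constants; the real ones come from the per-version bindings.
    const XLR_RMGR_INFO_MASK: u8 = 0xF0;
    mod v14 {
        pub const XLOG_DBASE_CREATE: u8 = 0x00;
        pub const XLOG_DBASE_DROP: u8 = 0x10;
    }
    mod v15 {
        pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
        pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
        pub const XLOG_DBASE_DROP: u8 = 0x20;
    }

    #[derive(Debug, PartialEq)]
    enum DbaseOp {
        /// Replay by copying template files (v14 CREATE, v15 CREATE_FILE_COPY).
        CreateFileCopy,
        /// v15-only strategy; the contents arrive as ordinary WAL, so it is a no-op here.
        CreateWalLog,
        Drop,
        Unrecognized,
    }

    fn classify_dbase_record(xl_info: u8, pg_version: u32) -> DbaseOp {
        let info = xl_info & XLR_RMGR_INFO_MASK;
        match (pg_version, info) {
            (14, v14::XLOG_DBASE_CREATE) => DbaseOp::CreateFileCopy,
            (14, v14::XLOG_DBASE_DROP) => DbaseOp::Drop,
            (15, v15::XLOG_DBASE_CREATE_FILE_COPY) => DbaseOp::CreateFileCopy,
            (15, v15::XLOG_DBASE_CREATE_WAL_LOG) => DbaseOp::CreateWalLog,
            (15, v15::XLOG_DBASE_DROP) => DbaseOp::Drop,
            _ => DbaseOp::Unrecognized,
        }
    }

The fact that the on-disk record format of XlCreateDatabase did not change between the v14 CREATE and v15 CREATE_FILE_COPY variants is what allows the decode path above to be shared between the two versions.
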
const TESTREL_A: RelTag = RelTag { spcnode: 0, @@ -1059,7 +1094,7 @@ mod tests { #[test] fn test_relsize() -> Result<()> { let tenant = TenantHarness::create("test_relsize")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1187,7 +1222,7 @@ mod tests { #[test] fn test_drop_extend() -> Result<()> { let tenant = TenantHarness::create("test_drop_extend")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1227,7 +1262,7 @@ mod tests { #[test] fn test_truncate_extend() -> Result<()> { let tenant = TenantHarness::create("test_truncate_extend")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; // Create a 20 MB relation (the size is arbitrary) @@ -1315,7 +1350,7 @@ mod tests { #[test] fn test_large_rel() -> Result<()> { let tenant = TenantHarness::create("test_large_rel")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 148372c9d0..a82e69e5ba 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -1366,7 +1366,7 @@ mod tests { }, timeline: harness .load() - .create_empty_timeline(TIMELINE_ID, Lsn(0)) + .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION) .expect("Failed to create an empty timeline for dummy wal connection manager"), wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 29c4cea882..5ac9a3ef7a 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -29,7 +29,7 @@ use crate::{ walingest::WalIngest, walrecord::DecodedWALRecord, }; -use postgres_ffi::v14::waldecoder::WalStreamDecoder; +use postgres_ffi::waldecoder::WalStreamDecoder; use utils::id::TenantTimelineId; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback}; @@ -166,7 +166,7 @@ pub async fn handle_walreceiver_connection( let physical_stream = ReplicationStream::new(copy_stream); pin!(physical_stream); - let mut waldecoder = WalStreamDecoder::new(startpoint); + let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index dbf9bf9d33..258e1a445f 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -3,12 +3,11 @@ //! 
use anyhow::Result; use bytes::{Buf, Bytes}; -use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::xlog_utils::XLOG_SIZE_OF_XLOG_RECORD; -use postgres_ffi::v14::XLogRecord; +use postgres_ffi::pg_constants; use postgres_ffi::BLCKSZ; use postgres_ffi::{BlockNumber, OffsetNumber, TimestampTz}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; +use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; use serde::{Deserialize, Serialize}; use tracing::*; use utils::bin_ser::DeserializeError; @@ -390,6 +389,16 @@ impl XlXactParsedRecord { xid = buf.get_u32_le(); trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE"); } + + if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 { + let nitems = buf.get_i32_le(); + debug!( + "XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}", + nitems + ); + //FIXME: do we need to handle dropped stats here? + } + XlXactParsedRecord { xid, info, @@ -517,6 +526,7 @@ impl XlMultiXactTruncate { pub fn decode_wal_record( record: Bytes, decoded: &mut DecodedWALRecord, + pg_version: u32, ) -> Result<(), DeserializeError> { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; @@ -610,9 +620,21 @@ pub fn decode_wal_record( blk.hole_offset = buf.get_u16_le(); blk.bimg_info = buf.get_u8(); - blk.apply_image = (blk.bimg_info & pg_constants::BKPIMAGE_APPLY) != 0; + blk.apply_image = if pg_version == 14 { + (blk.bimg_info & postgres_ffi::v14::bindings::BKPIMAGE_APPLY) != 0 + } else { + assert_eq!(pg_version, 15); + (blk.bimg_info & postgres_ffi::v15::bindings::BKPIMAGE_APPLY) != 0 + }; - if blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED != 0 { + let blk_img_is_compressed = + postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version); + + if blk_img_is_compressed { + debug!("compressed block image , pg_version = {}", pg_version); + } + + if blk_img_is_compressed { if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 { blk.hole_length = buf.get_u16_le(); } else { @@ -665,9 +687,7 @@ pub fn decode_wal_record( * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED * flag is set. */ - if (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0) - && blk.bimg_len == BLCKSZ - { + if !blk_img_is_compressed && blk.bimg_len == BLCKSZ { // TODO /* report_invalid_record(state, @@ -683,7 +703,7 @@ pub fn decode_wal_record( * IS_COMPRESSED flag is set. 
*/ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 - && blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0 + && !blk_img_is_compressed && blk.bimg_len != BLCKSZ { // TODO diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 79c2edc96e..15a9408dc9 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -46,11 +46,12 @@ use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::NeonWalRecord; use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, }; -use postgres_ffi::v14::pg_constants; use postgres_ffi::BLCKSZ; /// @@ -82,6 +83,7 @@ pub trait WalRedoManager: Send + Sync { lsn: Lsn, base_img: Option, records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, ) -> Result; } @@ -144,6 +146,7 @@ impl WalRedoManager for PostgresRedoManager { lsn: Lsn, base_img: Option, records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, ) -> Result { if records.is_empty() { error!("invalid WAL redo request with no records"); @@ -166,6 +169,7 @@ impl WalRedoManager for PostgresRedoManager { img, &records[batch_start..i], self.conf.wal_redo_timeout, + pg_version, ) }; img = Some(result?); @@ -184,6 +188,7 @@ impl WalRedoManager for PostgresRedoManager { img, &records[batch_start..], self.conf.wal_redo_timeout, + pg_version, ) } } @@ -212,6 +217,7 @@ impl PostgresRedoManager { base_img: Option, records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, + pg_version: u32, ) -> Result { let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; @@ -222,7 +228,7 @@ impl PostgresRedoManager { // launch the WAL redo process on first use if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id)?; + let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?; *process_guard = Some(p); } let process = process_guard.as_mut().unwrap(); @@ -326,7 +332,7 @@ impl PostgresRedoManager { // sanity check that this is modifying the correct relation let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; assert!( - rel.forknum == pg_constants::VISIBILITYMAP_FORKNUM, + rel.forknum == VISIBILITYMAP_FORKNUM, "ClearVisibilityMapFlags record on unexpected rel {}", rel ); @@ -570,7 +576,11 @@ impl PostgresRedoProcess { // // Start postgres binary in special WAL redo mode. // - fn launch(conf: &PageServerConf, tenant_id: &TenantId) -> Result { + fn launch( + conf: &PageServerConf, + tenant_id: &TenantId, + pg_version: u32, + ) -> Result { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. 
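
This launch sequence is the reason pg_version has to be threaded all the way down into the redo manager: the sidecar WAL-redo process must be started from the bin/ and lib/ directories that match the timeline's Postgres major version, otherwise the redo binary would disagree with the WAL format it is asked to replay. A minimal, self-contained sketch of the spawning pattern follows; the paths and helper names are illustrative, not the pageserver's actual config API:

    use std::path::{Path, PathBuf};
    use std::process::{Command, Output};

    // Resolve version-suffixed installation dirs under a single distrib root,
    // mirroring the conf.pg_bin_dir(pg_version) / conf.pg_lib_dir(pg_version)
    // calls in the hunk above.
    fn pg_dirs(distrib_root: &Path, pg_version: u32) -> (PathBuf, PathBuf) {
        let root = distrib_root.join(format!("v{pg_version}"));
        (root.join("bin"), root.join("lib"))
    }

    fn run_initdb(distrib_root: &Path, pg_version: u32, datadir: &Path) -> std::io::Result<Output> {
        let (bin_dir, lib_dir) = pg_dirs(distrib_root, pg_version);
        Command::new(bin_dir.join("initdb"))
            .arg("-D")
            .arg(datadir)
            .arg("-N") // --no-sync: the redo datadir is throwaway
            .env_clear()
            // The child must load the shared libraries of the same major version.
            .env("LD_LIBRARY_PATH", &lib_dir)
            .env("DYLD_LIBRARY_PATH", &lib_dir)
            .output()
    }

Deriving both directories from one (distrib_root, pg_version) pair makes a bin/lib version mismatch impossible by construction; the same pair is reused for the subsequent postgres --wal-redo invocation.
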
@@ -588,12 +598,12 @@ impl PostgresRedoProcess { fs::remove_dir_all(&datadir)?; } info!("running initdb in {}", datadir.display()); - let initdb = Command::new(conf.pg_bin_dir().join("initdb")) + let initdb = Command::new(conf.pg_bin_dir(pg_version).join("initdb")) .args(&["-D", &datadir.to_string_lossy()]) .arg("-N") .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) .close_fds() .output() .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?; @@ -619,14 +629,14 @@ impl PostgresRedoProcess { } // Start postgres itself - let mut child = Command::new(conf.pg_bin_dir().join("postgres")) + let mut child = Command::new(conf.pg_bin_dir(pg_version).join("postgres")) .arg("--wal-redo") .stdin(Stdio::piped()) .stderr(Stdio::piped()) .stdout(Stdio::piped()) .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) .env("PGDATA", &datadir) // The redo process is not trusted, so it runs in seccomp mode // (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 2456eb0752..3de410d117 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -22,7 +22,7 @@ use crate::safekeeper::{ use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; use crate::timeline::Timeline; use crate::GlobalTimelines; -use postgres_ffi::v14::xlog_utils; +use postgres_ffi::encode_logical_message; use postgres_ffi::WAL_SEGMENT_SIZE; use utils::{ lsn::Lsn, @@ -47,6 +47,7 @@ pub struct AppendLogicalMessage { epoch_start_lsn: Lsn, begin_lsn: Lsn, truncate_lsn: Lsn, + pg_version: u32, } #[derive(Serialize, Deserialize)] @@ -68,7 +69,7 @@ pub fn handle_json_ctrl( info!("JSON_CTRL request: {:?}", append_request); // need to init safekeeper state before AppendRequest - let tli = prepare_safekeeper(spg.ttid)?; + let tli = prepare_safekeeper(spg.ttid, append_request.pg_version)?; // if send_proposer_elected is true, we need to update local history if append_request.send_proposer_elected { @@ -95,11 +96,11 @@ pub fn handle_json_ctrl( /// Prepare safekeeper to process append requests without crashes, /// by sending ProposerGreeting with default server.wal_seg_size. -fn prepare_safekeeper(ttid: TenantTimelineId) -> Result> { +fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result> { GlobalTimelines::create( ttid, ServerInfo { - pg_version: 0, // unknown + pg_version, wal_seg_size: WAL_SEGMENT_SIZE as u32, system_id: 0, }, @@ -135,7 +136,7 @@ struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. 
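
One unit convention is worth spelling out here: whenever a Postgres version crosses a protocol boundary in these patches it travels in PG_VERSION_NUM form (major * 10000 + minor). That is why the safekeeper compatibility check in the hunk below compares msg.pg_version / 10000, and why the Python test further down sends int(pg_version) * 10000. A tiny sketch of the convention, with made-up helper names:

    /// PG_VERSION_NUM-style encoding: 14 <-> 140000, 15 <-> 150000.
    fn to_version_num(major: u32) -> u32 {
        major * 10000
    }

    /// Only the major part matters for WAL-format compatibility checks.
    fn major_of(version_num: u32) -> u32 {
        version_num / 10000
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn version_num_roundtrip() {
            assert_eq!(to_version_num(15), 150000);
            assert_eq!(major_of(140002), 14); // the minor part is ignored
        }
    }
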
fn append_logical_message(tli: &Arc, msg: &AppendLogicalMessage) -> Result { - let wal_data = xlog_utils::encode_logical_message(&msg.lm_prefix, &msg.lm_message); + let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); let sk_state = tli.get_state().1; let begin_lsn = msg.begin_lsn; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 65340ac0ed..eec24faf2f 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -27,7 +27,7 @@ use utils::{ pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_FORMAT_VERSION: u32 = 6; const SK_PROTOCOL_VERSION: u32 = 2; -const UNKNOWN_SERVER_VERSION: u32 = 0; +pub const UNKNOWN_SERVER_VERSION: u32 = 0; /// Consensus logical timestamp. pub type Term = u64; @@ -594,15 +594,20 @@ where SK_PROTOCOL_VERSION ); } - // Postgres upgrade is not treated as fatal error - if msg.pg_version != self.state.server.pg_version + /* Postgres major version mismatch is treated as fatal error + * because safekeepers parse WAL headers and the format + * may change between versions. + */ + if msg.pg_version / 10000 != self.state.server.pg_version / 10000 && self.state.server.pg_version != UNKNOWN_SERVER_VERSION { - warn!( + bail!( "incompatible server version {}, expected {}", - msg.pg_version, self.state.server.pg_version + msg.pg_version, + self.state.server.pg_version ); } + if msg.tenant_id != self.state.tenant_id { bail!( "invalid tenant ID, got {}, expected {}", @@ -634,6 +639,10 @@ where let mut state = self.state.clone(); state.server.system_id = msg.system_id; + state.server.wal_seg_size = msg.wal_seg_size; + if msg.pg_version != UNKNOWN_SERVER_VERSION { + state.server.pg_version = msg.pg_version; + } self.state.persist(&state)?; } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 5a38558e9c..2829c875ed 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -8,7 +8,7 @@ use crate::GlobalTimelines; use anyhow::{bail, Context, Result}; use bytes::Bytes; -use postgres_ffi::v14::xlog_utils::get_current_timestamp; +use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::min; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 0d5321fb3a..c82a003161 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -11,7 +11,8 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Duration; -use postgres_ffi::v14::xlog_utils::{XLogFileName, XLogSegNoOffsetToRecPtr}; +use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; +use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; use remote_storage::GenericRemoteStorage; use tokio::fs::File; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 692bd18342..44dc313ef6 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -29,13 +29,14 @@ use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::metrics::{time_io_closure, WalStorageMetrics}; use crate::safekeeper::SafeKeeperState; +use crate::safekeeper::UNKNOWN_SERVER_VERSION; use crate::wal_backup::read_object; use crate::SafeKeeperConf; -use postgres_ffi::v14::xlog_utils::XLogFileName; +use postgres_ffi::XLogFileName; use postgres_ffi::XLOG_BLCKSZ; -use postgres_ffi::v14::waldecoder::WalStreamDecoder; +use postgres_ffi::waldecoder::WalStreamDecoder; use tokio::io::{AsyncReadExt, AsyncSeekExt}; @@ -139,7 +140,7 @@ impl PhysicalStorage { write_lsn, write_record_lsn: 
write_lsn, flush_record_lsn: flush_lsn, - decoder: WalStreamDecoder::new(write_lsn), + decoder: WalStreamDecoder::new(write_lsn, UNKNOWN_SERVER_VERSION), file: None, }) } @@ -291,7 +292,8 @@ impl Storage for PhysicalStorage { self.decoder.available(), startpos, ); - self.decoder = WalStreamDecoder::new(startpos); + let pg_version = self.decoder.pg_version; + self.decoder = WalStreamDecoder::new(startpos, pg_version); } self.decoder.feed_bytes(buf); loop { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1e83ee3839..c1ebc6aa7d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -59,7 +59,7 @@ Env = Dict[str, str] Fn = TypeVar("Fn", bound=Callable[..., Any]) DEFAULT_OUTPUT_DIR = "test_output" -DEFAULT_POSTGRES_DIR = "pg_install/v14" +DEFAULT_PG_VERSION_DEFAULT = "14" DEFAULT_BRANCH_NAME = "main" BASE_PORT = 15000 @@ -71,6 +71,7 @@ base_dir = "" neon_binpath = "" pg_distrib_dir = "" top_output_dir = "" +pg_version = "" def pytest_configure(config): @@ -100,12 +101,21 @@ def pytest_configure(config): Path(top_output_dir).mkdir(exist_ok=True) # Find the postgres installation. + global pg_version + pg_version = os.environ.get("DEFAULT_PG_VERSION", DEFAULT_PG_VERSION_DEFAULT) + global pg_distrib_dir + + # TODO get rid of the POSTGRES_DISTRIB_DIR env var ? + # use DEFAULT_PG_VERSION instead to generate the path env_postgres_bin = os.environ.get("POSTGRES_DISTRIB_DIR") if env_postgres_bin: pg_distrib_dir = env_postgres_bin else: - pg_distrib_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR)) + pg_distrib_dir = os.path.normpath( + os.path.join(base_dir, "pg_install/v{}".format(pg_version)) + ) + log.info(f"pg_distrib_dir is {pg_distrib_dir}") if os.getenv("REMOTE_ENV"): # When testing against a remote server, we only need the client binary. 
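
The fixture change above establishes the same convention on the test side: the major version comes from the DEFAULT_PG_VERSION environment variable, defaulting to 14, and the distribution path gains a v{version} suffix unless POSTGRES_DISTRIB_DIR overrides it outright. For comparison with the Rust side, here is the same resolution logic as a self-contained sketch (the function name is invented for illustration):

    use std::env;
    use std::path::{Path, PathBuf};

    /// Pick the version from DEFAULT_PG_VERSION (falling back to "14"), then
    /// derive the distrib dir unless it is overridden explicitly.
    fn resolve_pg_distrib_dir(base_dir: &Path) -> (String, PathBuf) {
        let pg_version = env::var("DEFAULT_PG_VERSION").unwrap_or_else(|_| "14".to_string());
        let distrib_dir = match env::var("POSTGRES_DISTRIB_DIR") {
            Ok(dir) => PathBuf::from(dir),
            Err(_) => base_dir.join(format!("pg_install/v{pg_version}")),
        };
        (pg_version, distrib_dir)
    }
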
@@ -1185,6 +1195,7 @@ class AbstractNeonCli(abc.ABC): env_vars = os.environ.copy() env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) env_vars["POSTGRES_DISTRIB_DIR"] = str(pg_distrib_dir) + env_vars["DEFAULT_PG_VERSION"] = str(pg_version) if self.env.rust_log_override is not None: env_vars["RUST_LOG"] = self.env.rust_log_override for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): @@ -1251,6 +1262,8 @@ class NeonCli(AbstractNeonCli): str(tenant_id), "--timeline-id", str(timeline_id), + "--pg-version", + pg_version, ] ) else: @@ -1262,6 +1275,8 @@ class NeonCli(AbstractNeonCli): str(tenant_id), "--timeline-id", str(timeline_id), + "--pg-version", + pg_version, ] + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) ) @@ -1296,6 +1311,8 @@ class NeonCli(AbstractNeonCli): new_branch_name, "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--pg-version", + pg_version, ] res = self.raw_cli(cmd) @@ -1317,6 +1334,8 @@ class NeonCli(AbstractNeonCli): branch_name, "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--pg-version", + pg_version, ] res = self.raw_cli(cmd) @@ -1395,6 +1414,9 @@ class NeonCli(AbstractNeonCli): cmd = ["init", f"--config={tmp.name}"] if initial_timeline_id: cmd.extend(["--timeline-id", str(initial_timeline_id)]) + + cmd.extend(["--pg-version", pg_version]) + append_pageserver_param_overrides( params_to_update=cmd, remote_storage=self.env.remote_storage, @@ -1476,6 +1498,8 @@ class NeonCli(AbstractNeonCli): str(tenant_id or self.env.initial_tenant), "--branch-name", branch_name, + "--pg-version", + pg_version, ] if lsn is not None: args.extend(["--lsn", str(lsn)]) @@ -1500,6 +1524,8 @@ class NeonCli(AbstractNeonCli): "start", "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--pg-version", + pg_version, ] if lsn is not None: args.append(f"--lsn={lsn}") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 885a0dc26f..417595ae4d 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -14,6 +14,7 @@ from fixtures.neon_fixtures import ( PgBin, Postgres, pg_distrib_dir, + pg_version, wait_for_last_record_lsn, wait_for_upload, ) @@ -96,6 +97,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build end_lsn, "--wal-tarfile", wal, + "--pg-version", + pg_version, ] ) @@ -248,6 +251,8 @@ def _import( str(lsn), "--base-tarfile", os.path.join(tar_output_file), + "--pg-version", + pg_version, ] ) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index aa5a65f446..4934fb9354 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -5,7 +5,13 @@ import os from pathlib import Path import pytest -from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_content, pg_distrib_dir +from fixtures.neon_fixtures import ( + NeonEnv, + base_dir, + check_restored_datadir_content, + pg_distrib_dir, + pg_version, +) # Run the main PostgreSQL regression tests, in src/test/regress. @@ -26,8 +32,8 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, cap (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. 
- build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/regress") - src_path = os.path.join(base_dir, "vendor/postgres-v14/src/test/regress") + build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/regress").format(pg_version) + src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/regress").format(pg_version) bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "parallel_schedule") pg_regress = os.path.join(build_path, "pg_regress") @@ -80,8 +86,8 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, caps (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/isolation") - src_path = os.path.join(base_dir, "vendor/postgres-v14/src/test/isolation") + build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/isolation".format(pg_version)) + src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/isolation".format(pg_version)) bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "isolation_schedule") pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") @@ -124,7 +130,7 @@ def test_sql_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, ca # Compute all the file locations that pg_regress will need. # This test runs neon specific tests - build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/regress") + build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/regress").format(pg_version) src_path = os.path.join(base_dir, "test_runner/sql_regress") bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "parallel_schedule") diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 931de0f1e3..73e26bd207 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -29,6 +29,7 @@ from fixtures.neon_fixtures import ( SafekeeperPort, available_remote_storages, neon_binpath, + pg_version, wait_for_last_record_lsn, wait_for_upload, ) @@ -634,6 +635,9 @@ class ProposerPostgres(PgProtocol): } basepath = self.pg_bin.run_capture(command, env) + + log.info(f"postgres --sync-safekeepers output: {basepath}") + stdout_filename = basepath + ".stdout" with open(stdout_filename, "r") as stdout_f: @@ -662,7 +666,9 @@ class ProposerPostgres(PgProtocol): # insert wal in all safekeepers and run sync on proposer def test_sync_safekeepers( - neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + port_distributor: PortDistributor, ): # We don't really need the full environment for this test, just the @@ -699,6 +705,7 @@ def test_sync_safekeepers( "begin_lsn": int(begin_lsn), "epoch_start_lsn": int(epoch_start_lsn), "truncate_lsn": int(epoch_start_lsn), + "pg_version": int(pg_version) * 10000, }, ) lsn = Lsn(res["inserted_wal"]["end_lsn"]) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 19d948fd47..796770565f 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 19d948fd47f45d83367062d9a54709cf2d9c8921 +Subproject commit 796770565ff668b585e80733b8d679961ad50e93 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 5b8b3eeef5..9383aaa9c2 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 5b8b3eeef5ec34c0cad9377833906a1387841d04 
+Subproject commit 9383aaa9c2616fd81cfafb058fe0d692f5e43ac3 From 9dfede81467aaaabf21518a949ce870d735155e5 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 14 Sep 2022 18:34:30 +0300 Subject: [PATCH 37/90] Handle backwards-compatibility of TimelineMetadata. This commit bumps TimelineMetadata format version and makes it independent from STORAGE_FORMAT_VERSION. --- pageserver/src/lib.rs | 6 +- pageserver/src/tenant/metadata.rs | 161 ++++++++++++++++++++++++++---- 2 files changed, 148 insertions(+), 19 deletions(-) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 0bd5e242d3..7937f72de7 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -31,9 +31,11 @@ use crate::task_mgr::TaskKind; /// Current storage format version /// -/// This is embedded in the metadata file, and also in the header of all the -/// layer files. If you make any backwards-incompatible changes to the storage +/// This is embedded in the header of all the layer files. +/// If you make any backwards-incompatible changes to the storage /// format, bump this! +/// Note that TimelineMetadata uses its own version number to track +/// backwards-compatible changes to the metadata format. pub const STORAGE_FORMAT_VERSION: u16 = 3; pub const DEFAULT_PG_VERSION: u32 = 14; diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 41790b4d11..6d18153b4c 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -20,7 +20,12 @@ use utils::{ use crate::config::PageServerConf; use crate::virtual_file::VirtualFile; -use crate::STORAGE_FORMAT_VERSION; + +/// Use special format number to enable backward compatibility. +const METADATA_FORMAT_VERSION: u16 = 4; + +/// Previous supported format versions. +const METADATA_OLD_FORMAT_VERSION: u16 = 3; /// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic. /// @@ -34,19 +39,19 @@ const METADATA_MAX_SIZE: usize = 512; #[derive(Debug, Clone, PartialEq, Eq)] pub struct TimelineMetadata { hdr: TimelineMetadataHeader, - body: TimelineMetadataBody, + body: TimelineMetadataBodyV2, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] struct TimelineMetadataHeader { checksum: u32, // CRC of serialized metadata body size: u16, // size of serialized metadata - format_version: u16, // storage format version (used for compatibility checks) + format_version: u16, // metadata format version (used for compatibility checks) } const METADATA_HDR_SIZE: usize = std::mem::size_of::(); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -struct TimelineMetadataBody { +struct TimelineMetadataBodyV2 { disk_consistent_lsn: Lsn, // This is only set if we know it. We track it in memory when the page // server is running, but we only track the value corresponding to @@ -66,6 +71,26 @@ struct TimelineMetadataBody { pg_version: u32, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +struct TimelineMetadataBodyV1 { + disk_consistent_lsn: Lsn, + // This is only set if we know it. We track it in memory when the page + // server is running, but we only track the value corresponding to + // 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a + // lot. We only store it in the metadata file when we flush *all* the + // in-memory data so that 'last_record_lsn' is the same as + // 'disk_consistent_lsn'. That's OK, because after page server restart, as + // soon as we reprocess at least one record, we will have a valid + // 'prev_record_lsn' value in memory again. 
This is only really needed when + // doing a clean shutdown, so that there is no more WAL beyond + // 'disk_consistent_lsn' + prev_record_lsn: Option, + ancestor_timeline: Option, + ancestor_lsn: Lsn, + latest_gc_cutoff_lsn: Lsn, + initdb_lsn: Lsn, +} + impl TimelineMetadata { pub fn new( disk_consistent_lsn: Lsn, @@ -80,9 +105,9 @@ impl TimelineMetadata { hdr: TimelineMetadataHeader { checksum: 0, size: 0, - format_version: STORAGE_FORMAT_VERSION, + format_version: METADATA_FORMAT_VERSION, }, - body: TimelineMetadataBody { + body: TimelineMetadataBodyV2 { disk_consistent_lsn, prev_record_lsn, ancestor_timeline, @@ -94,16 +119,43 @@ impl TimelineMetadata { } } + fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result { + let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; + + // backward compatible only up to this version + ensure!( + hdr.format_version == METADATA_OLD_FORMAT_VERSION, + "unsupported metadata format version {}", + hdr.format_version + ); + + let metadata_size = hdr.size as usize; + + let body: TimelineMetadataBodyV1 = + TimelineMetadataBodyV1::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; + + let body = TimelineMetadataBodyV2 { + disk_consistent_lsn: body.disk_consistent_lsn, + prev_record_lsn: body.prev_record_lsn, + ancestor_timeline: body.ancestor_timeline, + ancestor_lsn: body.ancestor_lsn, + latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn, + initdb_lsn: body.initdb_lsn, + pg_version: 14, // All timelines created before this version had pg_version 14 + }; + + hdr.format_version = METADATA_FORMAT_VERSION; + + Ok(Self { hdr, body }) + } + pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result { ensure!( metadata_bytes.len() == METADATA_MAX_SIZE, "metadata bytes size is wrong" ); let hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; - ensure!( - hdr.format_version == STORAGE_FORMAT_VERSION, - "format version mismatch" - ); + let metadata_size = hdr.size as usize; ensure!( metadata_size <= METADATA_MAX_SIZE, @@ -114,13 +166,20 @@ impl TimelineMetadata { hdr.checksum == calculated_checksum, "metadata checksum mismatch" ); - let body = TimelineMetadataBody::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; - ensure!( - body.disk_consistent_lsn.is_aligned(), - "disk_consistent_lsn is not aligned" - ); - Ok(TimelineMetadata { hdr, body }) + if hdr.format_version != METADATA_FORMAT_VERSION { + // If metadata has the old format, + // upgrade it and return the result + TimelineMetadata::upgrade_timeline_metadata(&metadata_bytes) + } else { + let body = + TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; + ensure!( + body.disk_consistent_lsn.is_aligned(), + "disk_consistent_lsn is not aligned" + ); + Ok(TimelineMetadata { hdr, body }) + } } pub fn to_bytes(&self) -> anyhow::Result> { @@ -128,7 +187,7 @@ impl TimelineMetadata { let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); let hdr = TimelineMetadataHeader { size: metadata_size as u16, - format_version: STORAGE_FORMAT_VERSION, + format_version: METADATA_FORMAT_VERSION, checksum: crc32c::crc32c(&body_bytes), }; let hdr_bytes = hdr.ser()?; @@ -235,4 +294,72 @@ mod tests { "Metadata that was serialized to bytes and deserialized back should not change" ); } + + // Generate old version metadata and read it with current code. 
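
The header-plus-versioned-body layout above is the standard recipe for forward-compatible on-disk metadata: the fixed-size header is decoded first, the reader branches on hdr.format_version, old bodies are upgraded in memory, and to_bytes() always emits the current METADATA_FORMAT_VERSION, so a file is rewritten in the new format on its next flush. Stripped of the pageserver types, the upgrade step is essentially the following sketch (types and names invented for illustration):

    const OLD_FORMAT_VERSION: u16 = 3;
    const CURRENT_FORMAT_VERSION: u16 = 4;

    struct BodyV1 {
        disk_consistent_lsn: u64,
    }

    struct BodyV2 {
        disk_consistent_lsn: u64,
        pg_version: u32,
    }

    enum DecodedBody {
        V1(BodyV1),
        V2(BodyV2),
    }

    fn is_supported(format_version: u16) -> bool {
        format_version == OLD_FORMAT_VERSION || format_version == CURRENT_FORMAT_VERSION
    }

    fn upgrade_to_current(decoded: DecodedBody) -> BodyV2 {
        match decoded {
            // Anything written before the format bump predates multi-version
            // support, so it can only describe a v14 timeline.
            DecodedBody::V1(old) => BodyV2 {
                disk_consistent_lsn: old.disk_consistent_lsn,
                pg_version: 14,
            },
            DecodedBody::V2(body) => body,
        }
    }

The test that follows exercises exactly this path end to end: it hand-writes a format-version-3 file image and asserts that from_bytes() yields the version-4 body with pg_version 14 filled in.
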
+ // Ensure that it is upgraded correctly + #[test] + fn test_metadata_upgrade() { + #[derive(Debug, Clone, PartialEq, Eq)] + struct TimelineMetadataV1 { + hdr: TimelineMetadataHeader, + body: TimelineMetadataBodyV1, + } + + let metadata_v1 = TimelineMetadataV1 { + hdr: TimelineMetadataHeader { + checksum: 0, + size: 0, + format_version: METADATA_OLD_FORMAT_VERSION, + }, + body: TimelineMetadataBodyV1 { + disk_consistent_lsn: Lsn(0x200), + prev_record_lsn: Some(Lsn(0x100)), + ancestor_timeline: Some(TIMELINE_ID), + ancestor_lsn: Lsn(0), + latest_gc_cutoff_lsn: Lsn(0), + initdb_lsn: Lsn(0), + }, + }; + + impl TimelineMetadataV1 { + pub fn to_bytes(&self) -> anyhow::Result> { + let body_bytes = self.body.ser()?; + let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); + let hdr = TimelineMetadataHeader { + size: metadata_size as u16, + format_version: METADATA_OLD_FORMAT_VERSION, + checksum: crc32c::crc32c(&body_bytes), + }; + let hdr_bytes = hdr.ser()?; + let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE]; + metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes); + metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes); + Ok(metadata_bytes) + } + } + + let metadata_bytes = metadata_v1 + .to_bytes() + .expect("Should serialize correct metadata to bytes"); + + // This should deserialize to the latest version format + let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes) + .expect("Should deserialize its own bytes"); + + let expected_metadata = TimelineMetadata::new( + Lsn(0x200), + Some(Lsn(0x100)), + Some(TIMELINE_ID), + Lsn(0), + Lsn(0), + Lsn(0), + 14, // All timelines created before this version had pg_version 14 + ); + + assert_eq!( + deserialized_metadata.body, expected_metadata.body, + "Metadata of the old version {} should be upgraded to the latest version {}", + METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION + ); + } } From 03c606f7c5fbb1bcd2ba79ea0d21849d298c1400 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 14 Sep 2022 19:40:04 +0300 Subject: [PATCH 38/90] Pass pg_version parameter to timeline import command. Add pg_version field to LocalTimelineInfo. 
Use pg_version in the export_import_between_pageservers script --- control_plane/src/bin/neon_local.rs | 12 ++++++------ control_plane/src/storage.rs | 6 ++++-- pageserver/src/http/models.rs | 1 + pageserver/src/http/routes.rs | 1 + pageserver/src/page_service.rs | 13 +++---------- scripts/export_import_between_pageservers.py | 4 +++- 6 files changed, 18 insertions(+), 19 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 92782ea235..93947d5326 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -695,18 +695,18 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - // TODO validate both or none are provided let pg_wal = end_lsn.zip(wal_tarfile); - let mut cplane = ComputeControlPlane::load(env.clone())?; - println!("Importing timeline into pageserver ..."); - pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal)?; - println!("Creating node for imported timeline ..."); - env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; - let pg_version = import_match .value_of("pg-version") .unwrap() .parse::() .context("Failed to parse postgres version from the argument string")?; + let mut cplane = ComputeControlPlane::load(env.clone())?; + println!("Importing timeline into pageserver ..."); + pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?; + println!("Creating node for imported timeline ..."); + env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; + cplane.new_node(tenant_id, name, timeline_id, None, None, pg_version)?; println!("Done"); } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 95ade14fbf..9032f99971 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -547,6 +547,7 @@ impl PageServerNode { timeline_id: TimelineId, base: (Lsn, PathBuf), pg_wal: Option<(Lsn, PathBuf)>, + pg_version: u32, ) -> anyhow::Result<()> { let mut client = self.pg_connection_config.connect(NoTls).unwrap(); @@ -565,8 +566,9 @@ impl PageServerNode { }; // Import base - let import_cmd = - format!("import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn}"); + let import_cmd = format!( + "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" + ); let mut writer = client.copy_in(&import_cmd)?; io::copy(&mut base_reader, &mut writer)?; writer.finish()?; diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 851fa881a0..d5559653b2 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -138,6 +138,7 @@ pub struct LocalTimelineInfo { pub last_received_msg_lsn: Option, /// the timestamp (in microseconds) of the last received message pub last_received_msg_ts: Option, + pub pg_version: u32, } #[serde_as] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6892c0b391..a55c6c973e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -130,6 +130,7 @@ fn local_timeline_info_from_timeline( wal_source_connstr, last_received_msg_lsn, last_received_msg_ts, + pg_version: timeline.pg_version, }; Ok(info) } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index fed5d0dcc4..368b4c8bee 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -959,22 +959,15 @@ impl postgres_backend_async::Handler for PageServerHandler { // 1. Get start/end LSN from backup_manifest file // 2. 
Run: // cat my_backup/base.tar | psql -h $PAGESERVER \ - // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN" + // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!(params.len() >= 4); + ensure!(params.len() == 5); let tenant_id = TenantId::from_str(params[0])?; let timeline_id = TimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; - - let pg_version = if params.len() == 5 { - u32::from_str(params[4])? - } else { - // If version is not provided, assume default. - // TODO: this may lead to weird errors if the version is wrong. - crate::DEFAULT_PG_VERSION - }; + let pg_version = u32::from_str(params[4])?; self.check_permission(Some(tenant_id))?; diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index af847be49e..0fccf5199d 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -470,9 +470,10 @@ def import_timeline( last_lsn, prev_lsn, tar_filename, + pg_version, ): # Import timelines to new pageserver - import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn}" + import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}" full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") @@ -594,6 +595,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, tar_filename, + timeline["local"]["pg_version"], ) # Re-export and compare From a4397d43e997247f703b28baa81d5ffa727a65bd Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 14 Sep 2022 20:16:22 +0300 Subject: [PATCH 39/90] Rename waldecoder -> waldecoder_handler.rs. Add comments --- libs/postgres_ffi/src/lib.rs | 8 +++++--- .../src/{waldecoder.rs => waldecoder_handler.rs} | 11 +++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) rename libs/postgres_ffi/src/{waldecoder.rs => waldecoder_handler.rs} (95%) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 25e1f6029c..1a6620a180 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -31,7 +31,7 @@ macro_rules! postgres_ffi { } pub mod controlfile_utils; pub mod nonrelfile_utils; - pub mod waldecoder; + pub mod waldecoder_handler; pub mod xlog_utils; pub const PG_MAJORVERSION: &str = stringify!($version); @@ -216,12 +216,14 @@ pub mod waldecoder { pub fn poll_decode(&mut self) -> Result, WalDecodeError> { match self.pg_version { + // This is a trick to support both versions simultaneously. + // See WalStreamDecoderHandler comments. 
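
This match is the seam where the two per-version compilations meet: the page decoding code is compiled once per supported major version, each compilation implements a same-named handler trait for the single WalStreamDecoder struct, and poll_decode() selects an implementation by importing the right trait into scope. A self-contained sketch of the trick, with shortened names and error handling reduced to a panic:

    struct Decoder {
        pg_version: u32,
    }

    mod v14 {
        pub trait Handler {
            fn decode_impl(&self) -> &'static str;
        }
        impl Handler for super::Decoder {
            fn decode_impl(&self) -> &'static str {
                "decoded with v14 page-header rules"
            }
        }
    }

    mod v15 {
        pub trait Handler {
            fn decode_impl(&self) -> &'static str;
        }
        impl Handler for super::Decoder {
            fn decode_impl(&self) -> &'static str {
                "decoded with v15 page-header rules"
            }
        }
    }

    impl Decoder {
        fn poll_decode(&self) -> &'static str {
            match self.pg_version {
                14 => {
                    use v14::Handler;
                    self.decode_impl()
                }
                15 => {
                    use v15::Handler;
                    self.decode_impl()
                }
                v => panic!("unsupported pg_version {v}"),
            }
        }
    }
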
14 => { - use self::v14::waldecoder::WalStreamDecoderHandler; + use self::v14::waldecoder_handler::WalStreamDecoderHandler; self.poll_decode_internal() } 15 => { - use self::v15::waldecoder::WalStreamDecoderHandler; + use self::v15::waldecoder_handler::WalStreamDecoderHandler; self.poll_decode_internal() } _ => Err(WalDecodeError { diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder_handler.rs similarity index 95% rename from libs/postgres_ffi/src/waldecoder.rs rename to libs/postgres_ffi/src/waldecoder_handler.rs index 5b46d52321..b4d50375bd 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder_handler.rs @@ -26,8 +26,15 @@ pub trait WalStreamDecoderHandler { } // -// WalRecordStream is a Stream that returns a stream of WAL records -// FIXME: This isn't a proper rust stream +// This is a trick to support several postgres versions simultaneously. +// +// Page decoding code depends on postgres bindings, so it is compiled for each version. +// Thus WalStreamDecoder implements several WalStreamDecoderHandler traits. +// WalStreamDecoder poll_decode() method dispatches to the right handler based on the postgres version. +// Other methods are internal and are not dispatched. +// +// It is similar to having several impl blocks for the same struct, +// but the impls here are in different modules, so need to use a trait. // impl WalStreamDecoderHandler for WalStreamDecoder { fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> { From a69e060f0f0683683c33fa39128173aadc35a04b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 14 Sep 2022 20:38:59 +0300 Subject: [PATCH 40/90] fix clippy warning --- pageserver/src/tenant/metadata.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 6d18153b4c..3fb9ccd936 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -170,7 +170,7 @@ impl TimelineMetadata { if hdr.format_version != METADATA_FORMAT_VERSION { // If metadata has the old format, // upgrade it and return the result - TimelineMetadata::upgrade_timeline_metadata(&metadata_bytes) + TimelineMetadata::upgrade_timeline_metadata(metadata_bytes) } else { let body = TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; From d45de3d58f12fb143963faf61cd874831e3cc6a9 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 15 Sep 2022 17:27:10 +0300 Subject: [PATCH 41/90] update build scripts to match pg_distrib_dir versioning schema --- .github/actions/run-python-test-set/action.yml | 4 ++-- .github/workflows/pg_clients.yml | 4 ++-- Dockerfile | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index e69cb28df1..fc3b1c9c37 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -85,7 +85,7 @@ runs: # PLATFORM will be embedded in the perf test report # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} - export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install/v14} + export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 @@ -126,7 +126,7 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ 
inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + ${POSTGRES_DISTRIB_DIR}/v14/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" fi # Run the tests. diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index d04d002811..0600f9234f 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -58,12 +58,12 @@ jobs: env: REMOTE_ENV: 1 BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install/v14 + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install shell: bash -euxo pipefail {0} run: | # Test framework expects we have psql binary; # but since we don't really need it in this test, let's mock it - mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql"; + mkdir -p "$POSTGRES_DISTRIB_DIR/v14/bin" && touch "$POSTGRES_DISTRIB_DIR/v14/bin/psql"; ./scripts/pytest \ --junitxml=$TEST_OUTPUT/junit.xml \ --tb=short \ diff --git a/Dockerfile b/Dockerfile index 213934a844..876a20cc1a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -68,8 +68,8 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin -# v14 is default for now -COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/ +COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ +COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. @@ -78,7 +78,7 @@ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ && /usr/local/bin/pageserver -D /data/.neon/ --init \ -c "id=1234" \ -c "broker_endpoints=['http://etcd:2379']" \ - -c "pg_distrib_dir='/usr/local'" \ + -c "pg_distrib_dir='/usr/local/'" \ -c "listen_pg_addr='0.0.0.0:6400'" \ -c "listen_http_addr='0.0.0.0:9898'" From 5dddeb8d88354621a6b1e690057b16ce1a5c6a79 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 15 Sep 2022 17:40:29 +0300 Subject: [PATCH 42/90] Use non-versioned pg_distrib dir --- control_plane/src/local_env.rs | 29 ++++++++++-------------- control_plane/src/storage.rs | 2 +- docs/settings.md | 2 ++ pageserver/src/config.rs | 40 ++++++++++------------------------ pageserver/src/http/routes.rs | 2 +- 5 files changed, 26 insertions(+), 49 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 14bb4cf346..f4fbc99420 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -197,25 +197,18 @@ impl Default for SafekeeperConf { } impl LocalEnv { + pub fn pg_distrib_dir_raw(&self) -> PathBuf { + self.pg_distrib_dir.clone() + } + pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { - let mut path = self.pg_distrib_dir.clone(); + let path = self.pg_distrib_dir.clone(); - if pg_version != DEFAULT_PG_VERSION { - // step up to the parent directory - // We assume that the pg_distrib subdirs - // for different pg versions - // are located in the same directory - // and follow the naming convention: v14, v15, etc. 
- path.pop(); - - match pg_version { - 14 => return path.join(format!("v{pg_version}")), - 15 => return path.join(format!("v{pg_version}")), - _ => panic!("Unsupported postgres version: {}", pg_version), - }; + match pg_version { + 14 => path.join(format!("v{pg_version}")), + 15 => path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), } - - path } pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { @@ -319,7 +312,7 @@ impl LocalEnv { let mut env: LocalEnv = toml::from_str(toml)?; // Find postgres binaries. - // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install/v14". + // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". // Note that later in the code we assume, that distrib dirs follow the same pattern // for all postgres versions. if env.pg_distrib_dir == Path::new("") { @@ -327,7 +320,7 @@ impl LocalEnv { env.pg_distrib_dir = postgres_bin.into(); } else { let cwd = env::current_dir()?; - env.pg_distrib_dir = cwd.join("pg_install/v14") + env.pg_distrib_dir = cwd.join("pg_install") } } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 9032f99971..bfbd6e91c3 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -118,7 +118,7 @@ impl PageServerNode { // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let pg_distrib_dir_param = format!( "pg_distrib_dir='{}'", - self.env.pg_distrib_dir(pg_version).display() + self.env.pg_distrib_dir_raw().display() ); let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); diff --git a/docs/settings.md b/docs/settings.md index 30db495dbe..878681fce1 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -155,6 +155,8 @@ for other files and for sockets for incoming connections. #### pg_distrib_dir A directory with Postgres installation to use during pageserver activities. +Since pageserver supports several postgres versions, `pg_distrib_dir` contains +a subdirectory for each version with naming convention `v{PG_MAJOR_VERSION}/`. Inside that dir, a `bin/postgres` binary should be present. The default distrib dir is `./pg_install/`. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a4346c0190..b75f8f8265 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -21,7 +21,6 @@ use utils::{ use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::DEFAULT_PG_VERSION; /// The name of the metadata file pageserver creates per timeline. pub const METADATA_FILE_NAME: &str = "metadata"; @@ -210,7 +209,7 @@ impl Default for PageServerConfigBuilder { workdir: Set(PathBuf::new()), pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") - .join(format!("pg_install/v{}", DEFAULT_PG_VERSION))), + .join(format!("pg_install",))), auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), @@ -376,24 +375,13 @@ impl PageServerConf { // Postgres distribution paths // pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { - let mut path = self.pg_distrib_dir.clone(); + let path = self.pg_distrib_dir.clone(); - if pg_version != DEFAULT_PG_VERSION { - // step up to the parent directory - // We assume that the pg_distrib subdirs - // for different pg versions - // are located in the same directory - // and follow the naming convention: v14, v15, etc. 
- path.pop(); - - match pg_version { - 14 => return path.join(format!("v{pg_version}")), - 15 => return path.join(format!("v{pg_version}")), - _ => panic!("Unsupported postgres version: {}", pg_version), - }; + match pg_version { + 14 => path.join(format!("v{pg_version}")), + 15 => path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), } - - path } pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { @@ -477,14 +465,6 @@ impl PageServerConf { ); } - let pg_version = DEFAULT_PG_VERSION; - if !conf.pg_bin_dir(pg_version).join("postgres").exists() { - bail!( - "Can't find postgres binary at {}", - conf.pg_bin_dir(pg_version).display() - ); - } - conf.default_tenant_conf = t_conf.merge(TenantConf::default()); Ok(conf) @@ -654,6 +634,7 @@ mod tests { use tempfile::{tempdir, TempDir}; use super::*; + use crate::DEFAULT_PG_VERSION; const ALL_BASE_VALUES_TOML: &str = r#" # Initial configuration file created by 'pageserver --init' @@ -892,9 +873,10 @@ broker_endpoints = ['{broker_endpoint}'] let workdir = tempdir_path.join("workdir"); fs::create_dir_all(&workdir)?; - let pg_distrib_dir = tempdir_path.join(format!("pg_distrib/v{DEFAULT_PG_VERSION}")); - fs::create_dir_all(&pg_distrib_dir)?; - let postgres_bin_dir = pg_distrib_dir.join("bin"); + let pg_distrib_dir = tempdir_path.join("pg_distrib"); + let pg_distrib_dir_versioned = pg_distrib_dir.join(format!("v{DEFAULT_PG_VERSION}")); + fs::create_dir_all(&pg_distrib_dir_versioned)?; + let postgres_bin_dir = pg_distrib_dir_versioned.join("bin"); fs::create_dir_all(&postgres_bin_dir)?; fs::write(postgres_bin_dir.join("postgres"), "I'm postgres, trust me")?; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a55c6c973e..72cbb0e819 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -174,7 +174,7 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. From 1255ef806feea438c45dc3ee808ab53deefca6c6 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Sun, 18 Sep 2022 21:10:00 +0300 Subject: [PATCH 43/90] pass version to wal_craft.rs --- libs/postgres_ffi/src/xlog_utils.rs | 3 ++- libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs | 15 ++++++++++++++- libs/postgres_ffi/wal_craft/src/lib.rs | 15 +++++++++++++-- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 8389a6e971..038e0491a0 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -471,7 +471,8 @@ mod tests { .join("..") .join(".."); let cfg = Conf { - pg_distrib_dir: top_path.join(format!("pg_install/{PG_MAJORVERSION}")), + pg_version: PG_MAJORVERSION, + pg_distrib_dir: top_path.join(format!("pg_install")), datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), }; if cfg.datadir.exists() { diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 2a607db6dc..9b9f76de7c 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -37,9 +37,16 @@ fn main() -> Result<()> { Arg::new("pg-distrib-dir") .long("pg-distrib-dir") .takes_value(true) - .help("Directory with Postgres distribution (bin and lib directories, e.g. pg_install/v14)") + .help("Directory with Postgres distribution (bin and lib directories, e.g. 
pg_install)") .default_value("/usr/local") ) + .arg( + Arg::new("pg-version") + .long("pg-version") + .help("Postgres version to use for the initial tenant") + .required(true) + .takes_value(true) + ) ) .subcommand( App::new("in-existing") @@ -82,8 +89,14 @@ fn main() -> Result<()> { } Ok(()) } + Some(("with-initdb", arg_matches)) => { let cfg = Conf { + pg_version: arg_matches + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?, pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(), datadir: arg_matches.value_of("datadir").unwrap().into(), }; diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 2ad92d776d..7ffe19e209 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -15,6 +15,7 @@ use tempfile::{tempdir, TempDir}; #[derive(Debug, Clone, PartialEq, Eq)] pub struct Conf { + pub pg_version: u32, pub pg_distrib_dir: PathBuf, pub datadir: PathBuf, } @@ -36,12 +37,22 @@ pub static REQUIRED_POSTGRES_CONFIG: Lazy> = Lazy::new(|| { }); impl Conf { + pub fn pg_distrib_dir(&self) -> PathBuf { + let path = self.pg_distrib_dir.clone(); + + match self.pg_version { + 14 => path.join(format!("v{}", self.pg_version)), + 15 => path.join(format!("v{}", self.pg_version)), + _ => panic!("Unsupported postgres version: {}", self.pg_version), + } + } + fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") + self.pg_distrib_dir().join("bin") } fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + self.pg_distrib_dir().join("lib") } pub fn wal_dir(&self) -> PathBuf { From 0fde59aa4628c3e25048e014e1519e3c83462092 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Sun, 18 Sep 2022 22:45:29 +0300 Subject: [PATCH 44/90] use pg_version in python tests --- scripts/export_import_between_pageservers.py | 33 ++++-- test_runner/fixtures/neon_fixtures.py | 106 ++++++++++++------- test_runner/regress/test_import.py | 5 +- test_runner/regress/test_pg_regress.py | 26 ++--- test_runner/regress/test_wal_acceptor.py | 3 +- test_runner/regress/test_wal_restore.py | 4 +- 6 files changed, 109 insertions(+), 68 deletions(-) diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 0fccf5199d..1285d0476b 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -80,11 +80,13 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: class PgBin: """A helper class for executing postgres binaries""" - def __init__(self, log_dir: Path, pg_distrib_dir): + def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "bin") + self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), "lib") + self.env["LD_LIBRARY_PATH"] = os.path.join( + str(pg_distrib_dir), "v{}".format(pg_version), "lib" + ) def _fixpath(self, command: List[str]): if "/" not in command[0]: @@ -484,7 +486,7 @@ def import_timeline( with open(stdout_filename, "w") as stdout_f: with open(stderr_filename2, "w") as stderr_f: print(f"(capturing output to {stdout_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) subprocess.run( full_cmd, stdout=stdout_f, 
@@ -503,7 +505,15 @@ def import_timeline( def export_timeline( - args, psql_path, pageserver_connstr, tenant_id, timeline_id, last_lsn, prev_lsn, tar_filename + args, + psql_path, + pageserver_connstr, + tenant_id, + timeline_id, + last_lsn, + prev_lsn, + tar_filename, + pg_version, ): # Choose filenames incomplete_filename = tar_filename + ".incomplete" @@ -518,13 +528,13 @@ def export_timeline( with open(incomplete_filename, "w") as stdout_f: with open(stderr_filename, "w") as stderr_f: print(f"(capturing output to {incomplete_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) subprocess.run( cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True ) # Add missing rels - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin) # Log more info @@ -533,7 +543,8 @@ def export_timeline( def main(args: argparse.Namespace): - psql_path = str(Path(args.pg_distrib_dir) / "bin" / "psql") + # any psql version will do here. use current DEFAULT_PG_VERSION = 14 + psql_path = str(Path(args.pg_distrib_dir) / "v14" / "bin" / "psql") old_pageserver_host = args.old_pageserver_host new_pageserver_host = args.new_pageserver_host @@ -566,6 +577,8 @@ def main(args: argparse.Namespace): args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" ) + pg_version = timeline["local"]["pg_version"] + # Export timeline from old pageserver if args.only_import is False: last_lsn, prev_lsn = get_rlsn( @@ -582,6 +595,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, tar_filename, + pg_version, ) # Import into new pageserver @@ -595,7 +609,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, tar_filename, - timeline["local"]["pg_version"], + pg_version, ) # Re-export and compare @@ -609,6 +623,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, re_export_filename, + pg_version, ) # Check the size is the same diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c1ebc6aa7d..3c60437426 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -59,8 +59,8 @@ Env = Dict[str, str] Fn = TypeVar("Fn", bound=Callable[..., Any]) DEFAULT_OUTPUT_DIR = "test_output" -DEFAULT_PG_VERSION_DEFAULT = "14" DEFAULT_BRANCH_NAME = "main" +DEFAULT_PG_VERSION_DEFAULT = "14" BASE_PORT = 15000 WORKER_PORT_NUM = 1000 @@ -71,7 +71,7 @@ base_dir = "" neon_binpath = "" pg_distrib_dir = "" top_output_dir = "" -pg_version = "" +default_pg_version = "" def pytest_configure(config): @@ -101,29 +101,36 @@ def pytest_configure(config): Path(top_output_dir).mkdir(exist_ok=True) # Find the postgres installation. - global pg_version - pg_version = os.environ.get("DEFAULT_PG_VERSION", DEFAULT_PG_VERSION_DEFAULT) + global default_pg_version + log.info(f"default_pg_version is {default_pg_version}") + env_default_pg_version = os.environ.get("DEFAULT_PG_VERSION") + if env_default_pg_version: + default_pg_version = env_default_pg_version + log.info(f"default_pg_version is set to {default_pg_version}") + else: + default_pg_version = DEFAULT_PG_VERSION_DEFAULT global pg_distrib_dir - # TODO get rid of the POSTGRES_DISTRIB_DIR env var ? 
- # use DEFAULT_PG_VERSION instead to generate the path env_postgres_bin = os.environ.get("POSTGRES_DISTRIB_DIR") if env_postgres_bin: pg_distrib_dir = env_postgres_bin else: - pg_distrib_dir = os.path.normpath( - os.path.join(base_dir, "pg_install/v{}".format(pg_version)) - ) + pg_distrib_dir = os.path.normpath(os.path.join(base_dir, "pg_install")) log.info(f"pg_distrib_dir is {pg_distrib_dir}") + psql_bin_path = os.path.join(pg_distrib_dir, "v{}".format(default_pg_version), "bin/psql") + postgres_bin_path = os.path.join( + pg_distrib_dir, "v{}".format(default_pg_version), "bin/postgres" + ) + if os.getenv("REMOTE_ENV"): # When testing against a remote server, we only need the client binary. - if not os.path.exists(os.path.join(pg_distrib_dir, "bin/psql")): - raise Exception('psql not found at "{}"'.format(pg_distrib_dir)) + if not os.path.exists(psql_bin_path): + raise Exception('psql not found at "{}"'.format(psql_bin_path)) else: - if not os.path.exists(os.path.join(pg_distrib_dir, "bin/postgres")): - raise Exception('postgres not found at "{}"'.format(pg_distrib_dir)) + if not os.path.exists(postgres_bin_path): + raise Exception('postgres not found at "{}"'.format(postgres_bin_path)) if os.getenv("REMOTE_ENV"): # we are in remote env and do not have neon binaries locally @@ -549,6 +556,7 @@ class NeonEnvBuilder: self.env: Optional[NeonEnv] = None self.remote_storage_prefix: Optional[str] = None self.keep_remote_storage_contents: bool = True + self.pg_version = default_pg_version def init(self) -> NeonEnv: # Cannot create more than one environment from one builder @@ -761,6 +769,7 @@ class NeonEnv: self.broker = config.broker self.remote_storage = config.remote_storage self.remote_storage_users = config.remote_storage_users + self.pg_version = config.pg_version # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
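
The resolution order implemented above is: an explicit environment variable wins, otherwise fall back to a path derived from the repository checkout, then fail fast if the expected binary is missing. A hedged Rust sketch of the same logic (function names are illustrative, and the fallback constant mirrors the Python default):

```rust
use std::env;
use std::path::{Path, PathBuf};

const FALLBACK_PG_VERSION: &str = "14";

fn resolve_pg_distrib_dir(base_dir: &Path) -> PathBuf {
    // POSTGRES_DISTRIB_DIR overrides; otherwise use <base_dir>/pg_install.
    env::var("POSTGRES_DISTRIB_DIR")
        .map(PathBuf::from)
        .unwrap_or_else(|_| base_dir.join("pg_install"))
}

fn resolve_default_pg_version() -> String {
    env::var("DEFAULT_PG_VERSION").unwrap_or_else(|_| FALLBACK_PG_VERSION.to_string())
}

fn sanity_check(base_dir: &Path) -> Result<(), String> {
    let postgres = resolve_pg_distrib_dir(base_dir)
        .join(format!("v{}", resolve_default_pg_version()))
        .join("bin/postgres");
    if postgres.exists() {
        Ok(())
    } else {
        Err(format!("postgres not found at {}", postgres.display()))
    }
}
```

Failing fast here keeps the error next to the misconfiguration instead of surfacing later as an opaque test failure.
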
@@ -1195,7 +1204,6 @@ class AbstractNeonCli(abc.ABC): env_vars = os.environ.copy() env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) env_vars["POSTGRES_DISTRIB_DIR"] = str(pg_distrib_dir) - env_vars["DEFAULT_PG_VERSION"] = str(pg_version) if self.env.rust_log_override is not None: env_vars["RUST_LOG"] = self.env.rust_log_override for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): @@ -1263,7 +1271,7 @@ class NeonCli(AbstractNeonCli): "--timeline-id", str(timeline_id), "--pg-version", - pg_version, + self.env.pg_version, ] ) else: @@ -1276,7 +1284,7 @@ class NeonCli(AbstractNeonCli): "--timeline-id", str(timeline_id), "--pg-version", - pg_version, + self.env.pg_version, ] + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) ) @@ -1302,7 +1310,9 @@ class NeonCli(AbstractNeonCli): return res def create_timeline( - self, new_branch_name: str, tenant_id: Optional[TenantId] = None + self, + new_branch_name: str, + tenant_id: Optional[TenantId] = None, ) -> TimelineId: cmd = [ "timeline", @@ -1312,7 +1322,7 @@ class NeonCli(AbstractNeonCli): "--tenant-id", str(tenant_id or self.env.initial_tenant), "--pg-version", - pg_version, + self.env.pg_version, ] res = self.raw_cli(cmd) @@ -1326,7 +1336,11 @@ class NeonCli(AbstractNeonCli): return TimelineId(str(created_timeline_id)) - def create_root_branch(self, branch_name: str, tenant_id: Optional[TenantId] = None): + def create_root_branch( + self, + branch_name: str, + tenant_id: Optional[TenantId] = None, + ): cmd = [ "timeline", "create", @@ -1335,7 +1349,7 @@ class NeonCli(AbstractNeonCli): "--tenant-id", str(tenant_id or self.env.initial_tenant), "--pg-version", - pg_version, + self.env.pg_version, ] res = self.raw_cli(cmd) @@ -1405,7 +1419,9 @@ class NeonCli(AbstractNeonCli): return timelines_cli def init( - self, config_toml: str, initial_timeline_id: Optional[TimelineId] = None + self, + config_toml: str, + initial_timeline_id: Optional[TimelineId] = None, ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(config_toml) @@ -1415,7 +1431,7 @@ class NeonCli(AbstractNeonCli): if initial_timeline_id: cmd.extend(["--timeline-id", str(initial_timeline_id)]) - cmd.extend(["--pg-version", pg_version]) + cmd.extend(["--pg-version", self.env.pg_version]) append_pageserver_param_overrides( params_to_update=cmd, @@ -1443,7 +1459,10 @@ class NeonCli(AbstractNeonCli): log.info(f"pageserver_enabled_features success: {res.stdout}") return json.loads(res.stdout) - def pageserver_start(self, overrides=()) -> "subprocess.CompletedProcess[str]": + def pageserver_start( + self, + overrides=(), + ) -> "subprocess.CompletedProcess[str]": start_args = ["pageserver", "start", *overrides] append_pageserver_param_overrides( params_to_update=start_args, @@ -1499,7 +1518,7 @@ class NeonCli(AbstractNeonCli): "--branch-name", branch_name, "--pg-version", - pg_version, + self.env.pg_version, ] if lsn is not None: args.extend(["--lsn", str(lsn)]) @@ -1525,7 +1544,7 @@ class NeonCli(AbstractNeonCli): "--tenant-id", str(tenant_id or self.env.initial_tenant), "--pg-version", - pg_version, + self.env.pg_version, ] if lsn is not None: args.append(f"--lsn={lsn}") @@ -1655,11 +1674,13 @@ def append_pageserver_param_overrides( class PgBin: """A helper class for executing postgres binaries""" - def __init__(self, log_dir: Path): + def __init__(self, log_dir: Path, pg_version: str): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "bin") + self.pg_version = 
pg_version + self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") + self.pg_lib_dir = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "lib") self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), "lib") + self.env["LD_LIBRARY_PATH"] = self.pg_lib_dir def _fixpath(self, command: List[str]): if "/" not in command[0]: @@ -1714,8 +1735,8 @@ class PgBin: @pytest.fixture(scope="function") -def pg_bin(test_output_dir: Path) -> PgBin: - return PgBin(test_output_dir) +def pg_bin(test_output_dir: Path, pg_version: str) -> PgBin: + return PgBin(test_output_dir, pg_version) class VanillaPostgres(PgProtocol): @@ -1762,12 +1783,19 @@ class VanillaPostgres(PgProtocol): self.stop() +@pytest.fixture(scope="session") +def pg_version() -> str: + return default_pg_version + + @pytest.fixture(scope="function") def vanilla_pg( - test_output_dir: Path, port_distributor: PortDistributor + test_output_dir: Path, + port_distributor: PortDistributor, + pg_version: str, ) -> Iterator[VanillaPostgres]: pgdatadir = test_output_dir / "pgdata-vanilla" - pg_bin = PgBin(test_output_dir) + pg_bin = PgBin(test_output_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: yield vanilla_pg @@ -1803,8 +1831,8 @@ class RemotePostgres(PgProtocol): @pytest.fixture(scope="function") -def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: - pg_bin = PgBin(test_output_dir) +def remote_pg(test_output_dir: Path, pg_version: str) -> Iterator[RemotePostgres]: + pg_bin = PgBin(test_output_dir, pg_version) connstr = os.getenv("BENCHMARK_CONNSTR") if connstr is None: @@ -2533,7 +2561,11 @@ def list_files_to_compare(pgdata_dir: Path): # pg is the existing and running compute node, that we want to compare with a basebackup -def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres): +def check_restored_datadir_content( + test_output_dir: Path, + env: NeonEnv, + pg: Postgres, +): # Get the timeline ID. We need it for the 'basebackup' command timeline = TimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) @@ -2544,7 +2576,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post restored_dir_path = env.repo_dir / f"{pg.node_name}_restored_datadir" restored_dir_path.mkdir(exist_ok=True) - pg_bin = PgBin(test_output_dir) + pg_bin = PgBin(test_output_dir, env.pg_version) psql_path = os.path.join(pg_bin.pg_bin_path, "psql") cmd = rf""" @@ -2557,7 +2589,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} + psql_env = {"LD_LIBRARY_PATH": pg_bin.pg_lib_dir} result = subprocess.run(cmd, env=psql_env, capture_output=True, text=True, shell=True) # Print captured stdout/stderr if basebackup cmd failed. 
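
check_restored_datadir_content above works by taking a basebackup of the running compute node, restoring it next to the original datadir, and comparing the two file trees. A sketch of the comparison idea in Rust (the `walkdir` crate is a common choice for the traversal and is an assumption here, not a dependency of this repository):

```rust
use std::collections::BTreeSet;
use std::path::Path;
use walkdir::WalkDir;

/// Collect the relative paths of all regular files under `root`.
fn list_files(root: &Path) -> BTreeSet<String> {
    WalkDir::new(root)
        .into_iter()
        .filter_map(Result::ok)
        .filter(|entry| entry.file_type().is_file())
        .map(|entry| {
            entry
                .path()
                .strip_prefix(root)
                .expect("entry is under root")
                .to_string_lossy()
                .into_owned()
        })
        .collect()
}

fn datadirs_match(original: &Path, restored: &Path) -> bool {
    // Comparing sorted relative paths catches missing or extra files;
    // content comparison would be a second pass over the intersection.
    list_files(original) == list_files(restored)
}
```
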
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 417595ae4d..c84d282a4d 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -14,7 +14,6 @@ from fixtures.neon_fixtures import ( PgBin, Postgres, pg_distrib_dir, - pg_version, wait_for_last_record_lsn, wait_for_upload, ) @@ -98,7 +97,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build "--wal-tarfile", wal, "--pg-version", - pg_version, + env.pg_version, ] ) @@ -252,7 +251,7 @@ def _import( "--base-tarfile", os.path.join(tar_output_file), "--pg-version", - pg_version, + env.pg_version, ] ) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 4934fb9354..f23811b671 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -5,13 +5,7 @@ import os from pathlib import Path import pytest -from fixtures.neon_fixtures import ( - NeonEnv, - base_dir, - check_restored_datadir_content, - pg_distrib_dir, - pg_version, -) +from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_content, pg_distrib_dir # Run the main PostgreSQL regression tests, in src/test/regress. @@ -32,9 +26,9 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, cap (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. - build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/regress").format(pg_version) - src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/regress").format(pg_version) - bindir = os.path.join(pg_distrib_dir, "bin") + build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/regress").format(env.pg_version) + src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/regress").format(env.pg_version) + bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") schedule = os.path.join(src_path, "parallel_schedule") pg_regress = os.path.join(build_path, "pg_regress") @@ -86,9 +80,11 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, caps (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/isolation".format(pg_version)) - src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/isolation".format(pg_version)) - bindir = os.path.join(pg_distrib_dir, "bin") + build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/isolation".format(env.pg_version)) + src_path = os.path.join( + base_dir, "vendor/postgres-v{}/src/test/isolation".format(env.pg_version) + ) + bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") schedule = os.path.join(src_path, "isolation_schedule") pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") @@ -130,9 +126,9 @@ def test_sql_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, ca # Compute all the file locations that pg_regress will need. 
# This test runs neon specific tests - build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/regress").format(pg_version) + build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/regress").format(env.pg_version) src_path = os.path.join(base_dir, "test_runner/sql_regress") - bindir = os.path.join(pg_distrib_dir, "bin") + bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") schedule = os.path.join(src_path, "parallel_schedule") pg_regress = os.path.join(build_path, "pg_regress") diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 73e26bd207..d5a5ec2f36 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -29,7 +29,6 @@ from fixtures.neon_fixtures import ( SafekeeperPort, available_remote_storages, neon_binpath, - pg_version, wait_for_last_record_lsn, wait_for_upload, ) @@ -705,7 +704,7 @@ def test_sync_safekeepers( "begin_lsn": int(begin_lsn), "epoch_start_lsn": int(epoch_start_lsn), "truncate_lsn": int(epoch_start_lsn), - "pg_version": int(pg_version) * 10000, + "pg_version": int(env.pg_version) * 10000, }, ) lsn = Lsn(res["inserted_wal"]["end_lsn"]) diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 21921a3bc2..db6f1e5137 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -26,11 +26,11 @@ def test_wal_restore( env.neon_cli.pageserver_stop() port = port_distributor.get_port() data_dir = test_output_dir / "pgsql.restored" - with VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored: + with VanillaPostgres(data_dir, PgBin(test_output_dir, env.pg_version), port) as restored: pg_bin.run_capture( [ os.path.join(base_dir, "libs/utils/scripts/restore_from_wal.sh"), - os.path.join(pg_distrib_dir, "bin"), + os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin"), str(test_output_dir / "repo" / "safekeepers" / "sk1" / str(tenant_id) / "*"), str(data_dir), str(port), From 8d890b3cbb150136dd6a7eab9556bd006fe18823 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 08:38:30 +0300 Subject: [PATCH 45/90] fix clippy warnings --- libs/postgres_ffi/src/xlog_utils.rs | 6 ++++-- pageserver/src/config.rs | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 038e0491a0..2c16cc9cd9 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -466,13 +466,15 @@ mod tests { fn test_end_of_wal(test_name: &str) { use wal_craft::*; + let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + // Craft some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("..") .join(".."); let cfg = Conf { - pg_version: PG_MAJORVERSION, - pg_distrib_dir: top_path.join(format!("pg_install")), + pg_version, + pg_distrib_dir: top_path.join("pg_install"), datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), }; if cfg.datadir.exists() { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b75f8f8265..a52a3e8262 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -209,7 +209,7 @@ impl Default for PageServerConfigBuilder { workdir: Set(PathBuf::new()), pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") - .join(format!("pg_install",))), + .join("pg_install")), auth_type: Set(AuthType::Trust), 
             auth_validation_public_key_path: Set(None),
             remote_storage_config: Set(None),

From 862902f9e5846b7edef14b296557a926efec5264 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova
Date: Mon, 19 Sep 2022 14:38:51 +0300
Subject: [PATCH 46/90] Update readme and openapi spec

---
 pageserver/src/http/openapi_spec.yml | 3 +++
 test_runner/README.md                | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 1f2eba05ec..4e748207c8 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -307,6 +307,7 @@ paths:
       description: |
         Create a timeline. Returns new timeline id on success.\
         If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline.
+        If no pg_version is specified, the DEFAULT_PG_VERSION hardcoded in the pageserver is assumed.
       requestBody:
         content:
          application/json:
@@ -322,6 +323,8 @@ paths:
                   ancestor_start_lsn:
                     type: string
                     format: hex
+                  pg_version:
+                    type: integer
       responses:
         "201":
           description: TimelineInfo
diff --git a/test_runner/README.md b/test_runner/README.md
index 79b2418af6..d6ee5730ac 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -60,6 +60,12 @@ Useful environment variables:

 `NEON_BIN`: The directory where neon binaries can be found.
 `POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found.
+Since pageserver supports several postgres versions, `POSTGRES_DISTRIB_DIR` must contain
+a subdirectory for each version with the naming convention `v{PG_VERSION}/`.
+Inside that dir, a `bin/postgres` binary should be present.
+`DEFAULT_PG_VERSION`: The version of Postgres to use.
+This is used to construct the full path to the postgres binaries.
+The format is the 2-digit major version number, e.g. `DEFAULT_PG_VERSION="14"`.
 `TEST_OUTPUT`: Set the directory where test state and test output files should go.
 `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
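
For a client, the schema change above means `pg_version` is just one more optional field in the timeline-creation request body. An illustrative serde_json sketch (field names follow the OpenAPI schema; the surrounding HTTP plumbing is omitted):

```rust
use serde_json::{json, Value};

fn timeline_create_body(new_timeline_id: &str, pg_version: Option<u32>) -> Value {
    let mut body = json!({ "new_timeline_id": new_timeline_id });
    if let Some(version) = pg_version {
        // Omitting pg_version makes the pageserver fall back to its
        // hardcoded DEFAULT_PG_VERSION, per the spec text above.
        body["pg_version"] = json!(version);
    }
    body
}
```
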
From ed6b75e3018922f1110cb451de94e634d860e2ad Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 15:03:11 +0300 Subject: [PATCH 47/90] show pg_version in create_timeline info span --- pageserver/src/http/routes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 72cbb0e819..55429420a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -191,7 +191,7 @@ async fn timeline_create_handler(mut request: Request) -> Result Err(ApiError::InternalServerError(err)), } } - .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn)) + .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) .await?; Ok(match new_timeline_info { From 3618c242b9ffbf678f7e68472a5d256ad51cc538 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 15:14:01 +0300 Subject: [PATCH 48/90] use version specific find_end_of_wal function --- safekeeper/src/wal_storage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 44dc313ef6..9e198fc148 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -14,7 +14,7 @@ use std::pin::Pin; use tokio::io::AsyncRead; use postgres_ffi::v14::xlog_utils::{ - find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, + IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, }; use postgres_ffi::{XLogSegNo, PG_TLI}; use std::cmp::{max, min}; From d8d3cd49f4ad753443364574a31d84fc56557b46 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 15:31:05 +0300 Subject: [PATCH 49/90] Update libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs Co-authored-by: MMeent --- libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 9b9f76de7c..9563298cd8 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -37,7 +37,7 @@ fn main() -> Result<()> { Arg::new("pg-distrib-dir") .long("pg-distrib-dir") .takes_value(true) - .help("Directory with Postgres distribution (bin and lib directories, e.g. pg_install)") + .help("Directory with Postgres distributions (bin and lib directories, e.g. 
pg_install containing subpath `v14/bin/postgresql`)") .default_value("/usr/local") ) .arg( From eba419fda360bdc4a2025474b2afcd92d0ff369b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 18:15:34 +0300 Subject: [PATCH 50/90] Clean up the pg_version choice code --- libs/postgres_ffi/src/lib.rs | 13 ++++++------- pageserver/src/walingest.rs | 2 +- pageserver/src/walrecord.rs | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 1a6620a180..95ecc7b061 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -79,14 +79,13 @@ pub use v14::xlog_utils::XLogFileName; pub use v14::bindings::DBState_DB_SHUTDOWNED; -pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { - if version == 14 { - bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0 - } else { - assert_eq!(version, 15); - bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0 +pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result { + match version { + 14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0), + 15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0 || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0 - || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0 + || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0), + _ => anyhow::bail!("Unknown version {}", version), } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 1d5cab38b9..d3d2c6d9b2 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -324,7 +324,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version) + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? 
{ // Extract page image from FPI record let img_len = blk.bimg_len as usize; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 258e1a445f..38fb9a4247 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -527,7 +527,7 @@ pub fn decode_wal_record( record: Bytes, decoded: &mut DecodedWALRecord, pg_version: u32, -) -> Result<(), DeserializeError> { +) -> Result<()> { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; @@ -628,7 +628,7 @@ pub fn decode_wal_record( }; let blk_img_is_compressed = - postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version); + postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?; if blk_img_is_compressed { debug!("compressed block image , pg_version = {}", pg_version); From d098542ddeb1b01b5e05e299f4979ad1677f127a Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 18:46:20 +0300 Subject: [PATCH 51/90] Make test_timeline_size_metrics more stable: Compare size with Vanilla postgres size instead of hardcoded value --- test_runner/regress/test_timeline_size.py | 38 +++++++++++++++++++---- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 979d1a107f..3a482be5db 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -3,6 +3,7 @@ import random import re import time from contextlib import closing +from pathlib import Path import psycopg2.errors import psycopg2.extras @@ -11,7 +12,10 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient, + PgBin, + PortDistributor, Postgres, + VanillaPostgres, assert_timeline_local, wait_for_last_flush_lsn, ) @@ -327,7 +331,12 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): # The timeline logical and physical sizes are also exposed as prometheus metrics. # Test the metrics. -def test_timeline_size_metrics(neon_simple_env: NeonEnv): +def test_timeline_size_metrics( + neon_simple_env: NeonEnv, + test_output_dir: Path, + port_distributor: PortDistributor, + pg_version: str, +): env = neon_simple_env pageserver_http = env.pageserver.http_client() @@ -369,11 +378,28 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): assert matches tl_logical_size_metric = int(matches.group(1)) - # An empty database is around 8 MB. There at least 3 databases, 'postgres', - # 'template0', 'template1'. So the total size should be about 32 MB. This isn't - # very accurate and can change with different PostgreSQL versions, so allow a - # couple of MB of slack. - assert math.isclose(tl_logical_size_metric, 32 * 1024 * 1024, abs_tol=2 * 1024 * 1024) + pgdatadir = test_output_dir / "pgdata-vanilla" + pg_bin = PgBin(test_output_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: + vanilla_pg.configure([f"port={port}"]) + vanilla_pg.start() + + # Create database based on template0 because we can't connect to template0 + vanilla_pg.safe_psql("CREATE TABLE foo (t text)") + vanilla_pg.safe_psql( + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""" + ) + vanilla_size_sum = vanilla_pg.safe_psql( + "select sum(pg_database_size(oid)) from pg_database" + )[0][0] + + # Compare the size with Vanilla postgres. 
+ # Allow some slack, because the logical size metric includes some things like + # the SLRUs that are not included in pg_database_size(). + assert math.isclose(tl_logical_size_metric, vanilla_size_sum, abs_tol=2 * 1024 * 1024) # The sum of the sizes of all databases, as seen by pg_database_size(), should also # be close. Again allow some slack, the logical size metric includes some things like From 1fa7d6aebf4df5e55f4f4c98e9cdba507a7d2345 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 18:48:58 +0300 Subject: [PATCH 52/90] Use DEFAULT_PG_VERSION env in CI pytest --- .github/actions/run-python-test-set/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index fc3b1c9c37..bed0bc69dc 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -86,6 +86,7 @@ runs: # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} + export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 From 64f64d563777cf311624f7bbc23e06ab9a9b7b3d Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 10:02:43 +0300 Subject: [PATCH 53/90] Fix after rebase: bump vendor/postgres-v14 to match main --- vendor/postgres-v14 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 796770565f..19d948fd47 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 796770565ff668b585e80733b8d679961ad50e93 +Subproject commit 19d948fd47f45d83367062d9a54709cf2d9c8921 From 2d012f0d324a0c764e3956171c981fa2e0455464 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 12:37:13 +0300 Subject: [PATCH 54/90] Fix rebase conflicts in pageserver code --- pageserver/src/tenant.rs | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5860e13534..ed41641277 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -186,8 +186,15 @@ impl Tenant { bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.") } - let new_metadata = - TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn, pg_version,); + let new_metadata = TimelineMetadata::new( + Lsn(0), + None, + None, + Lsn(0), + initdb_lsn, + initdb_lsn, + pg_version, + ); let new_timeline = self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?; new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); @@ -207,6 +214,7 @@ impl Tenant { new_timeline_id: Option, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, + pg_version: u32, ) -> Result>> { let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); @@ -249,7 +257,7 @@ impl Tenant { self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? } - None => self.bootstrap_timeline(new_timeline_id)?, + None => self.bootstrap_timeline(new_timeline_id, pg_version)?, }; // Have added new timeline into the tenant, now its background tasks are needed. 
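
The plumbing above gives `create_timeline` two paths: branching from an ancestor, where the new timeline necessarily keeps the ancestor's Postgres version, and bootstrapping a fresh timeline, where the caller-supplied `pg_version` decides which `pg_install/v{N}` installation runs initdb. A condensed sketch of that decision with simplified stand-in types (the real code lives in pageserver/src/tenant.rs):

```rust
struct TimelineMetadata {
    pg_version: u32,
    // LSN fields elided for brevity.
}

enum CreateTimelineRequest {
    Branch { ancestor: TimelineMetadata },
    Bootstrap { pg_version: u32 },
}

fn effective_pg_version(request: &CreateTimelineRequest) -> u32 {
    match request {
        // A branch keeps running the same major version as its ancestor,
        // which is why the version is recorded in timeline metadata.
        CreateTimelineRequest::Branch { ancestor } => ancestor.pg_version,
        // A fresh timeline is initialized by running initdb from the
        // matching versioned Postgres installation.
        CreateTimelineRequest::Bootstrap { pg_version } => *pg_version,
    }
}
```
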
@@ -1001,7 +1009,11 @@ impl Tenant { /// - run initdb to init temporary instance and get bootstrap data /// - after initialization complete, remove the temp dir. - fn bootstrap_timeline(&self, timeline_id: TimelineId) -> Result> { + fn bootstrap_timeline( + &self, + timeline_id: TimelineId, + pg_version: u32, + ) -> Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // temporary directory for basebackup files for the given timeline. let initdb_path = path_with_suffix_extension( @@ -1012,7 +1024,7 @@ impl Tenant { ); // Init temporarily repo to get bootstrap data - run_initdb(self.conf, &initdb_path)?; + run_initdb(self.conf, &initdb_path, pg_version)?; let pgdata_path = initdb_path; let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); @@ -1021,7 +1033,7 @@ impl Tenant { // LSN, and any WAL after that. // Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. - let timeline = self.create_empty_timeline(timeline_id, lsn)?; + let timeline = self.create_empty_timeline(timeline_id, lsn, pg_version)?; import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; fail::fail_point!("before-checkpoint-new-timeline", |_| { @@ -1094,10 +1106,10 @@ impl Tenant { /// Create the cluster temporarily in 'initdbpath' directory inside the repository /// to get bootstrap data for timeline initialization. -fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { +fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path, pg_version: u32) -> Result<()> { info!("running initdb in {}... ", initdbpath.display()); - let initdb_path = conf.pg_bin_dir().join("initdb"); + let initdb_path = conf.pg_bin_dir(pg_version).join("initdb"); let initdb_output = Command::new(initdb_path) .args(&["-D", &initdbpath.to_string_lossy()]) .args(&["-U", &conf.superuser]) @@ -1107,8 +1119,8 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // so no need to fsync it .arg("--no-sync") .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) .stdout(Stdio::null()) .output() .context("failed to execute initdb")?; From 5e151192f5b4bc1df4162914426fd026193fae0c Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 12:43:11 +0300 Subject: [PATCH 55/90] Fix rebase conflicts in safekeeper code --- safekeeper/src/timeline.rs | 8 +++++++- safekeeper/src/wal_storage.rs | 21 +++++++++++++++------ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index ec29e13931..c16fc9f40c 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -24,12 +24,12 @@ use utils::{ pq_proto::ReplicationFeedback, }; -use crate::control_file; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, SafekeeperMemState, ServerInfo, }; use crate::send_wal::HotStandbyFeedback; +use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; use crate::wal_storage; @@ -103,6 +103,10 @@ impl SharedState { bail!(TimelineError::UninitializedWalSegSize(*ttid)); } + if state.server.pg_version == UNKNOWN_SERVER_VERSION { + 
bail!(TimelineError::UninitializedPgVersion(*ttid));
+        }
+
         // We don't want to write anything to disk, because we may have existing timeline there.
         // These functions should not change anything on disk.
         let control_store = control_file::FileStorage::create_new(ttid, conf, state)?;
@@ -270,6 +274,8 @@ pub enum TimelineError {
     AlreadyExists(TenantTimelineId),
     #[error("Timeline {0} is not initialized, wal_seg_size is zero")]
     UninitializedWalSegSize(TenantTimelineId),
+    #[error("Timeline {0} is not initialized, pg_version is unknown")]
+    UninitializedPgVersion(TenantTimelineId),
 }

 /// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline.
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 9e198fc148..95ad71bbbd 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -13,9 +13,7 @@ use std::io::{self, Seek, SeekFrom};
 use std::pin::Pin;
 use tokio::io::AsyncRead;

-use postgres_ffi::v14::xlog_utils::{
-    IsPartialXLogFileName, IsXLogFileName, XLogFromFileName,
-};
+use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName};
 use postgres_ffi::{XLogSegNo, PG_TLI};
 use std::cmp::{max, min};
@@ -29,7 +27,6 @@ use utils::{id::TenantTimelineId, lsn::Lsn};

 use crate::metrics::{time_io_closure, WalStorageMetrics};
 use crate::safekeeper::SafeKeeperState;
-use crate::safekeeper::UNKNOWN_SERVER_VERSION;
 use crate::wal_backup::read_object;
 use crate::SafeKeeperConf;
@@ -117,7 +114,19 @@ impl PhysicalStorage {
         let write_lsn = if state.commit_lsn == Lsn(0) {
             Lsn(0)
         } else {
-            find_end_of_wal(&timeline_dir, wal_seg_size, state.commit_lsn)?
+            match state.server.pg_version / 10000 {
+                14 => postgres_ffi::v14::xlog_utils::find_end_of_wal(
+                    &timeline_dir,
+                    wal_seg_size,
+                    state.commit_lsn,
+                )?,
+                15 => postgres_ffi::v15::xlog_utils::find_end_of_wal(
+                    &timeline_dir,
+                    wal_seg_size,
+                    state.commit_lsn,
+                )?,
+                _ => bail!("unsupported postgres version"),
+            }
         };

         // TODO: do we really know that write_lsn is fully flushed to disk?
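
The dispatch above keys on `state.server.pg_version / 10000` because the safekeeper control file stores the full server version in Postgres's PG_VERSION_NUM style (major * 10000 + minor, e.g. 140005 for 14.5; the tests in this series build it as `int(env.pg_version) * 10000`). Integer division recovers the major version:

```rust
fn pg_major_version(pg_version_num: u32) -> u32 {
    pg_version_num / 10000
}

#[test]
fn major_version_examples() {
    assert_eq!(pg_major_version(140005), 14); // Postgres 14.5
    assert_eq!(pg_major_version(150000), 15); // Postgres 15.0
}
```
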
@@ -140,7 +149,7 @@ impl PhysicalStorage { write_lsn, write_record_lsn: write_lsn, flush_record_lsn: flush_lsn, - decoder: WalStreamDecoder::new(write_lsn, UNKNOWN_SERVER_VERSION), + decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000), file: None, }) } From 262fa3be0911a5e8ed7c310012cb064e5e39f470 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 22 Sep 2022 17:07:08 +0300 Subject: [PATCH 56/90] pageserver pg proto: add missing auth checks (#2494) Fixes #1858 --- pageserver/src/page_service.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 368b4c8bee..758faa4d9a 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1023,6 +1023,9 @@ impl postgres_backend_async::Handler for PageServerHandler { let params = params_raw.split(' ').collect::>(); ensure!(params.len() == 1, "invalid param number for config command"); let tenant_id = TenantId::from_str(params[0])?; + + self.check_permission(Some(tenant_id))?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), @@ -1067,14 +1070,14 @@ impl postgres_backend_async::Handler for PageServerHandler { let caps = re .captures(query_string) .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?; - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = get_local_timeline(tenant_id, timeline_id)?; - let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; let timestamp_pg = to_pg_timestamp(timestamp); + self.check_permission(Some(tenant_id))?; + + let timeline = get_local_timeline(tenant_id, timeline_id)?; pgb.write_message(&BeMessage::RowDescription(&[RowDescriptor::text_col( b"lsn", )]))?; From 7138db927947515aae31cca0132a16e9d98469d4 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 15:48:35 +0300 Subject: [PATCH 57/90] Fix paths to postgres binaries in the deploy script --- .github/ansible/get_binaries.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index f44a1ca50a..f96cff247f 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -24,7 +24,8 @@ tar -xzf postgres_install.tar.gz -C neon_install docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ -docker cp ${ID}:/usr/local/bin/postgres neon_install/bin/ +docker cp ${ID}:/usr/local/v14/bin/postgres neon_install/bin/v14 +docker cp ${ID}:/usr/local/v15/bin/postgres neon_install/bin/v15 docker rm -vf ${ID} # store version to file (for ansible playbooks) and create binaries tarball From 8b42c184e77f1284902e60fe29c353b7d8322eb1 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 16:06:32 +0300 Subject: [PATCH 58/90] Update LD_LIBRARY_PATH in deploy scripts --- .github/ansible/deploy.yaml | 4 ++-- .github/ansible/systemd/pageserver.service | 2 +- .github/ansible/systemd/safekeeper.service | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index 6982445558..7409051574 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -58,7 +58,7 
@@ creates: "/storage/pageserver/data/tenants" environment: NEON_REPO_DIR: "/storage/pageserver/data" - LD_LIBRARY_PATH: "/usr/local/lib" + LD_LIBRARY_PATH: "/usr/local/v14/lib" become: true tags: - pageserver @@ -132,7 +132,7 @@ creates: "/storage/safekeeper/data/safekeeper.id" environment: NEON_REPO_DIR: "/storage/safekeeper/data" - LD_LIBRARY_PATH: "/usr/local/lib" + LD_LIBRARY_PATH: "/usr/local/v14/lib" become: true tags: - safekeeper diff --git a/.github/ansible/systemd/pageserver.service b/.github/ansible/systemd/pageserver.service index bb78054fa3..688c7e7b87 100644 --- a/.github/ansible/systemd/pageserver.service +++ b/.github/ansible/systemd/pageserver.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=pageserver -Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed diff --git a/.github/ansible/systemd/safekeeper.service b/.github/ansible/systemd/safekeeper.service index d5c6d00017..36af414761 100644 --- a/.github/ansible/systemd/safekeeper.service +++ b/.github/ansible/systemd/safekeeper.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=safekeeper -Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed From 7c1695e87d91f3ebac6c64ca699304c15568559d Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 16:11:46 +0300 Subject: [PATCH 59/90] fix psql path in export_import_between_pageservers script --- scripts/export_import_between_pageservers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 1285d0476b..6f6c3864dd 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -710,8 +710,8 @@ if __name__ == "__main__": "--psql-path", dest="psql_path", required=False, - default="/usr/local/bin/psql", - help="Path to the psql binary. Default: /usr/local/bin/psql", + default="/usr/local/v14/bin/psql", + help="Path to the psql binary. 
Default: /usr/local/v14/bin/psql",
     )
     parser.add_argument(
         "--only-import",

From eb9200abc82ba9634b9fdf229415df7dffb7a38b Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova
Date: Thu, 22 Sep 2022 17:11:52 +0300
Subject: [PATCH 60/90] Use version-specific path in pytest CI script

---
 .github/actions/run-python-test-set/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index bed0bc69dc..f3531004a1 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -127,7 +127,7 @@ runs:

       # Wake up the cluster if we use remote neon instance
       if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
-        ${POSTGRES_DISTRIB_DIR}/v14/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
+        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
       fi

       # Run the tests.

From c81ede8644ea8cdd71b102235f7cd2fffa2a53d2 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova
Date: Thu, 22 Sep 2022 20:51:31 +0300
Subject: [PATCH 61/90] Hotfix for safekeeper timelines with unknown pg_version. Assume DEFAULT_PG_VERSION = 14

---
 safekeeper/src/wal_storage.rs | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 95ad71bbbd..eee7c703f9 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -125,7 +125,17 @@ impl PhysicalStorage {
                     wal_seg_size,
                     state.commit_lsn,
                 )?,
-                _ => bail!("unsupported postgres version"),
+                pg_majorversion => {
+                    // This is a quick hack to work with old timelines that don't have
+                    // pg_version in the control file. We can remove it after this is fixed properly.
+                    const DEFAULT_PG_MAJOR_VERSION: u32 = 14;
+                    warn!("unknown postgres version {pg_majorversion}, assuming {DEFAULT_PG_MAJOR_VERSION}");
+                    postgres_ffi::v14::xlog_utils::find_end_of_wal(
+                        &timeline_dir,
+                        wal_seg_size,
+                        state.commit_lsn,
+                    )?
+                }
             }
         };

From 43560506c070ae1c557c9bdd847ea0497dde1923 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov
Date: Thu, 22 Sep 2022 17:23:02 +0300
Subject: [PATCH 62/90] remove duplicate walreceiver connection span

---
 pageserver/src/walreceiver/walreceiver_connection.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs
index 5ac9a3ef7a..15cfad1dcd 100644
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -16,7 +16,7 @@ use postgres_protocol::message::backend::ReplicationMessage;
 use postgres_types::PgLsn;
 use tokio::{pin, select, sync::watch, time};
 use tokio_postgres::{replication::ReplicationStream, Client};
-use tracing::{debug, error, info, info_span, trace, warn, Instrument};
+use tracing::{debug, error, info, trace, warn};

 use super::TaskEvent;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
@@ -112,8 +112,7 @@ pub async fn handle_walreceiver_connection(
         _ = connection_cancellation.changed() => info!("Connection cancelled"),
     }
     Ok(())
-    }
-    .instrument(info_span!("walreceiver connection")),
+    },
 );

 // Immediately increment the gauge, then create a job to decrement it on task exit.
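
The "increment the gauge, then decrement it on task exit" comment above describes the classic RAII-guard pattern. A hedged sketch with a plain atomic standing in for the real LIVE_CONNECTIONS_COUNT metric (this guard is illustrative, not the repository's API):

```rust
use std::sync::atomic::{AtomicI64, Ordering};

struct GaugeGuard<'a> {
    gauge: &'a AtomicI64,
}

impl<'a> GaugeGuard<'a> {
    fn acquire(gauge: &'a AtomicI64) -> Self {
        gauge.fetch_add(1, Ordering::SeqCst);
        GaugeGuard { gauge }
    }
}

impl Drop for GaugeGuard<'_> {
    fn drop(&mut self) {
        // Runs on every exit path of the owning task, including unwinding,
        // which is what makes the pattern robust against missed decrements.
        self.gauge.fetch_sub(1, Ordering::SeqCst);
    }
}
```
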
From b0377f750a798f99e71b640d3a07ae76d480435f Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Fri, 23 Sep 2022 10:25:26 +0200 Subject: [PATCH 63/90] Add staging-test region to normal staging rollouts (#2500) --- .github/ansible/staging.hosts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/ansible/staging.hosts b/.github/ansible/staging.hosts index c470f8a814..f5accc188a 100644 --- a/.github/ansible/staging.hosts +++ b/.github/ansible/staging.hosts @@ -3,11 +3,15 @@ zenith-us-stage-ps-2 console_region_id=27 zenith-us-stage-ps-3 console_region_id=27 zenith-us-stage-ps-4 console_region_id=27 +zenith-us-stage-test-ps-1 console_region_id=28 [safekeepers] zenith-us-stage-sk-4 console_region_id=27 zenith-us-stage-sk-5 console_region_id=27 zenith-us-stage-sk-6 console_region_id=27 +zenith-us-stage-test-sk-1 console_region_id=28 +zenith-us-stage-test-sk-2 console_region_id=28 +zenith-us-stage-test-sk-3 console_region_id=28 [storage:children] pageservers From 52819898e4c65bcc79206d4ff20af9f1f5f08396 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Fri, 23 Sep 2022 11:25:29 +0200 Subject: [PATCH 64/90] Extend image push step with production ECR (#2465) * Extend image push step with production ECR * Put copy step before auth change * Use correct name * Only push on main * Fix typo --- .github/workflows/build_and_test.yml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 44db968753..5f84e20452 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -588,7 +588,16 @@ jobs: - name: Pull rust image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust - - name: Configure docker login + - name: Push images to production ECR + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + run: | + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest + + - name: Configure Docker Hub login run: | # ECR Credential Helper & Docker Hub don't work together in config, hence reset echo "" > /github/home/.docker/config.json @@ -609,7 +618,7 @@ jobs: - name: Push rust image to Docker Hub run: crane push rust neondatabase/rust:pinned - - name: Add latest tag to images + - name: Add latest tag to images in Docker Hub if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' From eb0c6bcf1a1b4eed35ba2bb439b5e30905e753f9 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 22 Sep 2022 17:31:16 +0300 Subject: [PATCH 65/90] reenable storage deployments --- .github/ansible/deploy.yaml | 42 ++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index 7409051574..e206f9d5ba 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -63,18 +63,18 @@ tags: - pageserver - # - name: update remote storage (s3) config - # lineinfile: - # path: 
/storage/pageserver/data/pageserver.toml - # line: "{{ item }}" - # loop: - # - "[remote_storage]" - # - "bucket_name = '{{ bucket_name }}'" - # - "bucket_region = '{{ bucket_region }}'" - # - "prefix_in_bucket = '{{ inventory_hostname }}'" - # become: true - # tags: - # - pageserver + - name: update remote storage (s3) config + lineinfile: + path: /storage/pageserver/data/pageserver.toml + line: "{{ item }}" + loop: + - "[remote_storage]" + - "bucket_name = '{{ bucket_name }}'" + - "bucket_region = '{{ bucket_region }}'" + - "prefix_in_bucket = '{{ inventory_hostname }}'" + become: true + tags: + - pageserver - name: upload systemd service definition ansible.builtin.template: @@ -87,15 +87,15 @@ tags: - pageserver - # - name: start systemd service - # ansible.builtin.systemd: - # daemon_reload: yes - # name: pageserver - # enabled: yes - # state: restarted - # become: true - # tags: - # - pageserver + - name: start systemd service + ansible.builtin.systemd: + daemon_reload: yes + name: pageserver + enabled: yes + state: restarted + become: true + tags: + - pageserver - name: post version to console when: console_mgmt_base_url is defined From 3e65209a067d7243162d9bd84841425e088a0d9b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 23 Sep 2022 12:50:36 +0100 Subject: [PATCH 66/90] Nightly Benchmarks: use Postgres binaries from artifacts (#2501) --- .github/actions/download/action.yml | 9 ++++-- .../actions/run-python-test-set/action.yml | 2 +- .github/actions/upload/action.yml | 9 ++++-- .github/workflows/benchmarking.yml | 21 ++++++++++---- .github/workflows/build_and_test.yml | 29 +++++++++++++++++-- 5 files changed, 54 insertions(+), 16 deletions(-) diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index 5aa45164e7..731ef6639d 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -12,6 +12,9 @@ inputs: description: "Allow to skip if file doesn't exist, fail otherwise" default: false required: false + prefix: + description: "S3 prefix. 
Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" + required: false runs: using: "composite" @@ -23,18 +26,18 @@ runs: TARGET: ${{ inputs.path }} ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }} + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} run: | BUCKET=neon-github-public-dev - PREFIX=artifacts/${GITHUB_RUN_ID} FILENAME=$(basename $ARCHIVE) - S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) + S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) if [ -z "${S3_KEY}" ]; then if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then echo '::set-output name=SKIPPED::true' exit 0 else - echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} nor its version from previous attempts exist" + echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" exit 1 fi fi diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index f3531004a1..cc6ab65b76 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -127,7 +127,7 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/v{$DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" fi # Run the tests. diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index de8df3230f..291a2cf3b0 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -7,6 +7,9 @@ inputs: path: description: "A directory or file to upload" required: true + prefix: + description: "S3 prefix. 
Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" + required: false runs: using: "composite" @@ -42,14 +45,14 @@ runs: env: SOURCE: ${{ inputs.path }} ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} run: | BUCKET=neon-github-public-dev - PREFIX=artifacts/${GITHUB_RUN_ID} FILENAME=$(basename $ARCHIVE) FILESIZE=$(du -sh ${ARCHIVE} | cut -f1) - time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} + time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${FILENAME} # Ref https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#adding-a-job-summary - echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY} + echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 4e28223c18..4d91e9fa74 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -46,7 +46,8 @@ jobs: runs-on: [self-hosted, zenith-benchmarker] env: - POSTGRES_DISTRIB_DIR: "/usr/pgsql-14" + POSTGRES_DISTRIB_DIR: /tmp/pg_install + DEFAULT_PG_VERSION: 14 steps: - name: Checkout zenith repo @@ -71,7 +72,7 @@ jobs: echo Poetry poetry --version echo Pgbench - $POSTGRES_DISTRIB_DIR/bin/pgbench --version + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - name: Create Neon Project id: create-neon-project @@ -140,7 +141,8 @@ jobs: env: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: "10gb" - POSTGRES_DISTRIB_DIR: /usr + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} @@ -163,10 +165,17 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Install Deps + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Add Postgres binaries to PATH run: | - sudo apt -y update - sudo apt install -y postgresql-14 + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version + echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - name: Create Neon Project if: matrix.platform != 'neon-captest-reuse' diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5f84e20452..8a7cdec89c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -268,6 +268,32 @@ jobs: if: matrix.build_type == 'debug' uses: ./.github/actions/save-coverage-data + upload-latest-artifacts: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ regress-tests ] + if: github.ref_name == 'main' + steps: + - name: Copy Neon artifact to the latest directory + shell: bash -euxo pipefail {0} + env: + BUCKET: neon-github-public-dev + PREFIX: artifacts/${{ github.run_id }} + run: | + for build_type in debug release; do + FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst + + S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) + if [ 
-z "${S3_KEY}" ]; then
+              echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
+              exit 1
+            fi
+
+            time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME}
+          done
+
   benchmarks:
     runs-on: dev
     container:
@@ -335,9 +361,6 @@ jobs:
           curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json
           ./scripts/pysync

-          # Workaround for https://github.com/neondatabase/cloud/issues/2188
-          psql "$TEST_RESULT_CONNSTR" -c "SELECT 1;" || sleep 10
-
           DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json

   coverage-report:

From bc3ba23e0a485e3fc5434ea093062bc4347915f1 Mon Sep 17 00:00:00 2001
From: MMeent
Date: Fri, 23 Sep 2022 14:35:36 +0200
Subject: [PATCH 67/90] Fix extreme metrics bloat in storage sync (#2506)

* Fix extreme metrics bloat in storage sync

From 78 metrics per (timeline, tenant) pair down to (max) 10 metrics per
(timeline, tenant) pair, plus another 117 metrics in a global histogram
that replaces the previous per-timeline histogram.

* Drop image sync operation metric series when dropping TimelineMetrics.

---
 pageserver/src/metrics.rs      | 45 ++++++++++++++++++++++-----
 pageserver/src/storage_sync.rs | 56 +++++++++++++++++++---------------
 2 files changed, 69 insertions(+), 32 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 2f03943429..5c2f81d731 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,8 +1,9 @@
 use metrics::core::{AtomicU64, GenericCounter};
 use metrics::{
-    register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec,
-    register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec,
-    IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter,
+    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
+    GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge,
+    UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use utils::id::{TenantId, TimelineId};
@@ -204,12 +205,34 @@ pub static REMAINING_SYNC_ITEMS: Lazy<IntGauge> = Lazy::new(|| {
         .expect("failed to register pageserver remote storage remaining sync items int gauge")
 });

-pub static IMAGE_SYNC_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+pub static IMAGE_SYNC_TIME: Lazy<GaugeVec> = Lazy::new(|| {
+    register_gauge_vec!(
+        "pageserver_remote_storage_image_sync_duration",
+        "Time spent to synchronize (up/download) a whole pageserver image",
+        &["tenant_id", "timeline_id"],
+    )
+    .expect("failed to register per-timeline pageserver image sync time vec")
+});
+
+pub static IMAGE_SYNC_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"];
+pub static IMAGE_SYNC_STATUS: &[&str] = &["success", "failure", "abort"];
+
+pub static IMAGE_SYNC_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_remote_storage_image_sync_count",
+        "Number of synchronization operations executed for pageserver images. 
Grouped by tenant, timeline, operation_kind and status",
+        &["tenant_id", "timeline_id", "operation_kind", "status"]
+    )
+    .expect("failed to register pageserver image sync count vec")
+});
+
+pub static IMAGE_SYNC_TIME_HISTOGRAM: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_remote_storage_image_sync_seconds",
         "Time took to synchronize (download or upload) a whole pageserver image. \
-         Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)",
-        &["tenant_id", "timeline_id", "operation_kind", "status"],
+         Grouped by operation_kind and status",
+        &["operation_kind", "status"],
         vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
     )
     .expect("failed to register pageserver image sync time histogram vec")
@@ -256,7 +279,7 @@ macro_rules! redo_histogram_time_buckets {
     () => {
         vec![
             0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
-            0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000,
+            0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000,
         ]
     };
 }
@@ -411,6 +434,14 @@ impl Drop for TimelineMetrics {
         for op in SMGR_QUERY_TIME_OPERATIONS {
             let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
         }
+
+        for op in IMAGE_SYNC_OPERATION_KINDS {
+            for status in IMAGE_SYNC_STATUS {
+                let _ = IMAGE_SYNC_COUNT.remove_label_values(&[tenant_id, timeline_id, op, status]);
+            }
+        }
+
+        let _ = IMAGE_SYNC_TIME.remove_label_values(&[tenant_id, timeline_id]);
     }
 }

diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs
index 892a34a76f..776d9214d4 100644
--- a/pageserver/src/storage_sync.rs
+++ b/pageserver/src/storage_sync.rs
@@ -178,6 +178,7 @@ use crate::{
     TenantTimelineValues,
 };

+use crate::metrics::{IMAGE_SYNC_COUNT, IMAGE_SYNC_TIME_HISTOGRAM};
 use utils::id::{TenantId, TenantTimelineId, TimelineId};

 use self::download::download_index_parts;
@@ -835,7 +836,6 @@ async fn process_sync_task_batch(
                         sync_id,
                         upload_data,
                         sync_start,
-                        "upload",
                     )
                     .await
                 }
@@ -879,7 +879,6 @@ async fn process_sync_task_batch(
                         sync_id,
                         download_data,
                         sync_start,
-                        "download",
                     )
                     .await;
                 }
@@ -911,7 +910,6 @@ async fn process_sync_task_batch(
                 sync_id,
                 delete_data,
                 sync_start,
-                "delete",
             )
             .instrument(info_span!("delete_timeline_data"))
             .await;
@@ -948,8 +946,9 @@ async fn download_timeline_data(
     sync_id: TenantTimelineId,
     new_download_data: SyncData,
     sync_start: Instant,
-    task_name: &str,
 ) -> DownloadStatus {
+    static TASK_NAME: &str = "download";
+
     match download_timeline_layers(
         conf,
         storage,
@@ -961,19 +960,19 @@ async fn download_timeline_data(
     .await
     {
         DownloadedTimeline::Abort => {
-            register_sync_status(sync_id, sync_start, task_name, None);
+            register_sync_status(sync_id, sync_start, TASK_NAME, None);
             if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) {
                 error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}");
             }
         }
         DownloadedTimeline::FailedAndRescheduled => {
-            register_sync_status(sync_id, sync_start, task_name, Some(false));
+            register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
         }
         DownloadedTimeline::Successful(mut download_data) => {
             match update_local_metadata(conf, sync_id, current_remote_timeline).await {
                 Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
                     Ok(()) => {
-                        register_sync_status(sync_id, sync_start, task_name, Some(true));
+                        register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
                         return 
DownloadStatus::Downloaded; } Err(e) => { @@ -984,7 +983,7 @@ async fn download_timeline_data( error!("Failed to update local timeline metadata: {e:?}"); download_data.retries += 1; sync_queue.push(sync_id, SyncTask::Download(download_data)); - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); } } } @@ -1060,8 +1059,9 @@ async fn delete_timeline_data( sync_id: TenantTimelineId, mut new_delete_data: SyncData, sync_start: Instant, - task_name: &str, ) { + static TASK_NAME: &str = "delete"; + let timeline_delete = &mut new_delete_data.data; if !timeline_delete.deletion_registered { @@ -1077,14 +1077,14 @@ async fn delete_timeline_data( error!("Failed to update remote timeline {sync_id}: {e:?}"); new_delete_data.retries += 1; sync_queue.push(sync_id, SyncTask::Delete(new_delete_data)); - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); return; } } timeline_delete.deletion_registered = true; let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await; - register_sync_status(sync_id, sync_start, task_name, Some(sync_status)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(sync_status)); } async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result { @@ -1103,8 +1103,8 @@ async fn upload_timeline_data( sync_id: TenantTimelineId, new_upload_data: SyncData, sync_start: Instant, - task_name: &str, ) -> UploadStatus { + static TASK_NAME: &str = "upload"; let mut uploaded_data = match upload_timeline_layers( storage, sync_queue, @@ -1115,7 +1115,7 @@ async fn upload_timeline_data( .await { UploadedTimeline::FailedAndRescheduled(e) => { - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); return UploadStatus::Failed(e); } UploadedTimeline::Successful(upload_data) => upload_data, @@ -1134,14 +1134,14 @@ async fn upload_timeline_data( .await { Ok(()) => { - register_sync_status(sync_id, sync_start, task_name, Some(true)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(true)); UploadStatus::Uploaded } Err(e) => { error!("Failed to update remote timeline {sync_id}: {e:?}"); uploaded_data.retries += 1; sync_queue.push(sync_id, SyncTask::Upload(uploaded_data)); - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); UploadStatus::Failed(e) } } @@ -1391,16 +1391,22 @@ fn register_sync_status( let tenant_id = sync_id.tenant_id.to_string(); let timeline_id = sync_id.timeline_id.to_string(); - match sync_status { - Some(true) => { - IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "success"]) - } - Some(false) => { - IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "failure"]) - } - None => return, - } - .observe(secs_elapsed) + + let sync_status = match sync_status { + Some(true) => "success", + Some(false) => "failure", + None => "abort", + }; + + IMAGE_SYNC_TIME_HISTOGRAM + .with_label_values(&[sync_name, sync_status]) + .observe(secs_elapsed); + IMAGE_SYNC_TIME + .with_label_values(&[&tenant_id, &timeline_id]) + .add(secs_elapsed); + IMAGE_SYNC_COUNT + .with_label_values(&[&tenant_id, &timeline_id, sync_name, sync_status]) + .inc(); } #[cfg(test)] From ebab89ebd22fa77ff0cf6821ff22716642fe8a03 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 23 Sep 
2022 13:51:33 +0100 Subject: [PATCH 68/90] test_runner: pass password to pgbench via PGPASSWORD (#2468) --- test_runner/fixtures/log_helper.py | 13 -------- test_runner/fixtures/neon_fixtures.py | 5 +++ test_runner/performance/test_perf_pgbench.py | 34 +++++++++++++------- 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/test_runner/fixtures/log_helper.py b/test_runner/fixtures/log_helper.py index 7d112fce89..17f2402391 100644 --- a/test_runner/fixtures/log_helper.py +++ b/test_runner/fixtures/log_helper.py @@ -1,6 +1,5 @@ import logging import logging.config -import re """ This file configures logging to use in python tests. @@ -30,17 +29,6 @@ LOGGING = { } -class PasswordFilter(logging.Filter): - """Filter out password from logs.""" - - # Good enough to filter our passwords produced by PgProtocol.connstr - FILTER = re.compile(r"(\s*)password=[^\s]+(\s*)") - - def filter(self, record: logging.LogRecord) -> bool: - record.msg = self.FILTER.sub(r"\1password=\2", str(record.msg)) - return True - - def getLogger(name="root") -> logging.Logger: """Method to get logger for tests. @@ -50,6 +38,5 @@ def getLogger(name="root") -> logging.Logger: # default logger for tests log = getLogger() -log.addFilter(PasswordFilter()) logging.config.dictConfig(LOGGING) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3c60437426..aa9fd68df5 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -283,10 +283,15 @@ class PgProtocol: return str(make_dsn(**self.conn_options(**kwargs))) def conn_options(self, **kwargs): + """ + Construct a dictionary of connection options from default values and extra parameters. + An option can be dropped from the returning dictionary by None-valued extra parameter. + """ result = self.default_options.copy() if "dsn" in kwargs: result.update(parse_dsn(kwargs["dsn"])) result.update(kwargs) + result = {k: v for k, v in result.items() if v is not None} # Individual statement timeout in seconds. 
2 minutes should be # enough for our tests, but if you need a longer, you can diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index e167ddaafa..656826d6a3 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -4,7 +4,7 @@ import os import timeit from datetime import datetime from pathlib import Path -from typing import List +from typing import Dict, List import pytest from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult @@ -24,14 +24,18 @@ def utc_now_timestamp() -> int: return calendar.timegm(datetime.utcnow().utctimetuple()) -def init_pgbench(env: PgCompare, cmdline): +def init_pgbench(env: PgCompare, cmdline, password: None): + environ: Dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + # calculate timestamps and durations separately # timestamp is intended to be used for linking to grafana and logs # duration is actually a metric and uses float instead of int for timestamp start_timestamp = utc_now_timestamp() t0 = timeit.default_timer() with env.record_pageserver_writes("init.pageserver_writes"): - out = env.pg_bin.run_capture(cmdline) + out = env.pg_bin.run_capture(cmdline, env=environ) env.flush() duration = timeit.default_timer() - t0 @@ -48,13 +52,15 @@ def init_pgbench(env: PgCompare, cmdline): env.zenbenchmark.record_pg_bench_init_result("init", res) -def run_pgbench(env: PgCompare, prefix: str, cmdline): +def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None): + environ: Dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + with env.record_pageserver_writes(f"{prefix}.pageserver_writes"): run_start_timestamp = utc_now_timestamp() t0 = timeit.default_timer() - out = env.pg_bin.run_capture( - cmdline, - ) + out = env.pg_bin.run_capture(cmdline, env=environ) run_duration = timeit.default_timer() - t0 run_end_timestamp = utc_now_timestamp() env.flush() @@ -82,10 +88,14 @@ def run_pgbench(env: PgCompare, prefix: str, cmdline): def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: PgBenchLoadType): env.zenbenchmark.record("scale", scale, "", MetricReport.TEST_PARAM) + password = env.pg.default_options.get("password", None) + options = "-cstatement_timeout=1h " + env.pg.default_options.get("options", "") + # drop password from the connection string by passing password=None and set password separately + connstr = env.pg.connstr(password=None, options=options) + if workload_type == PgBenchLoadType.INIT: # Run initialize - options = "-cstatement_timeout=1h " + env.pg.default_options.get("options", "") - init_pgbench(env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options=options)]) + init_pgbench(env, ["pgbench", f"-s{scale}", "-i", connstr], password=password) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: # Run simple-update workload @@ -99,8 +109,9 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P f"-T{duration}", "-P2", "--progress-timestamp", - env.pg.connstr(), + connstr, ], + password=password, ) if workload_type == PgBenchLoadType.SELECT_ONLY: @@ -115,8 +126,9 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P f"-T{duration}", "-P2", "--progress-timestamp", - env.pg.connstr(), + connstr, ], + password=password, ) env.report_size() From 1dffba9de6a0e30e1cb63c9462c88c2f6587d2f0 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 23 Sep 2022 18:30:44 +0300 
Subject: [PATCH 69/90] Write more tests for the proxy... (#1918)

And change a few more things in the process.

---
 proxy/src/auth/backend/console.rs | 12 ++++++++
 proxy/src/auth/credentials.rs     | 11 +++-----
 proxy/src/cancellation.rs         | 46 +++++++++++++++++++++++++++++++
 proxy/src/parse.rs                | 28 ++++++++++++++++++-
 4 files changed, 89 insertions(+), 8 deletions(-)

diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs
index e5ee07813c..a351b82c6a 100644
--- a/proxy/src/auth/backend/console.rs
+++ b/proxy/src/auth/backend/console.rs
@@ -259,3 +259,15 @@ fn parse_host_port(input: &str) -> Option<(&str, u16)> {
     let (host, port) = input.split_once(':')?;
     Some((host, port.parse().ok()?))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_host_port() {
+        let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
+        assert_eq!(host, "127.0.0.1");
+        assert_eq!(port, 5432);
+    }
+}
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index ea71eba010..e43bcf8791 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -54,13 +54,10 @@ impl<'a> ClientCredentials<'a> {
         let dbname = get_param("database")?;

         // Project name might be passed via PG's command-line options.
-        let project_a = params.options_raw().and_then(|options| {
-            for opt in options {
-                if let Some(value) = opt.strip_prefix("project=") {
-                    return Some(Cow::Borrowed(value));
-                }
-            }
-            None
+        let project_a = params.options_raw().and_then(|mut options| {
+            options
+                .find_map(|opt| opt.strip_prefix("project="))
+                .map(Cow::Borrowed)
         });

         // Alternative project name is in fact a subdomain from SNI.
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index b7412b6f5b..92f8e35dab 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -52,6 +52,16 @@ impl CancelMap {
         let session = Session::new(key, self);
         f(session).await
     }
+
+    #[cfg(test)]
+    fn contains(&self, session: &Session) -> bool {
+        self.0.lock().contains_key(&session.key)
+    }
+
+    #[cfg(test)]
+    fn is_empty(&self) -> bool {
+        self.0.lock().is_empty()
+    }
 }

 /// This should've been a [`std::future::Future`], but
@@ -104,3 +114,39 @@ impl<'a> Session<'a> {
         self.key
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use once_cell::sync::Lazy;
+
+    #[tokio::test]
+    async fn check_session_drop() -> anyhow::Result<()> {
+        static CANCEL_MAP: Lazy<CancelMap> = Lazy::new(Default::default);
+
+        let (tx, rx) = tokio::sync::oneshot::channel();
+        let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move {
+            assert!(CANCEL_MAP.contains(&session));
+
+            tx.send(()).expect("failed to send");
+            let () = futures::future::pending().await; // sleep forever
+
+            Ok(())
+        }));
+
+        // Wait until the task has been spawned.
+        let () = rx.await.context("failed to hear from the task")?;
+
+        // Drop the session's entry by cancelling the task.
+        task.abort();
+        let error = task.await.expect_err("task should have failed");
+        if !error.is_cancelled() {
+            anyhow::bail!(error);
+        }
+
+        // Check that the session has been dropped.
+        assert!(CANCEL_MAP.is_empty());
+
+        Ok(())
+    }
+}
diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs
index 8a05ff9c82..cbd48d91e9 100644
--- a/proxy/src/parse.rs
+++ b/proxy/src/parse.rs
@@ -1,6 +1,5 @@
 //! Small parsing helpers.
-use std::convert::TryInto;
 use std::ffi::CStr;

 pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> {
@@ -10,9 +9,36 @@ pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> {
     Some((unsafe { CStr::from_bytes_with_nul_unchecked(cstr) }, other))
 }

+/// See .
 pub fn split_at_const<const N: usize>(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> {
     (bytes.len() >= N).then(|| {
         let (head, tail) = bytes.split_at(N);
         (head.try_into().unwrap(), tail)
     })
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_split_cstr() {
+        assert!(split_cstr(b"").is_none());
+        assert!(split_cstr(b"foo").is_none());
+
+        let (cstr, rest) = split_cstr(b"\0").expect("uh-oh");
+        assert_eq!(cstr.to_bytes(), b"");
+        assert_eq!(rest, b"");
+
+        let (cstr, rest) = split_cstr(b"foo\0bar").expect("uh-oh");
+        assert_eq!(cstr.to_bytes(), b"foo");
+        assert_eq!(rest, b"bar");
+    }
+
+    #[test]
+    fn test_split_at_const() {
+        assert!(split_at_const::<0>(b"").is_some());
+        assert!(split_at_const::<1>(b"").is_none());
+        assert!(matches!(split_at_const::<1>(b"ok"), Some((b"o", b"k"))));
+    }
+}

From 5ccd54c699a3953486ce200c6f8ad3a9e39b8eb0 Mon Sep 17 00:00:00 2001
From: Stas Kelvich
Date: Fri, 23 Sep 2022 13:08:05 +0300
Subject: [PATCH 70/90] Add support for h3-pg and re-enable plv8

---
 Dockerfile.compute-node-v14 | 50 ++++++++++++++++++++++++++++++-------
 1 file changed, 41 insertions(+), 9 deletions(-)

diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14
index 8ddf752191..f3773868d0 100644
--- a/Dockerfile.compute-node-v14
+++ b/Dockerfile.compute-node-v14
@@ -8,9 +8,12 @@ ARG TAG=pinned
 # Layer "build-deps"
 #
 FROM debian:bullseye-slim AS build-deps
+RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
+    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
+    apt update
 RUN apt update && \
     apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
-    libcurl4-openssl-dev libossp-uuid-dev
+    libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev

 #
 # Layer "pg-build"
@@ -37,7 +40,7 @@ RUN cd postgres && \
 FROM build-deps AS postgis-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
-    apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget
+    apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc

 RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
     tar xvzf postgis-3.3.0.tar.gz && \
@@ -59,15 +62,13 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
 # Build plv8
 #
 FROM build-deps AS plv8-build
-COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
-    apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev
+    apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5

 # https://github.com/plv8/plv8/issues/475
 # Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
-RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
-    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
-    apt update && \
+RUN apt update && \
     apt install -y --no-install-recommends -t testing binutils

 RUN wget
https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +# +# Layer "h3-pg-build" +# Build h3_pg +# +FROM build-deps AS h3-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +# packaged cmake is too old +RUN apt update && \ + apt install -y --no-install-recommends -t testing cmake + +RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ + tar xvzf h3.tgz && \ + cd h3-4.0.1 && \ + mkdir build && \ + cd build && \ + cmake .. -DCMAKE_BUILD_TYPE=Release && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/h3 make install && \ + cp -R /h3/usr / && \ + rm -rf build + +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ + tar xvzf h3-pg.tgz && \ + cd h3-pg-4.0.1 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + # # Layer "neon-pg-ext-build" # compile neon extensions # FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /h3/usr / COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -132,8 +166,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ chmod 0750 /var/db/postgres/compute && \ echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig -# TODO: Check if we can make the extension setup more modular versus a linear build -# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc# COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl From 805bb198c287a5a1ac3e28627165313335c69cc9 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Fri, 23 Sep 2022 11:49:28 -0700 Subject: [PATCH 71/90] Miscellaneous small fixups (#2503) Changes are: * Correct typo "firts" -> "first" * Change to * Fix weird indentation that rustfmt was failing to handle * Use existing `anyhow::{anyhow,bail}!` as `{anyhow,bail}!` if it's already in scope * Spell `Result` as `anyhow::Result` * In general, closer to matching the rest of the codebase * Change usages of `hash_map::Entry` to `Entry` when it's already in scope * A quick search shows our style on this one varies across the files it's used in --- pageserver/src/tenant.rs | 23 +++++++++++------------ pageserver/src/tenant/timeline.rs | 8 +++++--- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ed41641277..c9ad3bf232 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -17,7 +17,6 @@ use tracing::*; use utils::crashsafe_dir::path_with_suffix_extension; use std::cmp::min; -use std::collections::hash_map; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; @@ -246,12 +245,12 @@ impl Tenant { let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); if ancestor_ancestor_lsn > *lsn { // can we safely just branch from the ancestor instead? 
-                anyhow::bail!(
-                    "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
-                    lsn,
-                    ancestor_timeline_id,
-                    ancestor_ancestor_lsn,
-                );
+                bail!(
+                    "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
+                    lsn,
+                    ancestor_timeline_id,
+                    ancestor_ancestor_lsn,
+                );
             }
         }

@@ -406,11 +405,11 @@ impl Tenant {
             .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?;

         match timelines_accessor.entry(timeline.timeline_id) {
-            hash_map::Entry::Occupied(_) => anyhow::bail!(
+            Entry::Occupied(_) => bail!(
                 "Found freshly initialized timeline {} in the tenant map",
                 timeline.timeline_id
             ),
-            hash_map::Entry::Vacant(v) => {
+            Entry::Vacant(v) => {
                 v.insert(timeline);
             }
         }
@@ -768,7 +767,7 @@ impl Tenant {
             })
             .with_context(|| {
                 format!(
-                    "Failed to fsync on firts save for config {}",
+                    "Failed to fsync on first save for config {}",
                     target_config_path.display()
                 )
             })?;
@@ -1091,11 +1090,11 @@ impl Tenant {
         })?;

         match timelines.entry(new_timeline_id) {
-            hash_map::Entry::Occupied(_) => anyhow::bail!(
+            Entry::Occupied(_) => bail!(
                 "Found freshly initialized timeline {} in the tenant map",
                 new_timeline_id
             ),
-            hash_map::Entry::Vacant(v) => {
+            Entry::Vacant(v) => {
                 v.insert(Arc::clone(&new_timeline));
             }
         }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 019de81d64..74e873e632 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -343,7 +343,9 @@ impl Timeline {
         match cached_lsn.cmp(&lsn) {
             Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
             Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
-            Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn
+            Ordering::Greater => {
+                unreachable!("the returned lsn should never be after the requested lsn")
+            }
         }
         Some((cached_lsn, cached_img))
     }
@@ -726,10 +728,10 @@ impl Timeline {
         Ok(())
     }

-    pub fn layer_removal_guard(&self) -> Result<MutexGuard<()>, anyhow::Error> {
+    pub fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
         self.layer_removal_cs
             .try_lock()
             .map_err(|e| anyhow!("cannot lock compaction critical section {e}"))
     }

     /// Retrieve current logical size of the timeline.
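An aside on the `Entry` idiom that PATCH 71 above settles on: matching on `HashMap::entry` performs a single lookup and makes the occupied case an explicit, checkable branch, which is why the "freshly initialized timeline" checks read the way they do. Below is a minimal, self-contained sketch of the same shape; the map contents and error strings are made up for illustration:

use std::collections::hash_map::Entry;
use std::collections::HashMap;

// Insert a value under `id`, refusing to overwrite an existing entry.
fn insert_fresh(map: &mut HashMap<u32, String>, id: u32, value: String) -> Result<(), String> {
    match map.entry(id) {
        Entry::Occupied(_) => Err(format!("entry {id} already present")),
        Entry::Vacant(v) => {
            v.insert(value);
            Ok(())
        }
    }
}

fn main() {
    let mut map = HashMap::new();
    assert!(insert_fresh(&mut map, 1, "first".to_owned()).is_ok());
    assert!(insert_fresh(&mut map, 1, "second".to_owned()).is_err());
}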
From 093264a69523c5f8f007b35cf26be4e0b11c1de9 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 23 Sep 2022 19:59:27 +0300 Subject: [PATCH 72/90] Fix deploy bin and lib paths for postgres --- .github/ansible/get_binaries.sh | 4 ++-- Dockerfile | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index f96cff247f..dbbd5b454a 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -24,8 +24,8 @@ tar -xzf postgres_install.tar.gz -C neon_install docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ -docker cp ${ID}:/usr/local/v14/bin/postgres neon_install/bin/v14 -docker cp ${ID}:/usr/local/v15/bin/postgres neon_install/bin/v15 +docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/ +docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/ docker rm -vf ${ID} # store version to file (for ansible playbooks) and create binaries tarball diff --git a/Dockerfile b/Dockerfile index 876a20cc1a..69402919ec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,9 +19,8 @@ COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh ENV BUILD_TYPE release RUN set -e \ && mold -run make -j $(nproc) -s neon-pg-ext \ - && rm -rf pg_install/v14/build \ - && rm -rf pg_install/v15/build \ - && tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz . + && rm -rf pg_install/build \ + && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz . # Build neon binaries FROM $REPOSITORY/$IMAGE:$TAG AS build From 1165686201db64f5c58dbfcb791462f85a513352 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 23 Sep 2022 20:13:58 +0300 Subject: [PATCH 73/90] fix deploy lib paths for postgres --- .github/ansible/get_binaries.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index dbbd5b454a..b2f1fb38e6 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -26,6 +26,8 @@ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/ docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/ +docker cp ${ID}:/usr/local/v14/lib/ neon_install/v14/lib/ +docker cp ${ID}:/usr/local/v15/lib/ neon_install/v15/lib/ docker rm -vf ${ID} # store version to file (for ansible playbooks) and create binaries tarball From 367cc012903a7dc60d061a17ab61227f97598120 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 26 Sep 2022 10:07:18 +0300 Subject: [PATCH 74/90] Fix deploy paths --- .github/ansible/get_binaries.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index b2f1fb38e6..a484bfb0a0 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -21,6 +21,7 @@ docker pull --quiet neondatabase/neon:${DOCKER_TAG} ID=$(docker create neondatabase/neon:${DOCKER_TAG}) docker cp ${ID}:/data/postgres_install.tar.gz . 
tar -xzf postgres_install.tar.gz -C neon_install +mkdir neon_install/bin/ docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ From df45c0d0e57477768097c13c2c3299e634f963b8 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 26 Sep 2022 12:16:52 +0300 Subject: [PATCH 75/90] Disable plv8 again --- Dockerfile.compute-node-v14 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index f3773868d0..ed57b29009 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -116,7 +116,8 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 # FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +# plv8 still sometimes crashes during the creation +# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / COPY pgxn/ pgxn/ From d15116f2cc4b26ad36f9cf28c5cf9f9343269cc3 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 23 Sep 2022 14:36:08 +0000 Subject: [PATCH 76/90] Update pg_version for old timelines --- safekeeper/src/control_file_upgrade.rs | 12 ++++++++++++ safekeeper/src/safekeeper.rs | 3 +-- safekeeper/src/timeline.rs | 2 ++ safekeeper/src/wal_storage.rs | 16 +++++----------- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index d8434efb20..1ce9186085 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -248,6 +248,18 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result oldstate.timeline_start_lsn = Lsn(1); oldstate.local_start_lsn = Lsn(1); + return Ok(oldstate); + } else if version == 6 { + info!("reading safekeeper control file version {}", version); + let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?; + if oldstate.server.pg_version != 0 { + return Ok(oldstate); + } + + // set pg_version to the default v14 + info!("setting pg_version to 140005"); + oldstate.server.pg_version = 140005; + return Ok(oldstate); } bail!("unsupported safekeeper control file version {}", version) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index eec24faf2f..7869aa8b3a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -25,7 +25,7 @@ use utils::{ }; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 6; +pub const SK_FORMAT_VERSION: u32 = 7; const SK_PROTOCOL_VERSION: u32 = 2; pub const UNKNOWN_SERVER_VERSION: u32 = 0; @@ -639,7 +639,6 @@ where let mut state = self.state.clone(); state.server.system_id = msg.system_id; - state.server.wal_seg_size = msg.wal_seg_size; if msg.pg_version != UNKNOWN_SERVER_VERSION { state.server.pg_version = msg.pg_version; } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index c16fc9f40c..dc7503af65 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -314,6 +314,8 @@ impl Timeline { ttid: TenantTimelineId, wal_backup_launcher_tx: Sender, ) -> Result { + let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); + let shared_state = SharedState::restore(&conf, &ttid)?; let (commit_lsn_watch_tx, commit_lsn_watch_rx) = 
watch::channel(shared_state.sk.state.commit_lsn);
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index eee7c703f9..8fbd479d95 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -111,6 +111,10 @@ impl PhysicalStorage {

         // Find out where stored WAL ends, starting at commit_lsn which is a
         // known recent record boundary (unless we don't have WAL at all).
+        //
+        // NB: find_end_of_wal MUST be backwards compatible with the previously
+        // written WAL. If find_end_of_wal fails to read any WAL written by an
+        // older version of the code, we could lose data forever.
         let write_lsn = if state.commit_lsn == Lsn(0) {
             Lsn(0)
         } else {
@@ -125,17 +129,7 @@ impl PhysicalStorage {
                 wal_seg_size,
                 state.commit_lsn,
             )?,
-            pg_majorversion => {
-                // This is a quick hack to work with old timelines that don't have
-                // pg_version in the control file. We can remove it after this is fixed properly.
-                const DEFAULT_PG_MAJOR_VERSION: u32 = 14;
-                warn!("unknown postgres version {pg_majorversion} assume {DEFAULT_PG_MAJOR_VERSION}");
-                postgres_ffi::v14::xlog_utils::find_end_of_wal(
-                    &timeline_dir,
-                    wal_seg_size,
-                    state.commit_lsn,
-                )?
-            }
+            _ => bail!("unsupported postgres version: {}", state.server.pg_version),
         }
     };

From fb68d01449edb4be9a0d064d69a442dd3688783e Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov
Date: Mon, 26 Sep 2022 23:57:02 +0300
Subject: [PATCH 77/90] Preserve task result in TaskHandle by keeping join handle around (#2521)

* Preserve task result in TaskHandle by keeping join handle around

The solution is not great, but it should help to debug the staging issue.
I tried to do it in the least destructive way.

TaskHandle is used only in one place, so it is OK to use something less
generic unless we want to extend its usage across the codebase. In its
current form, for its single usage place, it looks too abstract.

Some problems around this code:

1. A task can drop its event sender and continue running.
2. A task cannot be joined several times (probably not needed, but it can
   still be surprising).
3. Had to split the task event into two types because anyhow::Error does
   not implement Clone. So TaskStateUpdate derives Clone but the plain
   TaskEvent does not. The Clone requirement appears because we clone the
   current value in next_task_event; taking it by reference is complicated.
4. The split between Init and Started is artificial and comes from the
   watch::channel requirement to have some initial value.

To summarize 3 and 4: it may be a better idea to use an RwLock or a
bounded channel instead.

---
 pageserver/src/walreceiver.rs                 | 76 ++++++++++++++-----
 .../src/walreceiver/connection_manager.rs     | 43 ++++++-----
 .../src/walreceiver/walreceiver_connection.rs | 16 ++--
 3 files changed, 89 insertions(+), 46 deletions(-)

diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs
index deac299747..c7de24080a 100644
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -31,7 +31,6 @@ use etcd_broker::Client;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use std::future::Future;
-use std::sync::Arc;
 use tokio::sync::watch;
 use tracing::*;
 use url::Url;
@@ -88,37 +87,44 @@ pub fn is_etcd_client_initialized() -> bool {
 /// That may lead to certain events not being observed by the listener.
#[derive(Debug)]
 pub struct TaskHandle<E> {
-    events_receiver: watch::Receiver<TaskEvent<E>>,
+    join_handle: Option<tokio::task::JoinHandle<anyhow::Result<()>>>,
+    events_receiver: watch::Receiver<TaskStateUpdate<E>>,
     cancellation: watch::Sender<()>,
 }

-#[derive(Debug, Clone)]
 pub enum TaskEvent<E> {
+    Update(TaskStateUpdate<E>),
+    End(anyhow::Result<()>),
+}
+
+#[derive(Debug, Clone)]
+pub enum TaskStateUpdate<E> {
+    Init,
     Started,
-    NewEvent(E),
-    End,
+    Progress(E),
 }

 impl<E: Clone> TaskHandle<E> {
     /// Initializes the task, starting it immediately after the creation.
     pub fn spawn<Fut>(
-        task: impl FnOnce(Arc<watch::Sender<TaskEvent<E>>>, watch::Receiver<()>) -> Fut + Send + 'static,
+        task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, watch::Receiver<()>) -> Fut
+            + Send
+            + 'static,
     ) -> Self
     where
-        Fut: Future<Output = Result<(), String>> + Send,
-        E: Sync + Send + 'static,
+        Fut: Future<Output = anyhow::Result<()>> + Send,
+        E: Send + Sync + 'static,
     {
         let (cancellation, cancellation_receiver) = watch::channel(());
-        let (events_sender, events_receiver) = watch::channel(TaskEvent::Started);
-        let events_sender = Arc::new(events_sender);
+        let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);

-        let sender = Arc::clone(&events_sender);
-        let _ = WALRECEIVER_RUNTIME.spawn(async move {
-            events_sender.send(TaskEvent::Started).ok();
-            task(sender, cancellation_receiver).await
+        let join_handle = WALRECEIVER_RUNTIME.spawn(async move {
+            events_sender.send(TaskStateUpdate::Started).ok();
+            task(events_sender, cancellation_receiver).await
         });

         TaskHandle {
+            join_handle: Some(join_handle),
             events_receiver,
             cancellation,
         }
@@ -126,15 +132,45 @@ impl<E: Clone> TaskHandle<E> {

     async fn next_task_event(&mut self) -> TaskEvent<E> {
         match self.events_receiver.changed().await {
-            Ok(()) => self.events_receiver.borrow().clone(),
-            Err(_task_channel_part_dropped) => TaskEvent::End,
+            Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
+            Err(_task_channel_part_dropped) => {
+                TaskEvent::End(match self.join_handle.take() {
+                    Some(jh) => {
+                        if !jh.is_finished() {
+                            warn!("sender is dropped while join handle is still alive");
+                        }
+
+                        jh.await
+                            .map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
+                            .and_then(|x| x)
+                    }
+                    None => {
+                        // Another option is to have an enum, join handle or result and give away the reference to it
+                        Err(anyhow::anyhow!("Task was joined more than once"))
+                    }
+                })
+            }
         }
     }

     /// Aborts current task, waiting for it to finish.
- pub async fn shutdown(mut self) { - self.cancellation.send(()).ok(); - // wait until the sender is dropped - while self.events_receiver.changed().await.is_ok() {} + pub async fn shutdown(self) { + match self.join_handle { + Some(jh) => { + self.cancellation.send(()).ok(); + match jh.await { + Ok(Ok(())) => debug!("Shutdown success"), + Ok(Err(e)) => error!("Shutdown task error: {e:?}"), + Err(join_error) => { + if join_error.is_cancelled() { + error!("Shutdown task was cancelled"); + } else { + error!("Shutdown task join error: {join_error}") + } + } + } + } + None => {} + } } } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index a82e69e5ba..29179e9871 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -16,10 +16,10 @@ use std::{ time::Duration, }; -use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::task_mgr::WALRECEIVER_RUNTIME; use crate::tenant::Timeline; +use crate::{task_mgr, walreceiver::TaskStateUpdate}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -145,19 +145,26 @@ async fn connection_manager_loop_step( let wal_connection = walreceiver_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! guard"); match wal_connection_update { - TaskEvent::Started => {}, - TaskEvent::NewEvent(status) => { - if status.has_processed_wal { - // We have advanced last_record_lsn by processing the WAL received - // from this safekeeper. This is good enough to clean unsuccessful - // retries history and allow reconnecting to this safekeeper without - // sleeping for a long time. - walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); + TaskEvent::Update(c) => { + match c { + TaskStateUpdate::Init | TaskStateUpdate::Started => {}, + TaskStateUpdate::Progress(status) => { + if status.has_processed_wal { + // We have advanced last_record_lsn by processing the WAL received + // from this safekeeper. This is good enough to clean unsuccessful + // retries history and allow reconnecting to this safekeeper without + // sleeping for a long time. 
+ walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); + } + wal_connection.status = status.to_owned(); + } } - wal_connection.status = status; }, - TaskEvent::End => { - debug!("WAL receiving task finished"); + TaskEvent::End(walreceiver_task_result) => { + match walreceiver_task_result { + Ok(()) => debug!("WAL receiving task finished"), + Err(e) => error!("wal receiver task finished with an error: {e:?}"), + } walreceiver_state.drop_old_connection(false).await; }, } @@ -363,13 +370,13 @@ impl WalreceiverState { async move { super::walreceiver_connection::handle_walreceiver_connection( timeline, - &new_wal_source_connstr, - events_sender.as_ref(), + new_wal_source_connstr, + events_sender, cancellation, connect_timeout, ) .await - .map_err(|e| format!("walreceiver connection handling failure: {e:#}")) + .context("walreceiver connection handling failure") } .instrument(info_span!("walreceiver_connection", id = %id)) }); @@ -885,7 +892,7 @@ mod tests { status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status.clone())) .ok(); Ok(()) }), @@ -1145,7 +1152,7 @@ mod tests { status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status.clone())) .ok(); Ok(()) }), @@ -1233,7 +1240,7 @@ mod tests { status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status.clone())) .ok(); Ok(()) }), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 15cfad1dcd..ef5baeb570 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -18,8 +18,7 @@ use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tracing::{debug, error, info, trace, warn}; -use super::TaskEvent; -use crate::metrics::LIVE_CONNECTIONS_COUNT; +use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; use crate::{ task_mgr, task_mgr::TaskKind, @@ -55,8 +54,8 @@ pub struct WalConnectionStatus { /// messages as we go. 
pub async fn handle_walreceiver_connection(
     timeline: Arc<Timeline>,
-    wal_source_connstr: &str,
-    events_sender: &watch::Sender<TaskEvent<WalConnectionStatus>>,
+    wal_source_connstr: String,
+    events_sender: watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
     mut cancellation: watch::Receiver<()>,
     connect_timeout: Duration,
 ) -> anyhow::Result<()> {
@@ -81,7 +80,7 @@ pub async fn handle_walreceiver_connection(
         streaming_lsn: None,
         commit_lsn: None,
     };
-    if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
+    if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
         warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}");
         return Ok(());
     }
@@ -133,7 +132,7 @@ pub async fn handle_walreceiver_connection(
     connection_status.latest_connection_update = Utc::now().naive_utc();
     connection_status.latest_wal_update = Utc::now().naive_utc();
     connection_status.commit_lsn = Some(end_of_wal);
-    if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
+    if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
         warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}");
         return Ok(());
     }
@@ -201,7 +200,7 @@ pub async fn handle_walreceiver_connection(
                 }
                 &_ => {}
             };
-            if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
+            if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
                 warn!("Wal connection event listener dropped, aborting the connection: {e}");
                 return Ok(());
             }
@@ -267,7 +266,8 @@ pub async fn handle_walreceiver_connection(
             if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg {
                 // We have successfully processed at least one WAL record.
connection_status.has_processed_wal = true; - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) + { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } From 2233ca2a391e25699b459c76669b7cb5a1396b5f Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Thu, 22 Sep 2022 12:46:20 +0200 Subject: [PATCH 78/90] seqwait.rs unit tests don't check return value --- libs/utils/src/seqwait.rs | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 467b900a13..bf330a482c 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -240,7 +240,6 @@ where mod tests { use super::*; use std::sync::Arc; - use std::thread::sleep; use std::time::Duration; impl MonotonicCounter for i32 { @@ -258,17 +257,19 @@ mod tests { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); let seq3 = Arc::clone(&seq); - tokio::task::spawn(async move { + let jh1 = tokio::task::spawn(async move { seq2.wait_for(42).await.expect("wait_for 42"); let old = seq2.advance(100); assert_eq!(old, 99); - seq2.wait_for(999).await.expect_err("no 999"); + seq2.wait_for_timeout(999, Duration::from_millis(100)) + .await + .expect_err("no 999"); }); - tokio::task::spawn(async move { + let jh2 = tokio::task::spawn(async move { seq3.wait_for(42).await.expect("wait_for 42"); seq3.wait_for(0).await.expect("wait_for 0"); }); - sleep(Duration::from_secs(1)); + tokio::time::sleep(Duration::from_millis(200)).await; let old = seq.advance(99); assert_eq!(old, 0); seq.wait_for(100).await.expect("wait_for 100"); @@ -277,6 +278,9 @@ mod tests { assert_eq!(seq.advance(98), 100); assert_eq!(seq.load(), 100); + jh1.await.unwrap(); + jh2.await.unwrap(); + seq.shutdown(); } @@ -284,15 +288,18 @@ mod tests { async fn seqwait_timeout() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); - tokio::task::spawn(async move { + let jh = tokio::task::spawn(async move { let timeout = Duration::from_millis(1); let res = seq2.wait_for_timeout(42, timeout).await; assert_eq!(res, Err(SeqWaitError::Timeout)); }); - tokio::time::sleep(Duration::from_secs(1)).await; + tokio::time::sleep(Duration::from_millis(200)).await; // This will attempt to wake, but nothing will happen // because the waiter already dropped its Receiver. 
let old = seq.advance(99); - assert_eq!(old, 0) + assert_eq!(old, 0); + jh.await.unwrap(); + + seq.shutdown(); } } From fc7087b16f79a3c0c04f8ea8c6fdc2cd74472f81 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 27 Sep 2022 10:57:59 +0200 Subject: [PATCH 79/90] Add metric for loaded safekeeper timelines (#2509) --- safekeeper/src/metrics.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 51138df776..095d80623a 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -2,7 +2,7 @@ use std::time::{Instant, SystemTime}; -use ::metrics::{register_histogram, GaugeVec, Histogram, DISK_WRITE_SECONDS_BUCKETS}; +use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; use anyhow::Result; use metrics::{ core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts}, @@ -135,6 +135,7 @@ pub struct TimelineCollector { written_wal_seconds: GaugeVec, flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, + timelines_count: IntGauge, } impl Default for TimelineCollector { @@ -311,6 +312,13 @@ impl TimelineCollector { .unwrap(); descs.extend(collect_timeline_metrics.desc().into_iter().cloned()); + let timelines_count = IntGauge::new( + "safekeeper_timelines", + "Total number of timelines loaded in-memory", + ) + .unwrap(); + descs.extend(timelines_count.desc().into_iter().cloned()); + TimelineCollector { descs, commit_lsn, @@ -330,6 +338,7 @@ impl TimelineCollector { written_wal_seconds, flushed_wal_seconds, collect_timeline_metrics, + timelines_count, } } } @@ -361,6 +370,7 @@ impl Collector for TimelineCollector { self.flushed_wal_seconds.reset(); let timelines = GlobalTimelines::get_all(); + let timelines_count = timelines.len(); for arc_tli in timelines { let tli = arc_tli.info_for_metrics(); @@ -474,6 +484,10 @@ impl Collector for TimelineCollector { self.collect_timeline_metrics.set(elapsed); mfs.extend(self.collect_timeline_metrics.collect()); + // report total number of timelines + self.timelines_count.set(timelines_count as i64); + mfs.extend(self.timelines_count.collect()); + mfs } } From dabb6d2675717dad380805434e1984a7d0a73f96 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 27 Sep 2022 12:36:17 +0200 Subject: [PATCH 80/90] Fix log level for sk startup logs (#2526) --- libs/postgres_ffi/src/xlog_utils.rs | 6 +++--- safekeeper/src/wal_storage.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 2c16cc9cd9..fbd8468a93 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -170,7 +170,7 @@ pub fn find_end_of_wal( let mut curr_lsn = start_lsn; let mut buf = [0u8; XLOG_BLCKSZ]; let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); - info!("find_end_of_wal PG_VERSION: {}", pg_version); + debug!("find_end_of_wal PG_VERSION: {}", pg_version); let mut decoder = WalStreamDecoder::new(start_lsn, pg_version); @@ -182,7 +182,7 @@ pub fn find_end_of_wal( match open_wal_segment(&seg_file_path)? 
{ None => { // no more segments - info!( + debug!( "find_end_of_wal reached end at {:?}, segment {:?} doesn't exist", result, seg_file_path ); @@ -205,7 +205,7 @@ pub fn find_end_of_wal( match decoder.poll_decode() { Ok(Some(record)) => result = record.0, Err(e) => { - info!( + debug!( "find_end_of_wal reached end at {:?}, decode error: {:?}", result, e ); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 8fbd479d95..bc5e2d7b24 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -137,7 +137,7 @@ impl PhysicalStorage { // If not, maybe it's better to call fsync() here to be sure? let flush_lsn = write_lsn; - info!( + debug!( "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, ); From 7b2f9dc9080821985525fd81fd33e10967062fb1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 3 Oct 2022 13:33:55 +0300 Subject: [PATCH 81/90] Reuse existing tenants during attach (#2540) --- pageserver/src/storage_sync.rs | 1 + pageserver/src/tenant.rs | 46 ++++----- pageserver/src/tenant_mgr.rs | 27 +++--- .../test_tenants_with_remote_storage.py | 96 +++++++++++++++++++ 4 files changed, 136 insertions(+), 34 deletions(-) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 776d9214d4..bee460d173 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -639,6 +639,7 @@ pub fn spawn_storage_sync_task( (storage, remote_index_clone, sync_queue), max_sync_errors, ) + .instrument(info_span!("storage_sync_loop")) .await; Ok(()) }, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c9ad3bf232..672ee3a488 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -400,16 +400,19 @@ impl Tenant { timeline_id, metadata.pg_version() ); - let timeline = self - .initialize_new_timeline(timeline_id, metadata, &mut timelines_accessor) - .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; - - match timelines_accessor.entry(timeline.timeline_id) { - Entry::Occupied(_) => bail!( - "Found freshly initialized timeline {} in the tenant map", - timeline.timeline_id + let ancestor = metadata + .ancestor_timeline() + .and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id)) + .cloned(); + match timelines_accessor.entry(timeline_id) { + Entry::Occupied(_) => warn!( + "Timeline {}/{} already exists in the tenant map, skipping its initialization", + self.tenant_id, timeline_id ), Entry::Vacant(v) => { + let timeline = self + .initialize_new_timeline(timeline_id, metadata, ancestor) + .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; v.insert(timeline); } } @@ -609,21 +612,14 @@ impl Tenant { &self, new_timeline_id: TimelineId, new_metadata: TimelineMetadata, - timelines: &mut MutexGuard>>, + ancestor: Option>, ) -> anyhow::Result> { - let ancestor = match new_metadata.ancestor_timeline() { - Some(ancestor_timeline_id) => Some( - timelines - .get(&ancestor_timeline_id) - .cloned() - .with_context(|| { - format!( - "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" - ) - })?, - ), - None => None, - }; + if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { + anyhow::ensure!( + ancestor.is_some(), + "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" + ) + } let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn(); let pg_version = 
new_metadata.pg_version();
@@ -1080,8 +1076,12 @@ impl Tenant {
             )
         })?;
 
+        let ancestor = new_metadata
+            .ancestor_timeline()
+            .and_then(|ancestor_timeline_id| timelines.get(&ancestor_timeline_id))
+            .cloned();
         let new_timeline = self
-            .initialize_new_timeline(new_timeline_id, new_metadata, timelines)
+            .initialize_new_timeline(new_timeline_id, new_metadata, ancestor)
            .with_context(|| {
                 format!(
                     "Failed to initialize timeline {}/{}",
diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs
index fcb2c18b79..1efd3d4af4 100644
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -107,6 +107,9 @@ pub fn init_tenant_mgr(
 /// Ignores other timelines that might be present for tenant, but were not passed as a parameter.
 /// Attempts to load as many entities as possible: if a certain timeline fails during the load, the tenant is marked as "Broken",
 /// and the load continues.
+///
+/// Attach happens on startup and successful timeline downloads
+/// (some subset of timeline files, always including its metadata, after which the new one needs to be registered).
 pub fn attach_local_tenants(
     conf: &'static PageServerConf,
     remote_index: &RemoteIndex,
@@ -122,18 +125,20 @@ pub fn attach_local_tenants(
         );
         debug!("Timelines to attach: {local_timelines:?}");
 
-        let tenant = load_local_tenant(conf, tenant_id, remote_index);
-        {
-            match tenants_state::write_tenants().entry(tenant_id) {
-                hash_map::Entry::Occupied(_) => {
-                    error!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state");
-                    continue;
-                }
-                hash_map::Entry::Vacant(v) => {
-                    v.insert(Arc::clone(&tenant));
-                }
+        let mut tenants_accessor = tenants_state::write_tenants();
+        let tenant = match tenants_accessor.entry(tenant_id) {
+            hash_map::Entry::Occupied(o) => {
+                info!("Tenant {tenant_id} was found in pageserver's memory");
+                Arc::clone(o.get())
             }
-        }
+            hash_map::Entry::Vacant(v) => {
+                info!("Tenant {tenant_id} was not found in pageserver's memory, loading it");
+                let tenant = load_local_tenant(conf, tenant_id, remote_index);
+                v.insert(Arc::clone(&tenant));
+                tenant
+            }
+        };
+        drop(tenants_accessor);
 
         if tenant.current_state() == TenantState::Broken {
             warn!("Skipping timeline load for broken tenant {tenant_id}")
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index 83affac062..d8424e22c8 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -7,19 +7,25 @@
 #
 
 import asyncio
+import os
+from pathlib import Path
 from typing import List, Tuple
 
 import pytest
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
+    NeonPageserverHttpClient,
     Postgres,
     RemoteStorageKind,
     available_remote_storages,
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_until,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import query_scalar
 
 
 async def tenant_workload(env: NeonEnv, pg: Postgres):
@@ -93,3 +99,93 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem
     # run final checkpoint manually to flush all the data to remote storage
     pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
     wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
+
+
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_tenants_attached_after_download(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+ neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="remote_storage_kind", + ) + + data_id = 1 + data_secret = "very secret secret" + + ##### First start, insert secret data and upload it to the remote storage + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + for checkpoint_number in range(1, 3): + with pg.cursor() as cur: + cur.execute( + f""" + CREATE TABLE t{checkpoint_number}(id int primary key, secret text); + INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); + """ + ) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + log.info(f"waiting for checkpoint {checkpoint_number} upload") + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(client, tenant_id, timeline_id, current_lsn) + log.info(f"upload of checkpoint {checkpoint_number} is done") + + ##### Stop the pageserver, erase its layer file to force it being downloaded from S3 + env.postgres.stop_all() + env.pageserver.stop() + + timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + local_layer_deleted = False + for path in Path.iterdir(timeline_dir): + if path.name.startswith("00000"): + # Looks like a layer file. 
Remove it
+            os.remove(path)
+            local_layer_deleted = True
+            break
+    assert local_layer_deleted, f"Found no local layer files to delete in directory {timeline_dir}"
+
+    ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory
+    env.pageserver.start()
+    client = env.pageserver.http_client()
+
+    wait_until(
+        number_of_iterations=5,
+        interval=1,
+        func=lambda: expect_tenant_to_download_timeline(client, tenant_id),
+    )
+
+    restored_timelines = client.timeline_list(tenant_id)
+    assert (
+        len(restored_timelines) == 1
+    ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage"
+    restored_timeline = restored_timelines[0]
+    assert restored_timeline["timeline_id"] == str(
+        timeline_id
+    ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage"
+
+
+def expect_tenant_to_download_timeline(
+    client: NeonPageserverHttpClient,
+    tenant_id: TenantId,
+):
+    for tenant in client.tenant_list():
+        if tenant["id"] == str(tenant_id):
+            assert not tenant.get(
+                "has_in_progress_downloads", True
+            ), f"Tenant {tenant_id} should have no downloads in progress"
+            return
+    assert False, f"Tenant {tenant_id} is missing on pageserver"
From 4f2ac51bdd21ada43efc2b30ad2b3724ed9331cf Mon Sep 17 00:00:00 2001
From: Kirill Bulatov
Date: Fri, 30 Sep 2022 12:21:56 +0300
Subject: [PATCH 82/90] Bump rustc to 1.61

---
 .github/workflows/build_and_test.yml | 6 +++---
 .github/workflows/codestyle.yml      | 2 +-
 rust-toolchain.toml                  | 9 ++++-----
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 8a7cdec89c..22042489a8 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -127,8 +127,8 @@ jobs:
             target/
           # Fall back to older versions of the key, if no cache for current Cargo.lock was found
           key: |
-            v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
-            v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-
+            v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
+            v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-
 
       - name: Cache postgres v14 build
         id: cache_pg_14
@@ -389,7 +389,7 @@ jobs:
             !~/.cargo/registry/src
             ~/.cargo/git/
             target/
-          key: v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
+          key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
 
       - name: Get Neon artifact
         uses: ./.github/actions/download
diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml
index 641943199e..6d39958bab 100644
--- a/.github/workflows/codestyle.yml
+++ b/.github/workflows/codestyle.yml
@@ -106,7 +106,7 @@ jobs:
             !~/.cargo/registry/src
             ~/.cargo/git
             target
-          key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
+          key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
 
       - name: Run cargo clippy
         run: ./run_clippy.sh
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 1a27e92fec..5aa0f8d4e5 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,11 +1,10 @@
 [toolchain]
 # We try to stick to a toolchain version that is widely available on popular distributions, so that most people
 # can use the toolchain that comes with their operating system. But if there's a feature we miss badly from a later
-# version, we can consider updating.
As of this writing, 1.60 is available on Debian 'experimental' but not yet on
-# 'testing' or even 'unstable', which is a bit more cutting-edge than we'd like. Hopefully the 1.60 packages reach
-# 'testing' soon (and similarly for the other distributions).
-# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package.
-channel = "1.60" # do update GitHub CI cache values for rust builds, when changing this value
+# version, we can consider updating.
+# See https://tracker.debian.org/pkg/rustc for more details on the Debian rustc package;
+# by default we treat the version in Debian "unstable" as the highest version used in the project.
+channel = "1.61" # do update GitHub CI cache values for rust builds, when changing this value
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
From 31123d1fa89f445581826559e8ed440455f01cff Mon Sep 17 00:00:00 2001
From: Joonas Koivunen
Date: Mon, 3 Oct 2022 17:44:17 +0300
Subject: [PATCH 83/90] Silence clippies, minor doc fix (#2543)

* doc: remove stray backtick

* chore: clippy::let_unit_value

* chore: silence useless_transmute, duplicate_mod

* chore: remove allowing deref_nullptr

not needed since bindgen 0.60.0.

* chore: remove repeated allowed lints

they are already allowed from the crate root.
---
 docs/sourcetree.md                  | 2 +-
 libs/postgres_ffi/src/lib.rs        | 8 +++++---
 libs/postgres_ffi/src/xlog_utils.rs | 6 ------
 pageserver/src/tenant/timeline.rs   | 2 +-
 proxy/src/cancellation.rs           | 4 ++--
 5 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/docs/sourcetree.md b/docs/sourcetree.md
index 8043450a55..c468134b81 100644
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -96,7 +96,7 @@ A single virtual environment with all dependencies is described in the single `P
     sudo apt install python3.9
     ```
 - Install `poetry`
-    - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`.
+    - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation).
 - Install dependencies via `./scripts/pysync`.
 - Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile))
   so if you have different version some linting tools can yield different result locally vs in the CI.
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index 95ecc7b061..f3dad159be 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -3,9 +3,11 @@
 #![allow(non_snake_case)]
 // bindgen creates some unsafe code with no doc comments.
 #![allow(clippy::missing_safety_doc)]
-// suppress warnings on rust 1.53 due to bindgen unit tests.
-// https://github.com/rust-lang/rust-bindgen/issues/1651
-#![allow(deref_nullptr)]
+// noted at 1.63 that in many cases there are u32 -> u32 transmutes in bindgen code.
+#![allow(clippy::useless_transmute)]
+// modules included with the postgres_ffi macro depend on the specific version's
+// types, and trigger a too eager lint.
+#![allow(clippy::duplicate_mod)] use bytes::Bytes; use utils::bin_ser::SerializeError; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index fbd8468a93..953723a8f0 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -57,12 +57,10 @@ pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; /// in order to let CLOG_TRUNCATE mechanism correctly extend CLOG. const XID_CHECKPOINT_INTERVAL: u32 = 1024; -#[allow(non_snake_case)] pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo { (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo } -#[allow(non_snake_case)] pub fn XLogSegNoOffsetToRecPtr( segno: XLogSegNo, offset: u32, @@ -71,7 +69,6 @@ pub fn XLogSegNoOffsetToRecPtr( segno * (wal_segsz_bytes as u64) + (offset as u64) } -#[allow(non_snake_case)] pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { format!( "{:>08X}{:>08X}{:>08X}", @@ -81,7 +78,6 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize ) } -#[allow(non_snake_case)] pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) { let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; @@ -89,12 +85,10 @@ pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLin (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli) } -#[allow(non_snake_case)] pub fn IsXLogFileName(fname: &str) -> bool { return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()); } -#[allow(non_snake_case)] pub fn IsPartialXLogFileName(fname: &str) -> bool { fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 74e873e632..247e076230 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -627,7 +627,7 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); drop(tenant_conf_guard); let self_clone = Arc::clone(self); - let _ = spawn_connection_manager_task( + spawn_connection_manager_task( self.conf.broker_etcd_prefix.clone(), self_clone, walreceiver_connect_timeout, diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 92f8e35dab..eb9312e6bb 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -129,13 +129,13 @@ mod tests { assert!(CANCEL_MAP.contains(&session)); tx.send(()).expect("failed to send"); - let () = futures::future::pending().await; // sleep forever + futures::future::pending::<()>().await; // sleep forever Ok(()) })); // Wait until the task has been spawned. - let () = rx.await.context("failed to hear from the task")?; + rx.await.context("failed to hear from the task")?; // Drop the session's entry by cancelling the task. task.abort(); From 537b2c1ae6d9c61ae7ed4a02c04a370354b3bcdb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 4 Oct 2022 10:49:39 +0300 Subject: [PATCH 84/90] Remove unnecessary check for open PostgreSQL TCP port. The loop checked if the TCP port is open for connections, by trying to connect to it. That seems unnecessary. By the time the postmaster.pid file says that it's ready, the port should be open. Remove that check. 
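
For reference, the readiness signal this relies on is the status line that the
postmaster appends to postmaster.pid once it can accept connections. A minimal
sketch of such a check (illustrative only: the function name here is made up,
and the real logic lives in wait_for_postgres() in pg_helpers.rs):

    use std::fs::File;
    use std::io::{BufRead, BufReader};
    use std::path::Path;

    /// Sketch: returns true if the last line of postmaster.pid says "ready".
    /// The real code also tolerates a missing or still-empty pid file.
    fn postmaster_is_ready(pgdata: &Path) -> std::io::Result<bool> {
        let file = File::open(pgdata.join("postmaster.pid"))?;
        let last_line = BufReader::new(file).lines().last().transpose()?;
        Ok(matches!(last_line.as_deref().map(str::trim), Some("ready")))
    }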
--- compute_tools/src/compute.rs | 9 +-------- compute_tools/src/pg_helpers.rs | 22 ++++++++-------------- 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 58469b1c97..1e848627e3 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -258,14 +258,7 @@ impl ComputeNode { .spawn() .expect("cannot start postgres process"); - // Try default Postgres port if it is not provided - let port = self - .spec - .cluster - .settings - .find("port") - .unwrap_or_else(|| "5432".to_string()); - wait_for_postgres(&mut pg, &port, pgdata_path)?; + wait_for_postgres(&mut pg, pgdata_path)?; // If connection fails, // it may be the old node with `zenith_admin` superuser. diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index ac065fa60c..8802dae639 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -1,11 +1,9 @@ use std::fmt::Write; use std::fs::File; use std::io::{BufRead, BufReader}; -use std::net::{SocketAddr, TcpStream}; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; -use std::str::FromStr; use std::{fs, thread, time}; use anyhow::{bail, Result}; @@ -230,21 +228,16 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { Ok(postgres_dbs) } -/// Wait for Postgres to become ready to accept connections: -/// - state should be `ready` in the `pgdata/postmaster.pid` -/// - and we should be able to connect to 127.0.0.1:5432 -pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> { +/// Wait for Postgres to become ready to accept connections. It's ready to +/// accept connections when the state-field in `pgdata/postmaster.pid` says +/// 'ready'. +pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); let mut slept: u64 = 0; // ms let pause = time::Duration::from_millis(100); - let timeout = time::Duration::from_millis(10); - let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap(); - loop { - // Sleep POSTGRES_WAIT_TIMEOUT at max (a bit longer actually if consider a TCP timeout, - // but postgres starts listening almost immediately, even if it is not really - // ready to accept connections). + // Sleep POSTGRES_WAIT_TIMEOUT at max if slept >= POSTGRES_WAIT_TIMEOUT { bail!("timed out while waiting for Postgres to start"); } @@ -263,10 +256,9 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<() // Pid file could be there and we could read it, but it could be empty, for example. if let Some(Ok(line)) = last_line { let status = line.trim(); - let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok(); // Now Postgres is ready to accept connections - if status == "ready" && can_connect { + if status == "ready" { break; } } @@ -276,6 +268,8 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<() slept += 100; } + log::info!("PostgreSQL is now running, continuing to configure it"); + Ok(()) } From 9b9bbad462160bf75df7ee69bc83a4da9eee2b38 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 4 Oct 2022 13:00:15 +0300 Subject: [PATCH 85/90] Use 'notify' crate to wait for PostgreSQL startup. Compute node startup time is very important. After launching PostgreSQL, use 'notify' to be notified immediately when it has updated the PID file, instead of polling. 
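
A minimal sketch of the idea (assuming the notify 5.x watcher API; the real
code in pg_helpers.rs additionally switches the watch from the data directory
to postmaster.pid once that file appears, and keeps a 100 ms timeout as a
safety net against missed events):

    use std::{path::Path, sync::mpsc, time::Duration};
    use notify::{RecursiveMode, Watcher};

    fn wake_on_pgdata_change(pgdata: &Path) -> notify::Result<()> {
        let (tx, rx) = mpsc::channel();
        // Forward filesystem events into a channel we can block on.
        let mut watcher = notify::recommended_watcher(move |res| {
            let _ = tx.send(res);
        })?;
        watcher.watch(pgdata, RecursiveMode::NonRecursive)?;
        // Wake up on the first event (e.g. postmaster.pid being written),
        // or after 100 ms, whichever comes first; then re-check the pid file.
        let _event = rx.recv_timeout(Duration::from_millis(100));
        Ok(())
    }
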
The polling loop had 100 ms interval so this shaves up to 100 ms from the startup time. --- Cargo.lock | 70 +++++++++++++++++++++++++++++++++ compute_tools/Cargo.toml | 2 + compute_tools/src/pg_helpers.rs | 62 +++++++++++++++++++++++------ workspace_hack/Cargo.toml | 1 + 4 files changed, 124 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ddb10352b8..69a8fa19ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -497,8 +497,10 @@ dependencies = [ "chrono", "clap 3.2.16", "env_logger", + "futures", "hyper", "log", + "notify", "postgres", "regex", "serde", @@ -1072,6 +1074,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "fsevent-sys" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2" +dependencies = [ + "libc", +] + [[package]] name = "futures" version = "0.3.21" @@ -1493,6 +1504,26 @@ dependencies = [ "str_stack", ] +[[package]] +name = "inotify" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +dependencies = [ + "bitflags", + "inotify-sys", + "libc", +] + +[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" +dependencies = [ + "libc", +] + [[package]] name = "instant" version = "0.1.12" @@ -1552,6 +1583,26 @@ dependencies = [ "simple_asn1", ] +[[package]] +name = "kqueue" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6112e8f37b59803ac47a42d14f1f3a59bbf72fc6857ffc5be455e28a691f8e" +dependencies = [ + "kqueue-sys", + "libc", +] + +[[package]] +name = "kqueue-sys" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587" +dependencies = [ + "bitflags", + "libc", +] + [[package]] name = "kstring" version = "1.0.6" @@ -1797,6 +1848,24 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "notify" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2c66da08abae1c024c01d635253e402341b4060a12e99b31c7594063bf490a" +dependencies = [ + "bitflags", + "crossbeam-channel", + "filetime", + "fsevent-sys", + "inotify", + "kqueue", + "libc", + "mio", + "walkdir", + "winapi", +] + [[package]] name = "num-bigint" version = "0.4.3" @@ -4142,6 +4211,7 @@ dependencies = [ "bstr", "bytes", "chrono", + "crossbeam-utils", "either", "fail", "hashbrown", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index b13f7f191d..43cf7ae2dd 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -8,8 +8,10 @@ anyhow = "1.0" chrono = "0.4" clap = "3.0" env_logger = "0.9" +futures = "0.3.13" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } +notify = "5.0.0" postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } regex = "1" serde = { version = "1.0", features = ["derive"] } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 8802dae639..769dbfac73 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -1,16 +1,19 @@ use std::fmt::Write; +use std::fs; use std::fs::File; use std::io::{BufRead, BufReader}; use 
std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::Child;
-use std::{fs, thread, time};
+use std::time::{Duration, Instant};
 
 use anyhow::{bail, Result};
 use postgres::{Client, Transaction};
 use serde::Deserialize;
 
-const POSTGRES_WAIT_TIMEOUT: u64 = 60 * 1000; // milliseconds
+use notify::{RecursiveMode, Watcher};
+
+const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000);
 
 /// Rust representation of Postgres role info with only those fields
 /// that matter for us.
@@ -233,29 +236,63 @@ pub fn get_existing_dbs(client: &mut Client) -> Result
 /// 'ready'.
 pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
     let pid_path = pgdata.join("postmaster.pid");
 
-    let mut slept: u64 = 0; // ms
-    let pause = time::Duration::from_millis(100);
+    // PostgreSQL writes line "ready" to the postmaster.pid file, when it has
+    // completed initialization and is ready to accept connections. We want to
+    // react quickly and perform the rest of our initialization as soon as
+    // PostgreSQL starts accepting connections. Use 'notify' to be notified
+    // whenever the PID file is changed, and whenever it changes, read it to
+    // check if it's now "ready".
+    //
+    // You cannot actually watch a file before it exists, so we first watch the
+    // data directory, and once the postmaster.pid file appears, we switch to
+    // watch the file instead. We also wake up every 100 ms to poll, just in
+    // case we miss some events for some reason. Not strictly necessary, but
+    // better safe than sorry.
+    let (tx, rx) = std::sync::mpsc::channel();
+    let mut watcher = notify::recommended_watcher(move |res| {
+        let _ = tx.send(res);
+    })?;
+    watcher.watch(pgdata, RecursiveMode::NonRecursive)?;
+
+    let started_at = Instant::now();
+    let mut postmaster_pid_seen = false;
     loop {
-        // Sleep POSTGRES_WAIT_TIMEOUT at max
-        if slept >= POSTGRES_WAIT_TIMEOUT {
-            bail!("timed out while waiting for Postgres to start");
-        }
-
         if let Ok(Some(status)) = pg.try_wait() {
             // Postgres exited, that is not what we expected, bail out earlier.
             let code = status.code().unwrap_or(-1);
             bail!("Postgres exited unexpectedly with code {}", code);
         }
 
+        let res = rx.recv_timeout(Duration::from_millis(100));
+        log::debug!("woken up by notify: {res:?}");
+        // If there are multiple events in the channel already, we only need to
+        // check once. Swallow the extra events before we go ahead to check the
+        // pid file.
+        while let Ok(res) = rx.try_recv() {
+            log::debug!("swallowing extra event: {res:?}");
+        }
+
         // Check that we can open pid file first.
         if let Ok(file) = File::open(&pid_path) {
+            if !postmaster_pid_seen {
+                log::debug!("postmaster.pid appeared");
+                watcher
+                    .unwatch(pgdata)
+                    .expect("Failed to remove pgdata dir watch");
+                watcher
+                    .watch(&pid_path, RecursiveMode::NonRecursive)
+                    .expect("Failed to add postmaster.pid file watch");
+                postmaster_pid_seen = true;
+            }
+
             let file = BufReader::new(file);
             let last_line = file.lines().last();
 
             // Pid file could be there and we could read it, but it could be empty, for example.
             if let Some(Ok(line)) = last_line {
                 let status = line.trim();
+                log::debug!("last line of postmaster.pid: {status:?}");
 
                 // Now Postgres is ready to accept connections
                 if status == "ready" {
@@ -264,8 +301,11 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
             }
         }
 
-        thread::sleep(pause);
-        slept += 100;
+        // Give up after POSTGRES_WAIT_TIMEOUT.
+        let duration = started_at.elapsed();
+        if duration >= POSTGRES_WAIT_TIMEOUT {
+            bail!("timed out while waiting for Postgres to start");
+        }
     }
 
     log::info!("PostgreSQL is now running, continuing to configure it");
 
     Ok(())
 }
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index f37a42945e..6977665c7d 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -19,6 +19,7 @@ anyhow = { version = "1", features = ["backtrace", "std"] }
 bstr = { version = "0.2", features = ["lazy_static", "regex-automata", "serde", "serde1", "serde1-nostd", "std", "unicode"] }
 bytes = { version = "1", features = ["serde", "std"] }
 chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] }
+crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] }
 either = { version = "1", features = ["use_std"] }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] }
From 5cf53786f9196c9461119ed5a0653707b7804e96 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov
Date: Mon, 26 Sep 2022 21:47:08 +0300
Subject: [PATCH 86/90] Improve pytest ergonomics

1. Disable perf tests by default
2. Add instruction to run tests in parallel
---
 pytest.ini            | 1 +
 test_runner/README.md | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/pytest.ini b/pytest.ini
index bfa07e520b..7197b078c6 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -5,6 +5,7 @@ filterwarnings =
     ignore:record_property is incompatible with junit_family:pytest.PytestWarning
 addopts =
     -m 'not remote_cluster'
+    --ignore=test_runner/performance
 markers =
     remote_cluster
 testpaths =
diff --git a/test_runner/README.md b/test_runner/README.md
index d6ee5730ac..e066ac3235 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -56,6 +56,14 @@ If you want to run all tests that have the string "bench" in their names:
 
 `./scripts/pytest -k bench`
 
+To run tests in parallel we utilize the `pytest-xdist` plugin. By default everything runs single-threaded. The number of workers can be specified with the `-n` argument:
+
+`./scripts/pytest -n4`
+
+By default performance tests are excluded. To run them, explicitly pass the performance test selection to the script:
+
+`./scripts/pytest test_runner/performance`
+
 Useful environment variables:
 
 `NEON_BIN`: The directory where neon binaries can be found.
From 231dfbaed630963e709166677908fff0b558e35e Mon Sep 17 00:00:00 2001
From: Kirill Bulatov
Date: Mon, 3 Oct 2022 22:13:26 +0300
Subject: [PATCH 87/90] Do not remove empty timelines/ directory for tenants

---
 pageserver/src/tenant_mgr.rs        | 44 ++++++++++++++++++-----------
 test_runner/regress/test_tenants.py | 37 ++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 16 deletions(-)

diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs
index 1efd3d4af4..0e8ee8c067 100644
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -108,6 +108,10 @@ pub fn init_tenant_mgr(
 /// Attempts to load as many entities as possible: if a certain timeline fails during the load, the tenant is marked as "Broken",
 /// and the load continues.
 ///
+/// For a successful tenant attach, the tenant first has to have a `timelines/` subdirectory and a tenant config file that's loaded into memory successfully.
+/// If either of the conditions fails, the tenant will be added to memory with [`TenantState::Broken`] state, otherwise we start to load its timelines.
+/// Alternatively, tenant is considered loaded successfully, if it's already in pageserver's memory (i.e. was loaded already before).
+///
 /// Attach happens on startup and successful timeline downloads
 /// (some subset of timeline files, always including its metadata, after which the new one needs to be registered).
 pub fn attach_local_tenants(
@@ -173,16 +177,28 @@ fn load_local_tenant(
         remote_index.clone(),
         conf.remote_storage_config.is_some(),
     ));
-    match Tenant::load_tenant_config(conf, tenant_id) {
-        Ok(tenant_conf) => {
-            tenant.update_tenant_config(tenant_conf);
-            tenant.activate(false);
-        }
-        Err(e) => {
-            error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}");
-            tenant.set_state(TenantState::Broken);
+
+    let tenant_timelines_dir = conf.timelines_path(&tenant_id);
+    if !tenant_timelines_dir.is_dir() {
+        error!(
+            "Tenant {} has no timelines directory at {}",
+            tenant_id,
+            tenant_timelines_dir.display()
+        );
+        tenant.set_state(TenantState::Broken);
+    } else {
+        match Tenant::load_tenant_config(conf, tenant_id) {
+            Ok(tenant_conf) => {
+                tenant.update_tenant_config(tenant_conf);
+                tenant.activate(false);
+            }
+            Err(e) => {
+                error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}");
+                tenant.set_state(TenantState::Broken);
+            }
         }
     }
+
     tenant
 }
@@ -630,14 +646,10 @@ fn collect_timelines_for_tenant(
     }
 
     if tenant_timelines.is_empty() {
-        match remove_if_empty(&timelines_dir) {
-            Ok(true) => info!(
-                "Removed empty tenant timelines directory {}",
-                timelines_dir.display()
-            ),
-            Ok(false) => (),
-            Err(e) => error!("Failed to remove empty tenant timelines directory: {e:?}"),
-        }
+        // This is normal: we've removed all broken, empty and temporary timeline dirs,
+        // but should allow the tenant to stay functional and allow creating new timelines.
+        // On a restart we require tenants to have the timelines dir, so leave it on disk.
+        debug!("Tenant {tenant_id} has no timelines loaded");
     }
 
     Ok((tenant_id, tenant_timelines))
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 52b9e6369c..ba5109a16f 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -1,4 +1,5 @@
 import os
+import shutil
 from contextlib import closing
 from datetime import datetime
 from pathlib import Path
@@ -201,3 +202,39 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde
 
     post_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)])
     assert post_detach_samples == set()
+
+
+def test_pageserver_with_empty_tenants(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    client = env.pageserver.http_client()
+
+    tenant_without_timelines_dir = env.initial_tenant
+    shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines")
+
+    tenant_with_empty_timelines_dir = client.tenant_create()
+    for timeline_dir_entry in Path.iterdir(
+        Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines"
+    ):
+        if timeline_dir_entry.is_dir():
+            shutil.rmtree(timeline_dir_entry)
+        else:
+            timeline_dir_entry.unlink()
+
+    env.postgres.stop_all()
+    for _ in range(0, 3):
+        env.pageserver.stop()
+        env.pageserver.start()
+
+    client = env.pageserver.http_client()
+    tenants = client.tenant_list()
+
+    assert (
+        len(tenants) == 1
+    ), "Pageserver should attach only tenants with empty timelines/ dir on restart"
+    loaded_tenant = tenants[0]
+    assert loaded_tenant["id"] == str(
+        tenant_with_empty_timelines_dir
+    ), f"Tenant
{tenant_with_empty_timelines_dir} should be loaded as the only tenant with a timelines/ directory"
+    assert loaded_tenant["state"] == {
+        "Active": {"background_jobs_running": False}
+    }, "Empty tenant should be loaded and ready for timeline creation"
From d823e84ed5497c61ff04b9a4f689470c62ec2e9a Mon Sep 17 00:00:00 2001
From: Kirill Bulatov
Date: Mon, 3 Oct 2022 23:14:39 +0300
Subject: [PATCH 88/90] Allow attaching tenants with zero timelines

---
 pageserver/src/http/routes.rs         | 13 ++++--
 test_runner/fixtures/neon_fixtures.py |  7 +++-
 test_runner/regress/test_tenants.py   | 57 ++++++++++++++++++++-------
 3 files changed, 59 insertions(+), 18 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 55429420a8..a1bd65c308 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -337,9 +337,16 @@ async fn tenant_attach_handler(request: Request) -> Result,
     info!("Handling tenant attach {tenant_id}");
 
     tokio::task::spawn_blocking(move || match tenant_mgr::get_tenant(tenant_id, false) {
-        Ok(_) => Err(ApiError::Conflict(
-            "Tenant is already present locally".to_owned(),
-        )),
+        Ok(tenant) => {
+            if tenant.list_timelines().is_empty() {
+                info!("Attaching to tenant {tenant_id} with zero timelines");
+                Ok(())
+            } else {
+                Err(ApiError::Conflict(
+                    "Tenant is already present locally".to_owned(),
+                ))
+            }
+        }
         Err(_) => Ok(()),
     })
     .await
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index aa9fd68df5..5c2c3edbd8 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -455,6 +455,9 @@ class RemoteStorageKind(enum.Enum):
     LOCAL_FS = "local_fs"
     MOCK_S3 = "mock_s3"
     REAL_S3 = "real_s3"
+    # Pass to tests that are generic to remote storage
+    # to ensure the tests pass with or without the remote storage
+    NOOP = "noop"
 
 
 def available_remote_storages() -> List[RemoteStorageKind]:
@@ -583,7 +586,9 @@ class NeonEnvBuilder:
         test_name: str,
         force_enable: bool = True,
     ):
-        if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
+        if remote_storage_kind == RemoteStorageKind.NOOP:
+            return
+        elif remote_storage_kind == RemoteStorageKind.LOCAL_FS:
             self.enable_local_fs_remote_storage(force_enable=force_enable)
         elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
             self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable)
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index ba5109a16f..f49b6fccb9 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -8,8 +8,13 @@ from typing import List
 import pytest
 from fixtures.log_helper import log
 from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
-from fixtures.types import Lsn, TenantId
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    RemoteStorageKind,
+    available_remote_storages,
+)
+from fixtures.types import Lsn, TenantId, TimelineId
 from prometheus_client.samples import Sample
 
 
@@ -204,26 +209,50 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde
     assert post_detach_samples == set()
 
 
-def test_pageserver_with_empty_tenants(neon_simple_env: NeonEnv):
-    env = neon_simple_env
+# Check that empty tenants work with or without the remote storage
+@pytest.mark.parametrize(
+    "remote_storage_kind", available_remote_storages() + [RemoteStorageKind.NOOP]
+)
+def test_pageserver_with_empty_tenants(
+    neon_env_builder:
NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_pageserver_with_empty_tenants",
+    )
+
+    env = neon_env_builder.init_start()
     client = env.pageserver.http_client()
 
     tenant_without_timelines_dir = env.initial_tenant
+    log.info(
+        f"Tenant {tenant_without_timelines_dir} becomes broken: it abnormally loses its timelines/ directory and is expected to be completely ignored when pageserver restarts"
+    )
     shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines")
 
     tenant_with_empty_timelines_dir = client.tenant_create()
+    log.info(
+        f"Tenant {tenant_with_empty_timelines_dir} gets all of its timelines deleted: still should be functional"
+    )
+    temp_timelines = client.timeline_list(tenant_with_empty_timelines_dir)
+    for temp_timeline in temp_timelines:
+        client.timeline_delete(
+            tenant_with_empty_timelines_dir, TimelineId(temp_timeline["timeline_id"])
+        )
+    files_in_timelines_dir = sum(
+        1
+        for _p in Path.iterdir(
+            Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines"
+        )
+    )
+    assert (
+        files_in_timelines_dir == 0
+    ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory"
 
+    # Trigger timeline reinitialization after pageserver restart
     env.postgres.stop_all()
     env.pageserver.stop()
     env.pageserver.start()
 
     client = env.pageserver.http_client()
     tenants = client.tenant_list()
From 580584c8fce303da90d898d81703ab54e81e39b9 Mon Sep 17 00:00:00 2001
From: sharnoff
Date: Tue, 4 Oct 2022 19:14:45 +0100
Subject: [PATCH 89/90] Remove control_plane deps on pageserver/safekeeper (#2513)

Creates new `pageserver_api` and `safekeeper_api` crates to serve as the
shared dependencies. Should reduce both recompile times and cold compile
times.

Decreases the size of the optimized `neon_local` binary: 380M -> 179M.
No significant changes for anything else (mostly as expected).
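
In practice this means the imports in control_plane switch from the server
crates to the thin API crates, e.g.:

    use pageserver_api::models::TimelineInfo;
    use safekeeper_api::models::TimelineCreateRequest;

so editing pageserver or safekeeper internals no longer forces a rebuild of
neon_local; only changes to the shared API types do.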
--- Cargo.lock | 28 +++++++++++++++++-- control_plane/Cargo.toml | 6 ++-- control_plane/src/bin/neon_local.rs | 6 ++-- control_plane/src/safekeeper.rs | 2 +- control_plane/src/storage.rs | 2 +- libs/pageserver_api/Cargo.toml | 12 ++++++++ libs/pageserver_api/src/lib.rs | 9 ++++++ .../pageserver_api/src}/models.rs | 12 +++++++- libs/safekeeper_api/Cargo.toml | 12 ++++++++ libs/safekeeper_api/src/lib.rs | 10 +++++++ .../safekeeper_api/src}/models.rs | 0 pageserver/Cargo.toml | 1 + pageserver/src/config.rs | 8 +++--- pageserver/src/http/mod.rs | 3 +- pageserver/src/tenant.rs | 13 +-------- safekeeper/Cargo.toml | 1 + safekeeper/src/http/mod.rs | 3 +- safekeeper/src/lib.rs | 9 +++--- 18 files changed, 104 insertions(+), 33 deletions(-) create mode 100644 libs/pageserver_api/Cargo.toml create mode 100644 libs/pageserver_api/src/lib.rs rename {pageserver/src/http => libs/pageserver_api/src}/models.rs (90%) create mode 100644 libs/safekeeper_api/Cargo.toml create mode 100644 libs/safekeeper_api/src/lib.rs rename {safekeeper/src/http => libs/safekeeper_api/src}/models.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 69a8fa19ab..ab508c7109 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -542,11 +542,11 @@ dependencies = [ "git-version", "nix", "once_cell", - "pageserver", + "pageserver_api", "postgres", "regex", "reqwest", - "safekeeper", + "safekeeper_api", "serde", "serde_with", "tar", @@ -2044,6 +2044,7 @@ dependencies = [ "nix", "num-traits", "once_cell", + "pageserver_api", "postgres", "postgres-protocol", "postgres-types", @@ -2072,6 +2073,17 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_api" +version = "0.1.0" +dependencies = [ + "const_format", + "serde", + "serde_with", + "utils", + "workspace_hack", +] + [[package]] name = "parking_lot" version = "0.11.2" @@ -2960,6 +2972,7 @@ dependencies = [ "postgres_ffi", "regex", "remote_storage", + "safekeeper_api", "serde", "serde_json", "serde_with", @@ -2975,6 +2988,17 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "safekeeper_api" +version = "0.1.0" +dependencies = [ + "const_format", + "serde", + "serde_with", + "utils", + "workspace_hack", +] + [[package]] name = "same-file" version = "1.0.6" diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index ab9df8534c..ee8481e141 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -19,7 +19,9 @@ thiserror = "1" nix = "0.23" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } -pageserver = { path = "../pageserver" } -safekeeper = { path = "../safekeeper" } +# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api +# instead, so that recompile times are better. 
+pageserver_api = { path = "../libs/pageserver_api" } +safekeeper_api = { path = "../libs/safekeeper_api" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 93947d5326..0c26842b34 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -12,12 +12,12 @@ use control_plane::local_env::{EtcdBroker, LocalEnv}; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage::PageServerNode; use control_plane::{etcd, local_env}; -use pageserver::config::defaults::{ +use pageserver_api::models::TimelineInfo; +use pageserver_api::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, }; -use pageserver::http::models::TimelineInfo; -use safekeeper::defaults::{ +use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 600a9ffe05..34b2f3000a 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -12,7 +12,7 @@ use nix::unistd::Pid; use postgres::Config; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; -use safekeeper::http::models::TimelineCreateRequest; +use safekeeper_api::models::TimelineCreateRequest; use thiserror::Error; use utils::{ connstring::connection_address, diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index bfbd6e91c3..59cb3d7efb 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -11,7 +11,7 @@ use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use pageserver::http::models::{ +use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo, }; use postgres::{Config, NoTls}; diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml new file mode 100644 index 0000000000..be8762100c --- /dev/null +++ b/libs/pageserver_api/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "pageserver_api" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_with = "1.12.0" +const_format = "0.2.21" + +utils = { path = "../utils" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs new file mode 100644 index 0000000000..a36c1692a9 --- /dev/null +++ b/libs/pageserver_api/src/lib.rs @@ -0,0 +1,9 @@ +use const_format::formatcp; + +/// Public API types +pub mod models; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); diff --git a/pageserver/src/http/models.rs b/libs/pageserver_api/src/models.rs similarity index 90% rename from pageserver/src/http/models.rs rename to libs/pageserver_api/src/models.rs index d5559653b2..43059ead84 100644 --- a/pageserver/src/http/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -7,7 +7,17 @@ use utils::{ lsn::Lsn, }; -use crate::tenant::TenantState; +/// A state of a tenant in pageserver's memory. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum TenantState { + /// Tenant is fully operational, its background jobs might be running or not. + Active { background_jobs_running: bool }, + /// A tenant is recognized by pageserver, but not yet ready to operate: + /// e.g. not present locally and being downloaded or being read into memory from the file system. + Paused, + /// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated. + Broken, +} #[serde_as] #[derive(Serialize, Deserialize)] diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml new file mode 100644 index 0000000000..852d643f30 --- /dev/null +++ b/libs/safekeeper_api/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "safekeeper_api" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_with = "1.12.0" +const_format = "0.2.21" + +utils = { path = "../utils" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs new file mode 100644 index 0000000000..0a391478da --- /dev/null +++ b/libs/safekeeper_api/src/lib.rs @@ -0,0 +1,10 @@ +use const_format::formatcp; + +/// Public API types +pub mod models; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); + +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); diff --git a/safekeeper/src/http/models.rs b/libs/safekeeper_api/src/models.rs similarity index 100% rename from safekeeper/src/http/models.rs rename to libs/safekeeper_api/src/models.rs diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 1ec7ec4f98..88430f3a86 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -58,6 +58,7 @@ rstar = "0.9.3" num-traits = "0.2.15" amplify_num = "0.4.1" +pageserver_api = { path = "../libs/pageserver_api" } postgres_ffi = { path = "../libs/postgres_ffi" } etcd_broker = { path = "../libs/etcd_broker" } metrics = { path = "../libs/metrics" } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a52a3e8262..6e3c7baad8 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,10 +30,10 @@ pub mod defaults { use crate::tenant_config::defaults::*; use const_format::formatcp; - pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; - pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); - pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; - pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); + pub use pageserver_api::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PG_LISTEN_PORT, + }; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; diff --git a/pageserver/src/http/mod.rs b/pageserver/src/http/mod.rs index 4c0be17ecd..1c083bd382 100644 --- a/pageserver/src/http/mod.rs +++ b/pageserver/src/http/mod.rs @@ -1,3 +1,4 @@ -pub mod models; pub mod routes; pub use routes::make_router; + +pub use pageserver_api::models; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 672ee3a488..c2fb9ef242 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -45,6 +45,7 @@ use crate::tenant_config::TenantConfOpt; use 
 use crate::virtual_file::VirtualFile;
 use crate::walredo::WalRedoManager;
 use crate::{CheckpointConfig, TEMP_FILE_SUFFIX};
+pub use pageserver_api::models::TenantState;
 
 use toml_edit;
 use utils::{
@@ -118,18 +119,6 @@ pub struct Tenant {
     upload_layers: bool,
 }
 
-/// A state of a tenant in pageserver's memory.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-pub enum TenantState {
-    /// Tenant is fully operational, its background jobs might be running or not.
-    Active { background_jobs_running: bool },
-    /// A tenant is recognized by pageserver, but not yet ready to operate:
-    /// e.g. not present locally and being downloaded or being read into memory from the file system.
-    Paused,
-    /// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated.
-    Broken,
-}
-
 /// A repository corresponds to one .neon directory. One repository holds multiple
 /// timelines, forked off from the same initial call to 'initdb'.
 impl Tenant {
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 87ee63d1df..cb1cecade9 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -33,6 +33,7 @@ toml_edit = { version = "0.13", features = ["easy"] }
 thiserror = "1"
 parking_lot = "0.12.1"
 
+safekeeper_api = { path = "../libs/safekeeper_api" }
 postgres_ffi = { path = "../libs/postgres_ffi" }
 metrics = { path = "../libs/metrics" }
 utils = { path = "../libs/utils" }
diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs
index 4c0be17ecd..1831470007 100644
--- a/safekeeper/src/http/mod.rs
+++ b/safekeeper/src/http/mod.rs
@@ -1,3 +1,4 @@
-pub mod models;
 pub mod routes;
 pub use routes::make_router;
+
+pub use safekeeper_api::models;
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 58a237a5d3..e38a5a4633 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -27,14 +27,13 @@ mod timelines_global_map;
 pub use timelines_global_map::GlobalTimelines;
 
 pub mod defaults {
-    use const_format::formatcp;
     use std::time::Duration;
 
-    pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454;
-    pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
+    pub use safekeeper_api::{
+        DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
+        DEFAULT_PG_LISTEN_PORT,
+    };
 
-    pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676;
-    pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
     pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(10);
     pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8;
 }

From b99bed510d742babc061097a528f2dc09284c681 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov
Date: Wed, 5 Oct 2022 16:14:09 +0300
Subject: [PATCH 90/90] Move proxies to neon-proxy namespace (#2555)

---
 .github/workflows/build_and_test.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 22042489a8..4f2f8f0833 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -768,5 +768,5 @@ jobs:
       - name: Re-deploy proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
-          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
+          helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
+          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
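
The `pageserver_api` and `safekeeper_api` crates extracted in the patches above let clients such as `control_plane` consume the HTTP API types without linking the whole server crates. Below is a minimal sketch of such a consumer, not part of the patches themselves: it assumes only the `TenantState` definition shown in the diff plus a hypothetical `serde_json` dependency in the client's Cargo.toml, and the JSON literals are illustrative payloads, not recorded pageserver responses.

    use pageserver_api::models::TenantState;

    fn main() -> Result<(), serde_json::Error> {
        // serde's default externally tagged representation: unit variants
        // deserialize from plain strings, struct variants from one-key maps.
        let paused: TenantState = serde_json::from_str("\"Paused\"")?;
        assert_eq!(paused, TenantState::Paused);

        let active: TenantState =
            serde_json::from_str(r#"{"Active":{"background_jobs_running":true}}"#)?;
        assert_eq!(active, TenantState::Active { background_jobs_running: true });

        // Round-trip back to JSON, e.g. for a request body or a log line.
        println!("{}", serde_json::to_string(&active)?);
        Ok(())
    }

The round trip works because client and server now derive Serialize and Deserialize from the same shared definition, rather than from a copy living under `pageserver::http::models`.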