diff --git a/.circleci/config.yml b/.circleci/config.yml index 24d151f765..c94dd20ff0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7,7 +7,7 @@ executors: zenith-build-executor: resource_class: xlarge docker: - - image: cimg/rust:1.52.1 + - image: cimg/rust:1.55.0 jobs: check-codestyle: @@ -110,7 +110,7 @@ jobs: # Require an exact match. While an out of date cache might speed up the build, # there's no way to clean out old packages, so the cache grows every time something # changes. - - v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} + - v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} # Build the rust code, including test binaries - run: @@ -128,7 +128,7 @@ jobs: - save_cache: name: Save rust cache - key: v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} + key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} paths: - ~/.cargo/registry - ~/.cargo/git @@ -182,6 +182,21 @@ jobs: paths: - "*" + check-python: + executor: python/default + steps: + - checkout + - run: + name: Install pipenv & deps + working_directory: test_runner + command: | + pip install pipenv + pipenv install --dev + - run: + name: Run yapf to ensure code format + working_directory: test_runner + command: pipenv run yapf --recursive --diff . + run-pytest: #description: "Run pytest" executor: python/default @@ -245,13 +260,13 @@ jobs: # # The junit.xml file allows CircleCI to display more fine-grained test information # in its "Tests" tab in the results page. 
- # -s prevents pytest from capturing output, which helps to see - # what's going on if the test hangs # --verbose prints name of each test (helpful when there are # multiple tests in one file) # -rA prints summary in the end # -n4 uses four processes to run tests via pytest-xdist - pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short -s --verbose -rA $TEST_SELECTION $EXTRA_PARAMS + # -s is not used to prevent pytest from capturing output, because tests are running + # in parallel and logs are mixed between different tests + pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -rA $TEST_SELECTION $EXTRA_PARAMS - run: # CircleCI artifacts are preserved one file at a time, so skipping # this step isn't a good idea. If you want to extract the @@ -260,7 +275,7 @@ jobs: when: always command: | du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete + find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! 
-name "*.stderr" -delete du -sh /tmp/test_output/* - store_artifacts: path: /tmp/test_output @@ -325,8 +340,7 @@ jobs: \"inputs\": { \"ci_job_name\": \"zenith-remote-ci\", \"commit_hash\": \"$CIRCLE_SHA1\", - \"remote_repo\": \"$LOCAL_REPO\", - \"zenith_image_branch\": \"$CIRCLE_BRANCH\" + \"remote_repo\": \"$LOCAL_REPO\" } }" @@ -334,6 +348,7 @@ workflows: build_and_test: jobs: - check-codestyle + - check-python - build-postgres: name: build-postgres-<< matrix.build_type >> matrix: diff --git a/Cargo.lock b/Cargo.lock index c217dfbebb..5f36f48966 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,18 +26,21 @@ dependencies = [ "winapi", ] +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + [[package]] name = "anyhow" version = "1.0.42" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "595d3cfa7a60d4555cb5067b99f07142a08ea778de5cf993f7b75c7d8fabc486" -[[package]] -name = "arc-swap" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e906254e445520903e7fc9da4f709886c84ae4bc4ddaf0e093188d66df4dc820" - [[package]] name = "async-trait" version = "0.1.50" @@ -298,7 +301,7 @@ version = "2.33.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" dependencies = [ - "ansi_term", + "ansi_term 0.11.0", "atty", "bitflags", "strsim", @@ -387,26 +390,6 @@ dependencies = [ "rustc_version", ] -[[package]] -name = "crossbeam-channel" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" -dependencies = [ - "cfg-if 1.0.0", - "lazy_static", -] - [[package]] name = "crypto-mac" version = "0.10.0" @@ -445,16 +428,6 @@ dependencies = [ "dirs-sys", ] -[[package]] -name = "dirs-next" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" -dependencies = [ - "cfg-if 1.0.0", - "dirs-sys-next", -] - [[package]] name = "dirs-sys" version = "0.3.6" @@ -466,17 +439,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "dirs-sys-next" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" -dependencies = [ - "libc", - "redox_users", - "winapi", -] - [[package]] name = "dlv-list" version = "0.2.3" @@ -956,6 +918,15 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "matchers" +version = "0.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1" +dependencies = [ + "regex-automata", +] + [[package]] name = "matches" version = "0.1.8" @@ -1220,10 +1191,12 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "signal-hook", "tar", "thiserror", "tokio", "toml", + "tracing", "workspace_hack", "zenith_metrics", "zenith_utils", @@ -1531,6 +1504,15 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax", +] + [[package]] name = "regex-syntax" version = "0.6.25" @@ -1689,12 +1671,6 @@ dependencies = [ "webpki", ] -[[package]] -name = "rustversion" -version = "1.0.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" - [[package]] name = "ryu" version = "1.0.5" @@ -1852,12 +1828,32 @@ dependencies = [ "opaque-debug", ] +[[package]] +name = "sharded-slab" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982" +dependencies = [ + "lazy_static", +] + [[package]] name = "shlex" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42a568c8f2cd051a4d283bd6eb0343ac214c1b0f1ac19f93e1175b2dee38c73d" +[[package]] +name = "signal-hook" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c98891d737e271a2954825ef19e46bd16bdb98e2746f2eec4f7a4ef7946efd1" +dependencies = [ + "cc", + "libc", + "signal-hook-registry", +] + [[package]] name = "signal-hook-registry" version = "1.4.0" @@ -1890,59 +1886,6 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527" -[[package]] -name = "slog" -version = "2.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06" - -[[package]] -name = "slog-async" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c60813879f820c85dbc4eabf3269befe374591289019775898d56a81a804fbdc" -dependencies = [ - "crossbeam-channel", - "slog", - "take_mut", - "thread_local", -] - -[[package]] -name = "slog-scope" -version = "4.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f95a4b4c3274cd2869549da82b57ccc930859bdbf5bcea0424bc5f140b3c786" -dependencies = [ - "arc-swap", - "lazy_static", - "slog", -] - -[[package]] -name = "slog-stdlog" -version = "4.1.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8228ab7302adbf4fcb37e66f3cda78003feb521e7fd9e3847ec117a7784d0f5a" -dependencies = [ - "log", - "slog", - "slog-scope", -] - -[[package]] -name = "slog-term" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95c1e7e5aab61ced6006149ea772770b84a0d16ce0f7885def313e4829946d76" -dependencies = [ - "atty", - "chrono", - "slog", - "term", - "thread_local", -] - [[package]] name = "smallvec" version = "1.6.1" @@ -1998,12 +1941,6 @@ dependencies = [ "unicode-xid", ] -[[package]] -name = "take_mut" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" - [[package]] name = "tap" version = "1.0.1" @@ -2035,17 +1972,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "term" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f" -dependencies = [ - "dirs-next", - "rustversion", - "winapi", -] - [[package]] name = "termcolor" version = "1.1.2" @@ -2223,24 +2149,79 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" [[package]] name = "tracing" -version = "0.1.26" +version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d" +checksum = "375a639232caf30edfc78e8d89b2d4c375515393e7af7e16f01cd96917fb2105" dependencies = [ "cfg-if 1.0.0", "pin-project-lite", + "tracing-attributes", "tracing-core", ] [[package]] -name = "tracing-core" +name = "tracing-attributes" version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052" +checksum = "f4f480b8f81512e825f337ad51e94c1eb5d3bbdf2b363dcd01e2b19a9ffe3f8e" +dependencies = [ + 
"proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f4ed65637b8390770814083d20756f87bfa2c21bf2f110babdc5438351746e4" dependencies = [ "lazy_static", ] +[[package]] +name = "tracing-log" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +dependencies = [ + "lazy_static", + "log", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e0d2eaa99c3c2e41547cfa109e910a68ea03823cccad4a0525dcbc9b01e8c71" +dependencies = [ + "ansi_term 0.12.1", + "chrono", + "lazy_static", + "matchers", + "regex", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", +] + [[package]] name = "try-lock" version = "0.2.3" @@ -2339,11 +2320,13 @@ dependencies = [ "byteorder", "bytes", "clap", + "const_format", "crc32c", "daemonize", "fs2", "hex", "humantime", + "hyper", "lazy_static", "log", "pageserver", @@ -2351,6 +2334,7 @@ dependencies = [ "postgres-protocol", "postgres_ffi", "regex", + "routerify", "rust-s3", "serde", "serde_json", @@ -2358,6 +2342,7 @@ dependencies = [ "tokio-stream", "walkdir", "workspace_hack", + "zenith_metrics", "zenith_utils", ] @@ -2603,14 +2588,12 @@ dependencies = [ "rustls-split", "serde", "serde_json", - "slog", - "slog-async", - "slog-scope", - "slog-stdlog", - "slog-term", "tempfile", "thiserror", "tokio", + "tracing", + "tracing-log", + "tracing-subscriber", "webpki", 
"workspace_hack", "zenith_metrics", diff --git a/Dockerfile b/Dockerfile index b38bac4480..528f29597f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,7 +37,7 @@ RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl mkdir zenith_install COPY --from=build /zenith/target/release/pageserver /usr/local/bin -COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin +COPY --from=build /zenith/target/release/safekeeper /usr/local/bin COPY --from=build /zenith/target/release/proxy /usr/local/bin COPY --from=pg-build /zenith/tmp_install postgres_install COPY docker-entrypoint.sh /docker-entrypoint.sh diff --git a/Dockerfile.alpine b/Dockerfile.alpine index a2a2fea1a4..dafb7eaf6b 100644 --- a/Dockerfile.alpine +++ b/Dockerfile.alpine @@ -81,7 +81,7 @@ FROM alpine:3.13 RUN apk add --update openssl build-base libseccomp-dev RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb COPY --from=build /zenith/target/release/pageserver /usr/local/bin -COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin +COPY --from=build /zenith/target/release/safekeeper /usr/local/bin COPY --from=build /zenith/target/release/proxy /usr/local/bin COPY --from=pg-build /zenith/tmp_install /usr/local COPY docker-entrypoint.sh /docker-entrypoint.sh diff --git a/Makefile b/Makefile index 2edf2a6b4a..ef26ceee2d 100644 --- a/Makefile +++ b/Makefile @@ -10,32 +10,43 @@ endif # We differentiate between release / debug build types using the BUILD_TYPE # environment variable. 
# +BUILD_TYPE ?= debug ifeq ($(BUILD_TYPE),release) PG_CONFIGURE_OPTS = --enable-debug - PG_CFLAGS = -O2 -g3 ${CFLAGS} -else + PG_CFLAGS = -O2 -g3 $(CFLAGS) + # Unfortunately, `--profile=...` is a nightly feature + CARGO_BUILD_FLAGS += --release +else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend - PG_CFLAGS = -O0 -g3 ${CFLAGS} + PG_CFLAGS = -O0 -g3 $(CFLAGS) +else +$(error Bad build type `$(BUILD_TYPE)', see Makefile for options) endif +# Choose whether we should be silent or verbose +CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose) +# Fix for a corner case when make doesn't pass a jobserver +CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) + +# This option has a side effect of passing make jobserver to cargo. +# However, we shouldn't do this if `make -n` (--dry-run) has been asked. +CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) +# Force cargo not to print progress bar +CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 + # # Top level Makefile to build Zenith and PostgreSQL # +.PHONY: all all: zenith postgres -# We don't want to run 'cargo build' in parallel with the postgres build, -# because interleaving cargo build output with postgres build output looks -# confusing. Also, 'cargo build' is parallel on its own, so it would be too -# much parallelism. (Recursive invocation of postgres target still gets any -# '-j' flag from the command line, so 'make -j' is still useful.) -.NOTPARALLEL: - ### Zenith Rust bits # # The 'postgres_ffi' depends on the Postgres headers. 
.PHONY: zenith zenith: postgres-headers - cargo build + +@echo "Compiling Zenith" + $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) ### PostgreSQL parts tmp_install/build/config.status: @@ -57,10 +68,10 @@ postgres-headers: postgres-configure +@echo "Installing PostgreSQL headers" $(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install - # Compile and install PostgreSQL and contrib/zenith .PHONY: postgres -postgres: postgres-configure +postgres: postgres-configure \ + postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers` +@echo "Compiling PostgreSQL" $(MAKE) -C tmp_install/build MAKELEVEL=0 install +@echo "Compiling contrib/zenith" @@ -68,18 +79,21 @@ postgres: postgres-configure +@echo "Compiling contrib/zenith_test_utils" $(MAKE) -C tmp_install/build/contrib/zenith_test_utils install +.PHONY: postgres-clean postgres-clean: $(MAKE) -C tmp_install/build MAKELEVEL=0 clean # This doesn't remove the effects of 'configure'. +.PHONY: clean clean: - cd tmp_install/build && ${MAKE} clean - cargo clean + cd tmp_install/build && $(MAKE) clean + $(CARGO_CMD_PREFIX) cargo clean # This removes everything +.PHONY: distclean distclean: rm -rf tmp_install - cargo clean + $(CARGO_CMD_PREFIX) cargo clean .PHONY: fmt fmt: diff --git a/README.md b/README.md index 1e0f20fd45..977d015bfc 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libsec libssl-dev clang pkg-config libpq-dev ``` -[Rust] 1.52 or later is also required. +[Rust] 1.55 or later is also required. To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively. 
diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 5b4313494b..fb98eeca03 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -84,25 +84,53 @@ impl ComputeControlPlane { } } + // FIXME: see also parse_point_in_time in branches.rs. + fn parse_point_in_time( + &self, + tenantid: ZTenantId, + s: &str, + ) -> Result<(ZTimelineId, Option)> { + let mut strings = s.split('@'); + let name = strings.next().unwrap(); + + let lsn: Option; + if let Some(lsnstr) = strings.next() { + lsn = Some( + Lsn::from_str(lsnstr) + .with_context(|| "invalid LSN in point-in-time specification")?, + ); + } else { + lsn = None + } + + // Resolve the timeline ID, given the human-readable branch name + let timeline_id = self + .pageserver + .branch_get_by_name(&tenantid, name)? + .timeline_id; + + Ok((timeline_id, lsn)) + } + pub fn new_node( &mut self, tenantid: ZTenantId, - branch_name: &str, + name: &str, + timeline_spec: &str, port: Option, ) -> Result> { - let timeline_id = self - .pageserver - .branch_get_by_name(&tenantid, branch_name)? - .timeline_id; + // Resolve the human-readable timeline spec into timeline ID and LSN + let (timelineid, lsn) = self.parse_point_in_time(tenantid, timeline_spec)?; let port = port.unwrap_or_else(|| self.get_port()); let node = Arc::new(PostgresNode { - name: branch_name.to_owned(), + name: name.to_owned(), address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), is_test: false, - timelineid: timeline_id, + timelineid, + lsn, tenantid, uses_wal_proposer: false, }); @@ -127,6 +155,7 @@ pub struct PostgresNode { pageserver: Arc, is_test: bool, pub timelineid: ZTimelineId, + pub lsn: Option, // if it's a read-only node. 
None for primary pub tenantid: ZTenantId, uses_wal_proposer: bool, } @@ -161,9 +190,12 @@ impl PostgresNode { let port: u16 = conf.parse_field("port", &context)?; let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?; let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?; - let uses_wal_proposer = conf.get("wal_acceptors").is_some(); + // parse recovery_target_lsn, if any + let recovery_target_lsn: Option = + conf.parse_field_optional("recovery_target_lsn", &context)?; + // ok now Ok(PostgresNode { address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), @@ -172,6 +204,7 @@ impl PostgresNode { pageserver: Arc::clone(pageserver), is_test: false, timelineid, + lsn: recovery_target_lsn, tenantid, uses_wal_proposer, }) @@ -233,7 +266,7 @@ impl PostgresNode { // Read the archive directly from the `CopyOutReader` tar::Archive::new(copyreader) .unpack(&self.pgdata()) - .with_context(|| "extracting page backup failed")?; + .with_context(|| "extracting base backup failed")?; Ok(()) } @@ -301,6 +334,9 @@ impl PostgresNode { conf.append("zenith.page_server_connstring", &pageserver_connstr); conf.append("zenith.zenith_tenant", &self.tenantid.to_string()); conf.append("zenith.zenith_timeline", &self.timelineid.to_string()); + if let Some(lsn) = self.lsn { + conf.append("recovery_target_lsn", &lsn.to_string()); + } conf.append_line(""); // Configure the node to stream WAL directly to the pageserver @@ -314,7 +350,9 @@ impl PostgresNode { } fn load_basebackup(&self) -> Result<()> { - let lsn = if self.uses_wal_proposer { + let backup_lsn = if let Some(lsn) = self.lsn { + Some(lsn) + } else if self.uses_wal_proposer { // LSN 0 means that it is bootstrap and we need to download just // latest data from the pageserver. 
That is a bit clumsy but whole bootstrap // procedure evolves quite actively right now, so let's think about it again @@ -329,7 +367,7 @@ impl PostgresNode { None }; - self.do_basebackup(lsn)?; + self.do_basebackup(backup_lsn)?; Ok(()) } @@ -406,6 +444,10 @@ impl PostgresNode { // 3. Load basebackup self.load_basebackup()?; + if self.lsn.is_some() { + File::create(self.pgdata().join("standby.signal"))?; + } + // 4. Finally start the compute node postgres println!("Starting postgres node at '{}'", self.connstr()); self.pg_ctl(&["start"], auth_token) diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs index bcd463999b..7f50fe9c2f 100644 --- a/control_plane/src/postgresql_conf.rs +++ b/control_plane/src/postgresql_conf.rs @@ -83,6 +83,22 @@ impl PostgresConf { .with_context(|| format!("could not parse '{}' option {}", field_name, context)) } + pub fn parse_field_optional(&self, field_name: &str, context: &str) -> Result> + where + T: FromStr, + ::Err: std::error::Error + Send + Sync + 'static, + { + if let Some(val) = self.get(field_name) { + let result = val + .parse::() + .with_context(|| format!("could not parse '{}' option {}", field_name, context))?; + + Ok(Some(result)) + } else { + Ok(None) + } + } + /// /// Note: if you call this multiple times for the same option, the config /// file will a line for each call. 
It would be nice to have a function diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 9d762c360f..3d331ca2a7 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -199,23 +199,45 @@ impl PageServerNode { bail!("pageserver failed to start in {} seconds", RETRIES); } - pub fn stop(&self) -> anyhow::Result<()> { + pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { let pid = read_pidfile(&self.pid_file())?; let pid = Pid::from_raw(pid); - if kill(pid, Signal::SIGTERM).is_err() { - bail!("Failed to kill pageserver with pid {}", pid); + if immediate { + println!("Stop pageserver immediately"); + if kill(pid, Signal::SIGQUIT).is_err() { + bail!("Failed to kill pageserver with pid {}", pid); + } + } else { + println!("Stop pageserver gracefully"); + if kill(pid, Signal::SIGTERM).is_err() { + bail!("Failed to stop pageserver with pid {}", pid); + } } - // wait for pageserver stop let address = connection_address(&self.pg_connection_config); - for _ in 0..5 { - let stream = TcpStream::connect(&address); - thread::sleep(Duration::from_secs(1)); - if let Err(_e) = stream { - println!("Pageserver stopped"); - return Ok(()); + + // TODO Remove this "timeout" and handle it on caller side instead. + // Shutting down may take a long time, + // if pageserver checkpoints a lot of data + for _ in 0..100 { + if let Err(_e) = TcpStream::connect(&address) { + println!("Pageserver stopped receiving connections"); + + //Now check status + match self.check_status() { + Ok(_) => { + println!("Pageserver status is OK. 
Wait a bit."); + thread::sleep(Duration::from_secs(1)); + } + Err(err) => { + println!("Pageserver status is: {}", err); + return Ok(()); + } + } + } else { + println!("Pageserver still receives connections"); + thread::sleep(Duration::from_secs(1)); } - println!("Stopping pageserver on {}", address); } bail!("Failed to stop pageserver with pid {}", pid); @@ -313,8 +335,9 @@ impl PageServerNode { impl Drop for PageServerNode { fn drop(&mut self) { + // TODO Looks like this flag is never set if self.kill_on_exit { - let _ = self.stop(); + let _ = self.stop(true); } } } diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 3754c18193..566e77c1a4 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -7,7 +7,7 @@ if [ "$1" = 'pageserver' ]; then pageserver --init -D /data --postgres-distrib /usr/local fi echo "Staring pageserver at 0.0.0.0:6400" - pageserver -l 0.0.0.0:6400 -D /data + pageserver -l 0.0.0.0:6400 --listen-http 0.0.0.0:9898 -D /data else "$@" fi diff --git a/docs/docker.md b/docs/docker.md index 9a909ebfe3..14ba2146cb 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -4,7 +4,7 @@ Currently we build two main images: -- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `wal_acceptor` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). +- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). - [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres). 
And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos: diff --git a/docs/multitenancy.md b/docs/multitenancy.md index c9a95116c5..4f1d45e970 100644 --- a/docs/multitenancy.md +++ b/docs/multitenancy.md @@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline. Tenant id ### Safety -For now particular tenant can only appear on a particular pageserver. Set of WAL acceptors are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline). +For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline). diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 0e5a82df88..33c911c840 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -17,7 +17,7 @@ lazy_static = "1.4.0" log = "0.4.14" clap = "2.33.0" daemonize = "0.4.1" -tokio = { version = "1.11", features = ["process", "macros", "fs"] } +tokio = { version = "1.11", features = ["process", "macros", "fs", "rt"] } postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } @@ -35,6 +35,8 @@ scopeguard = "1.1.0" rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] } async-trait = "0.1" const_format = "0.2.21" +tracing = "0.1.27" +signal-hook = {version = "0.3.10", features = ["extended-siginfo"] } postgres_ffi = { path = "../postgres_ffi" } zenith_metrics = { path = "../zenith_metrics" } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 5389d609a5..def815a32d 
100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,6 +13,7 @@ use anyhow::Result; use bytes::{BufMut, BytesMut}; use log::*; +use std::fmt::Write as FmtWrite; use std::io; use std::io::Write; use std::sync::Arc; @@ -31,7 +32,7 @@ use zenith_utils::lsn::Lsn; pub struct Basebackup<'a> { ar: Builder<&'a mut dyn Write>, timeline: &'a Arc, - lsn: Lsn, + pub lsn: Lsn, prev_record_lsn: Lsn, } @@ -83,7 +84,7 @@ impl<'a> Basebackup<'a> { info!( "taking basebackup lsn={}, prev_lsn={}", - backup_prev, backup_lsn + backup_lsn, backup_prev ); Ok(Basebackup { @@ -97,7 +98,6 @@ impl<'a> Basebackup<'a> { pub fn send_tarball(&mut self) -> anyhow::Result<()> { // Create pgdata subdirs structure for dir in pg_constants::PGDATA_SUBDIRS.iter() { - info!("send subdir {:?}", *dir); let header = new_tar_header_dir(*dir)?; self.ar.append(&header, &mut io::empty())?; } @@ -249,13 +249,7 @@ impl<'a> Basebackup<'a> { let mut pg_control = ControlFileData::decode(&pg_control_bytes)?; let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - // Generate new pg_control and WAL needed for bootstrap - let checkpoint_segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE); - let checkpoint_lsn = XLogSegNoOffsetToRecPtr( - checkpoint_segno, - XLOG_SIZE_OF_XLOG_LONG_PHD as u32, - pg_constants::WAL_SEGMENT_SIZE, - ); + // Generate new pg_control needed for bootstrap checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0; //reset some fields we don't want to preserve @@ -264,19 +258,24 @@ impl<'a> Basebackup<'a> { checkpoint.oldestActiveXid = 0; //save new values in pg_control - pg_control.checkPoint = checkpoint_lsn; + pg_control.checkPoint = 0; pg_control.checkPointCopy = checkpoint; pg_control.state = pg_constants::DB_SHUTDOWNED; // add zenith.signal file - let xl_prev = if self.prev_record_lsn == Lsn(0) { - 0xBAD0 // magic value to indicate that we don't know prev_lsn + let mut zenith_signal = String::new(); + if self.prev_record_lsn 
== Lsn(0) { + if self.lsn == self.timeline.get_ancestor_lsn() { + write!(zenith_signal, "PREV LSN: none")?; + } else { + write!(zenith_signal, "PREV LSN: invalid")?; + } } else { - self.prev_record_lsn.0 - }; + write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; + } self.ar.append( - &new_tar_header("zenith.signal", 8)?, - &xl_prev.to_le_bytes()[..], + &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, + zenith_signal.as_bytes(), )?; //send pg_control @@ -285,14 +284,15 @@ impl<'a> Basebackup<'a> { self.ar.append(&header, &pg_control_bytes[..])?; //send wal segment + let segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE); let wal_file_name = XLogFileName( 1, // FIXME: always use Postgres timeline 1 - checkpoint_segno, + segno, pg_constants::WAL_SEGMENT_SIZE, ); let wal_file_path = format!("pg_wal/{}", wal_file_name); let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?; - let wal_seg = generate_wal_segment(&pg_control); + let wal_seg = generate_wal_segment(segno, pg_control.system_identifier); assert!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..])?; Ok(()) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index c763f98a7f..3a577476dc 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -2,8 +2,6 @@ // Main entry point for the Page Server executable // -use log::*; -use pageserver::defaults::*; use serde::{Deserialize, Serialize}; use std::{ env, @@ -12,27 +10,33 @@ use std::{ str::FromStr, thread, }; +use tracing::*; use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType}; use anyhow::{bail, ensure, Context, Result}; +use signal_hook::consts::signal::*; +use signal_hook::consts::TERM_SIGNALS; +use signal_hook::flag; +use signal_hook::iterator::exfiltrator::WithOrigin; +use signal_hook::iterator::SignalsInfo; +use std::process::exit; +use std::sync::atomic::AtomicBool; +use 
std::sync::Arc; + use clap::{App, Arg, ArgMatches}; use daemonize::Daemonize; use pageserver::{ - branches, - defaults::{ - DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR, - DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS, - }, - http, page_service, relish_storage, tenant_mgr, PageServerConf, RelishStorageConfig, - RelishStorageKind, S3Config, LOG_FILE_NAME, + branches, defaults::*, http, page_service, relish_storage, tenant_mgr, PageServerConf, + RelishStorageConfig, RelishStorageKind, S3Config, LOG_FILE_NAME, }; use zenith_utils::http::endpoint; +use zenith_utils::postgres_backend; use const_format::formatcp; /// String arguments that can be declared via CLI or config file -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)] struct CfgFileParams { listen_pg_addr: Option, listen_http_addr: Option, @@ -43,12 +47,21 @@ struct CfgFileParams { pg_distrib_dir: Option, auth_validation_public_key_path: Option, auth_type: Option, - // see https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for enum deserialisation examples - relish_storage: Option, relish_storage_max_concurrent_sync: Option, + ///////////////////////////////// + //// Don't put `Option` and other "simple" values below. + //// + /// `Option` is a table in TOML. + /// Values in TOML cannot be defined after tables (other tables can), + /// and [`toml`] crate serializes all fields in the order of their appearance. + //////////////////////////////// + relish_storage: Option, } -#[derive(Serialize, Deserialize, Clone)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)] +// Without this attribute, enums with values won't be serialized by the `toml` library (but can be deserialized nonetheless!). 
+// See https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for the examples +#[serde(untagged)] enum RelishStorage { Local { local_path: String, @@ -447,7 +460,18 @@ fn main() -> Result<()> { fn start_pageserver(conf: &'static PageServerConf) -> Result<()> { // Initialize logger - let (_scope_guard, log_file) = logging::init(LOG_FILE_NAME, conf.daemonize)?; + let log_file = logging::init(LOG_FILE_NAME, conf.daemonize)?; + + let term_now = Arc::new(AtomicBool::new(false)); + for sig in TERM_SIGNALS { + // When terminated by a second term signal, exit with exit code 1. + // This will do nothing the first time (because term_now is false). + flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?; + // But this will "arm" the above for the second time, by setting it to true. + // The order of registering these is important, if you put this one first, it will + // first arm and then terminate ‒ all in the first round. + flag::register(*sig, Arc::clone(&term_now))?; + } // TODO: Check that it looks like a valid repository before going further @@ -480,7 +504,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> { match daemonize.start() { Ok(_) => info!("Success, daemonized"), - Err(e) => error!("could not daemonize: {:#}", e), + Err(err) => error!(%err, "could not daemonize"), } } @@ -525,13 +549,173 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> { page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type) })?; - join_handles.push(page_service_thread); + for info in SignalsInfo::::new(TERM_SIGNALS)?.into_iter() { + match info.signal { + SIGQUIT => { + info!("Got SIGQUIT. Terminate pageserver in immediate shutdown mode"); + exit(111); + } + SIGINT | SIGTERM => { + info!("Got SIGINT/SIGTERM. 
Terminate gracefully in fast shutdown mode"); + // Terminate postgres backends + postgres_backend::set_pgbackend_shutdown_requested(); + // Stop all tenants and flush their data + tenant_mgr::shutdown_all_tenants()?; + // Wait for pageservice thread to complete the job + page_service_thread + .join() + .expect("thread panicked") + .expect("thread exited with an error"); - for handle in join_handles.into_iter() { - handle - .join() - .expect("thread panicked") - .expect("thread exited with an error") + // Shut down http router + endpoint::shutdown(); + + // Wait for all threads + for handle in join_handles.into_iter() { + handle + .join() + .expect("thread panicked") + .expect("thread exited with an error"); + } + info!("Pageserver shut down successfully completed"); + exit(0); + } + unknown_signal => { + debug!("Unknown signal {}", unknown_signal); + } + } } + Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn page_server_conf_toml_serde() { + let params = CfgFileParams { + listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()), + listen_http_addr: Some("listen_http_addr_VALUE".to_string()), + checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()), + checkpoint_period: Some("checkpoint_period_VALUE".to_string()), + gc_horizon: Some("gc_horizon_VALUE".to_string()), + gc_period: Some("gc_period_VALUE".to_string()), + pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()), + auth_validation_public_key_path: Some( + "auth_validation_public_key_path_VALUE".to_string(), + ), + auth_type: Some("auth_type_VALUE".to_string()), + relish_storage: Some(RelishStorage::Local { + local_path: "relish_storage_local_VALUE".to_string(), + }), + relish_storage_max_concurrent_sync: Some( + "relish_storage_max_concurrent_sync_VALUE".to_string(), + ), + }; + + let toml_string = toml::to_string(¶ms).expect("Failed to serialize correct config"); + let toml_pretty_string = + toml::to_string_pretty(¶ms).expect("Failed to serialize correct config"); + 
assert_eq!( + r#"listen_pg_addr = 'listen_pg_addr_VALUE' +listen_http_addr = 'listen_http_addr_VALUE' +checkpoint_distance = 'checkpoint_distance_VALUE' +checkpoint_period = 'checkpoint_period_VALUE' +gc_horizon = 'gc_horizon_VALUE' +gc_period = 'gc_period_VALUE' +pg_distrib_dir = 'pg_distrib_dir_VALUE' +auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE' +auth_type = 'auth_type_VALUE' +relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE' + +[relish_storage] +local_path = 'relish_storage_local_VALUE' +"#, + toml_pretty_string + ); + + let params_from_serialized: CfgFileParams = toml::from_str(&toml_string) + .expect("Failed to deserialize the serialization result of the config"); + let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string) + .expect("Failed to deserialize the prettified serialization result of the config"); + assert!( + params_from_serialized == params, + "Expected the same config in the end of config -> serialize -> deserialize chain" + ); + assert!( + params_from_serialized_pretty == params, + "Expected the same config in the end of config -> serialize pretty -> deserialize chain" + ); + } + + #[test] + fn credentials_omitted_during_serialization() { + let params = CfgFileParams { + listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()), + listen_http_addr: Some("listen_http_addr_VALUE".to_string()), + checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()), + checkpoint_period: Some("checkpoint_period_VALUE".to_string()), + gc_horizon: Some("gc_horizon_VALUE".to_string()), + gc_period: Some("gc_period_VALUE".to_string()), + pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()), + auth_validation_public_key_path: Some( + "auth_validation_public_key_path_VALUE".to_string(), + ), + auth_type: Some("auth_type_VALUE".to_string()), + relish_storage: Some(RelishStorage::AwsS3 { + bucket_name: "bucket_name_VALUE".to_string(), + bucket_region: 
"bucket_region_VALUE".to_string(), + access_key_id: Some("access_key_id_VALUE".to_string()), + secret_access_key: Some("secret_access_key_VALUE".to_string()), + }), + relish_storage_max_concurrent_sync: Some( + "relish_storage_max_concurrent_sync_VALUE".to_string(), + ), + }; + + let toml_string = toml::to_string(¶ms).expect("Failed to serialize correct config"); + let toml_pretty_string = + toml::to_string_pretty(¶ms).expect("Failed to serialize correct config"); + assert_eq!( + r#"listen_pg_addr = 'listen_pg_addr_VALUE' +listen_http_addr = 'listen_http_addr_VALUE' +checkpoint_distance = 'checkpoint_distance_VALUE' +checkpoint_period = 'checkpoint_period_VALUE' +gc_horizon = 'gc_horizon_VALUE' +gc_period = 'gc_period_VALUE' +pg_distrib_dir = 'pg_distrib_dir_VALUE' +auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE' +auth_type = 'auth_type_VALUE' +relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE' + +[relish_storage] +bucket_name = 'bucket_name_VALUE' +bucket_region = 'bucket_region_VALUE' +"#, + toml_pretty_string + ); + + let params_from_serialized: CfgFileParams = toml::from_str(&toml_string) + .expect("Failed to deserialize the serialization result of the config"); + let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string) + .expect("Failed to deserialize the prettified serialization result of the config"); + + let mut expected_params = params; + expected_params.relish_storage = Some(RelishStorage::AwsS3 { + bucket_name: "bucket_name_VALUE".to_string(), + bucket_region: "bucket_region_VALUE".to_string(), + access_key_id: None, + secret_access_key: None, + }); + assert!( + params_from_serialized == expected_params, + "Expected the config without credentials in the end of a 'config -> serialize -> deserialize' chain" + ); + assert!( + params_from_serialized_pretty == expected_params, + "Expected the config without credentials in the end of a 'config -> serialize pretty -> 
deserialize' chain" + ); + } +} diff --git a/pageserver/src/branches.rs b/pageserver/src/branches.rs index 57adf479ca..15e56b18e5 100644 --- a/pageserver/src/branches.rs +++ b/pageserver/src/branches.rs @@ -14,12 +14,12 @@ use std::{ str::FromStr, sync::Arc, }; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use tracing::*; -use log::*; use zenith_utils::crashsafe_dir; use zenith_utils::logging; use zenith_utils::lsn::Lsn; +use zenith_utils::zid::{ZTenantId, ZTimelineId}; use crate::tenant_mgr; use crate::walredo::WalRedoManager; @@ -100,7 +100,7 @@ pub struct PointInTime { pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> { // Initialize logger // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages - let (_scope_guard, _log_file) = logging::init(LOG_FILE_NAME, true)?; + let _log_file = logging::init(LOG_FILE_NAME, true)?; // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo // process during repository initialization. @@ -176,13 +176,16 @@ fn get_lsn_from_controlfile(path: &Path) -> Result { // to get bootstrap data for timeline initialization. // fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { - info!("running initdb... "); + info!("running initdb in {}... 
", initdbpath.display()); let initdb_path = conf.pg_bin_dir().join("initdb"); let initdb_output = Command::new(initdb_path) .args(&["-D", initdbpath.to_str().unwrap()]) .args(&["-U", &conf.superuser]) .arg("--no-instructions") + // This is only used for a temporary installation that is deleted shortly after, + // so no need to fsync it + .arg("--no-sync") .env_clear() .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) @@ -195,7 +198,6 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { String::from_utf8_lossy(&initdb_output.stderr) ); } - info!("initdb succeeded"); Ok(()) } @@ -210,6 +212,8 @@ fn bootstrap_timeline( tli: ZTimelineId, repo: &dyn Repository, ) -> Result<()> { + let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered(); + let initdb_path = conf.tenant_path(&tenantid).join("tmp"); // Init temporarily repo to get bootstrap data @@ -218,14 +222,12 @@ fn bootstrap_timeline( let lsn = get_lsn_from_controlfile(&pgdata_path)?.align(); - info!("bootstrap_timeline {:?} at lsn {}", pgdata_path, lsn); - // Import the contents of the data directory at the initial checkpoint // LSN, and any WAL after that. 
let timeline = repo.create_empty_timeline(tli)?; restore_local_repo::import_timeline_from_postgres_datadir( &pgdata_path, - timeline.as_ref(), + timeline.writer().as_ref(), lsn, )?; timeline.checkpoint()?; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index cd6b84b22f..cacb98ec84 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,4 +1,3 @@ -use std::str::FromStr; use std::sync::Arc; use anyhow::Result; @@ -6,6 +5,7 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use routerify::{ext::RequestExt, RouterBuilder}; +use tracing::*; use zenith_utils::auth::JwtAuth; use zenith_utils::http::endpoint::attach_openapi_ui; use zenith_utils::http::endpoint::auth_middleware; @@ -15,6 +15,8 @@ use zenith_utils::http::{ endpoint, error::HttpErrorBody, json::{json_request, json_response}, + request::get_request_param, + request::parse_request_param, }; use super::models::BranchCreateRequest; @@ -56,33 +58,6 @@ fn get_config(request: &Request) -> &'static PageServerConf { get_state(request).conf } -fn get_request_param<'a>( - request: &'a Request, - param_name: &str, -) -> Result<&'a str, ApiError> { - match request.param(param_name) { - Some(arg) => Ok(arg), - None => { - return Err(ApiError::BadRequest(format!( - "no {} specified in path param", - param_name - ))) - } - } -} - -fn parse_request_param( - request: &Request, - param_name: &str, -) -> Result { - match get_request_param(request, param_name)?.parse() { - Ok(v) => Ok(v), - Err(_) => Err(ApiError::BadRequest( - "failed to parse tenant id".to_string(), - )), - } -} - // healthcheck handler async fn status_handler(_: Request) -> Result, ApiError> { Ok(Response::builder() @@ -98,6 +73,7 @@ async fn branch_create_handler(mut request: Request) -> Result) -> Result, A check_permission(&request, Some(tenantid))?; let response_data = tokio::task::spawn_blocking(move || { + let _enter = info_span!("branch_list", tenant = 
%tenantid).entered(); crate::branches::get_branches(get_config(&request), &tenantid) }) .await @@ -126,11 +103,12 @@ async fn branch_list_handler(request: Request) -> Result, A // TODO add to swagger async fn branch_detail_handler(request: Request) -> Result, ApiError> { let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?; - let branch_name: &str = get_request_param(&request, "branch_name")?; + let branch_name: String = get_request_param(&request, "branch_name")?.to_string(); let conf = get_state(&request).conf; - let path = conf.branch_path(branch_name, &tenantid); + let path = conf.branch_path(&branch_name, &tenantid); let response_data = tokio::task::spawn_blocking(move || { + let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered(); let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; BranchInfo::from_path(path, conf, &tenantid, &repo) }) @@ -144,10 +122,13 @@ async fn tenant_list_handler(request: Request) -> Result, A // check for management permission check_permission(&request, None)?; - let response_data = - tokio::task::spawn_blocking(move || crate::branches::get_tenants(get_config(&request))) - .await - .map_err(ApiError::from_err)??; + let response_data = tokio::task::spawn_blocking(move || { + let _enter = info_span!("tenant_list").entered(); + crate::branches::get_tenants(get_config(&request)) + }) + .await + .map_err(ApiError::from_err)??; + Ok(json_response(StatusCode::OK, response_data)?) } @@ -158,6 +139,7 @@ async fn tenant_create_handler(mut request: Request) -> Result Result<()> { + trace!("LayeredRepository shutdown for tenant {}", self.tenantid); + + let timelines = self.timelines.lock().unwrap(); + for (timelineid, timeline) in timelines.iter() { + walreceiver::stop_wal_receiver(*timelineid); + // Wait for syncing data to disk + trace!("repo shutdown. 
checkpoint timeline {}", timelineid); + timeline.checkpoint()?; + + //TODO Wait for walredo process to shutdown too + } + + Ok(()) + } } /// Private functions @@ -239,6 +260,10 @@ impl LayeredRepository { None }; + let _enter = + info_span!("loading timeline", timeline = %timelineid, tenant = %self.tenantid) + .entered(); + let mut timeline = LayeredTimeline::new( self.conf, metadata, @@ -251,7 +276,16 @@ impl LayeredRepository { )?; // List the layers on disk, and load them into the layer map - timeline.load_layer_map(disk_consistent_lsn)?; + let _loaded_layers = timeline.load_layer_map(disk_consistent_lsn)?; + if self.upload_relishes { + schedule_timeline_upload(()); + // schedule_timeline_upload( + // self.tenantid, + // timelineid, + // loaded_layers, + // disk_consistent_lsn, + // ); + } // needs to be after load_layer_map timeline.init_current_logical_size()?; @@ -281,21 +315,24 @@ impl LayeredRepository { /// /// Launch the checkpointer thread in given repository. /// - pub fn launch_checkpointer_thread(conf: &'static PageServerConf, rc: Arc) { - let _thread = std::thread::Builder::new() + pub fn launch_checkpointer_thread( + conf: &'static PageServerConf, + rc: Arc, + ) -> JoinHandle<()> { + std::thread::Builder::new() .name("Checkpointer thread".into()) .spawn(move || { // FIXME: relaunch it? Panic is not good. rc.checkpoint_loop(conf).expect("Checkpointer thread died"); }) - .unwrap(); + .unwrap() } /// /// Checkpointer thread's main loop /// fn checkpoint_loop(&self, conf: &'static PageServerConf) -> Result<()> { - loop { + while !tenant_mgr::shutdown_requested() { std::thread::sleep(conf.checkpoint_period); info!("checkpointer thread for tenant {} waking up", self.tenantid); @@ -303,44 +340,60 @@ impl LayeredRepository { // bytes of WAL since last checkpoint. 
{ let timelines = self.timelines.lock().unwrap(); - for (_timelineid, timeline) in timelines.iter() { + for (timelineid, timeline) in timelines.iter() { + let _entered = + info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid) + .entered(); + STORAGE_TIME .with_label_values(&["checkpoint_timed"]) .observe_closure_duration(|| { - timeline.checkpoint_internal(conf.checkpoint_distance) + timeline.checkpoint_internal(conf.checkpoint_distance, false) })? } // release lock on 'timelines' } } + trace!("Checkpointer thread shut down"); + Ok(()) } /// /// Launch the GC thread in given repository. /// - pub fn launch_gc_thread(conf: &'static PageServerConf, rc: Arc) { - let _thread = std::thread::Builder::new() + pub fn launch_gc_thread( + conf: &'static PageServerConf, + rc: Arc, + ) -> JoinHandle<()> { + std::thread::Builder::new() .name("GC thread".into()) .spawn(move || { // FIXME: relaunch it? Panic is not good. rc.gc_loop(conf).expect("GC thread died"); }) - .unwrap(); + .unwrap() } /// /// GC thread's main loop /// fn gc_loop(&self, conf: &'static PageServerConf) -> Result<()> { - loop { - std::thread::sleep(conf.gc_period); - info!("gc thread for tenant {} waking up", self.tenantid); - + while !tenant_mgr::shutdown_requested() { // Garbage collect old files that are not needed for PITR anymore if conf.gc_horizon > 0 { self.gc_iteration(None, conf.gc_horizon, false).unwrap(); } + + // TODO Write it in more adequate way using + // condvar.wait_timeout() or something + let mut sleep_time = conf.gc_period.as_secs(); + while sleep_time > 0 && !tenant_mgr::shutdown_requested() { + sleep_time -= 1; + std::thread::sleep(Duration::from_secs(1)); + } + info!("gc thread for tenant {} waking up", self.tenantid); } + Ok(()) } /// Save timeline metadata to file @@ -350,17 +403,15 @@ impl LayeredRepository { tenantid: ZTenantId, data: &TimelineMetadata, first_save: bool, - ) -> Result { - let timeline_path = conf.timeline_path(&timelineid, &tenantid); - let path 
= timeline_path.join("metadata"); + ) -> Result<()> { + let _enter = info_span!("saving metadata").entered(); + let path = metadata_path(conf, timelineid, tenantid); // use OpenOptions to ensure file presence is consistent with first_save let mut file = OpenOptions::new() .write(true) .create_new(first_save) .open(&path)?; - info!("saving metadata {}", path.display()); - let mut metadata_bytes = TimelineMetadata::ser(data)?; assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE); @@ -376,11 +427,15 @@ impl LayeredRepository { // fsync the parent directory to ensure the directory entry is durable if first_save { - let timeline_dir = File::open(&timeline_path)?; + let timeline_dir = File::open( + &path + .parent() + .expect("Metadata should always have a parent dir"), + )?; timeline_dir.sync_all()?; } - Ok(path) + Ok(()) } fn load_metadata( @@ -388,7 +443,7 @@ impl LayeredRepository { timelineid: ZTimelineId, tenantid: ZTenantId, ) -> Result { - let path = conf.timeline_path(&timelineid, &tenantid).join("metadata"); + let path = metadata_path(conf, timelineid, tenantid); let metadata_bytes = std::fs::read(&path)?; ensure!(metadata_bytes.len() == METADATA_MAX_SAFE_SIZE); @@ -468,7 +523,7 @@ impl LayeredRepository { let timeline = self.get_timeline_locked(*timelineid, &mut *timelines)?; if let Some(ancestor_timeline) = &timeline.ancestor_timeline { - // If target_timeline is specified, we only need to know branchpoints of its childs + // If target_timeline is specified, we only need to know branchpoints of its children if let Some(timelineid) = target_timelineid { if ancestor_timeline.timelineid == timelineid { all_branchpoints @@ -485,6 +540,10 @@ impl LayeredRepository { // Ok, we now know all the branch points. // Perform GC for each timeline. for timelineid in timelineids { + if tenant_mgr::shutdown_requested() { + return Ok(totals); + } + // We have already loaded all timelines above // so this operation is just a quick map lookup. 
let timeline = self.get_timeline_locked(timelineid, &mut *timelines)?; @@ -608,10 +667,21 @@ pub struct LayeredTimeline { /// If `true`, will backup its timeline files to remote storage after freezing. upload_relishes: bool, + + /// Ensures layers aren't frozen by checkpointer between + /// [`LayeredTimeline::get_layer_for_write`] and layer reads. + /// Locked automatically by [`LayeredTimelineWriter`] and checkpointer. + /// Must always be acquired before the layer map/individual layer lock + /// to avoid deadlock. + write_lock: Mutex<()>, } /// Public interface functions impl Timeline for LayeredTimeline { + fn get_ancestor_lsn(&self) -> Lsn { + self.ancestor_lsn + } + /// Wait until WAL has been received up to the given LSN. fn wait_lsn(&self, lsn: Lsn) -> Result<()> { // This should never be called from the WAL receiver thread, because that could lead @@ -670,13 +740,7 @@ impl Timeline for LayeredTimeline { let segsize; if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { segsize = layer.get_seg_size(lsn)?; - trace!( - "get_seg_size: {} at {}/{} -> {}", - seg, - self.timelineid, - lsn, - segsize - ); + trace!("get_seg_size: {} at {} -> {}", seg, lsn, segsize); } else { if segno == 0 { return Ok(None); @@ -778,138 +842,13 @@ impl Timeline for LayeredTimeline { result.insert(new_relish); trace!("List object {}", new_relish); } else { - trace!("Filter out droped object {}", new_relish); + trace!("Filtered out dropped object {}", new_relish); } } Ok(result) } - fn put_wal_record(&self, rel: RelishTag, blknum: u32, rec: WALRecord) -> Result<()> { - if !rel.is_blocky() && blknum != 0 { - bail!( - "invalid request for block {} for non-blocky relish {}", - blknum, - rel - ); - } - ensure!(rec.lsn.is_aligned(), "unaligned record LSN"); - - let seg = SegmentTag::from_blknum(rel, blknum); - let delta_size = self.perform_write_op(seg, rec.lsn, |layer| { - layer.put_wal_record(blknum, rec.clone()) - })?; - self.increase_current_logical_size(delta_size * BLCKSZ 
as u32); - Ok(()) - } - - fn put_truncation(&self, rel: RelishTag, lsn: Lsn, relsize: u32) -> anyhow::Result<()> { - if !rel.is_blocky() { - bail!("invalid truncation for non-blocky relish {}", rel); - } - ensure!(lsn.is_aligned(), "unaligned record LSN"); - - debug!("put_truncation: {} to {} blocks at {}", rel, relsize, lsn); - - let oldsize = self - .get_relish_size(rel, self.get_last_record_lsn())? - .ok_or_else(|| { - anyhow!( - "attempted to truncate non-existent relish {} at {}", - rel, - lsn - ) - })?; - - if oldsize <= relsize { - return Ok(()); - } - let old_last_seg = (oldsize - 1) / RELISH_SEG_SIZE; - - let last_remain_seg = if relsize == 0 { - 0 - } else { - (relsize - 1) / RELISH_SEG_SIZE - }; - - // Drop segments beyond the last remaining segment. - for remove_segno in (last_remain_seg + 1)..=old_last_seg { - let seg = SegmentTag { - rel, - segno: remove_segno, - }; - self.perform_write_op(seg, lsn, |layer| layer.drop_segment(lsn))?; - } - - // Truncate the last remaining segment to the specified size - if relsize == 0 || relsize % RELISH_SEG_SIZE != 0 { - let seg = SegmentTag { - rel, - segno: last_remain_seg, - }; - self.perform_write_op(seg, lsn, |layer| { - layer.put_truncation(lsn, relsize % RELISH_SEG_SIZE) - })?; - } - self.decrease_current_logical_size((oldsize - relsize) * BLCKSZ as u32); - Ok(()) - } - - fn drop_relish(&self, rel: RelishTag, lsn: Lsn) -> Result<()> { - trace!("drop_segment: {} at {}", rel, lsn); - - if rel.is_blocky() { - if let Some(oldsize) = self.get_relish_size(rel, self.get_last_record_lsn())? 
{ - let old_last_seg = if oldsize == 0 { - 0 - } else { - (oldsize - 1) / RELISH_SEG_SIZE - }; - - // Drop all segments of the relish - for remove_segno in 0..=old_last_seg { - let seg = SegmentTag { - rel, - segno: remove_segno, - }; - self.perform_write_op(seg, lsn, |layer| layer.drop_segment(lsn))?; - } - self.decrease_current_logical_size(oldsize * BLCKSZ as u32); - } else { - warn!( - "drop_segment called on non-existent relish {} at {}", - rel, lsn - ); - } - } else { - // TODO handle TwoPhase relishes - let seg = SegmentTag::from_blknum(rel, 0); - self.perform_write_op(seg, lsn, |layer| layer.drop_segment(lsn))?; - } - - Ok(()) - } - - fn put_page_image(&self, rel: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()> { - if !rel.is_blocky() && blknum != 0 { - bail!( - "invalid request for block {} for non-blocky relish {}", - blknum, - rel - ); - } - ensure!(lsn.is_aligned(), "unaligned record LSN"); - - let seg = SegmentTag::from_blknum(rel, blknum); - - let delta_size = self.perform_write_op(seg, lsn, |layer| { - layer.put_page_image(blknum, lsn, img.clone()) - })?; - - self.increase_current_logical_size(delta_size * BLCKSZ as u32); - Ok(()) - } - /// Public entry point for checkpoint(). All the logic is in the private /// checkpoint_internal function, this public facade just wraps it for /// metrics collection. @@ -917,16 +856,7 @@ impl Timeline for LayeredTimeline { STORAGE_TIME .with_label_values(&["checkpoint_force"]) //pass checkpoint_distance=0 to force checkpoint - .observe_closure_duration(|| self.checkpoint_internal(0)) - } - - /// - /// Remember the (end of) last valid WAL record remembered in the timeline. 
- /// - fn advance_last_record_lsn(&self, new_lsn: Lsn) { - assert!(new_lsn.is_aligned()); - - self.last_record_lsn.advance(new_lsn); + .observe_closure_duration(|| self.checkpoint_internal(0, true)) } fn get_last_record_lsn(&self) -> Lsn { @@ -956,6 +886,8 @@ impl Timeline for LayeredTimeline { fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { let mut total_blocks: usize = 0; + let _enter = info_span!("calc logical size", %lsn).entered(); + // list of all relations in this timeline, including ancestor timelines let all_rels = self.list_rels(0, 0, lsn)?; @@ -980,6 +912,13 @@ impl Timeline for LayeredTimeline { fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() + } + + fn writer<'a>(&'a self) -> Box { + Box::new(LayeredTimelineWriter { + tl: self, + _write_guard: self.write_lock.lock().unwrap(), + }) } } @@ -1021,26 +960,26 @@ impl LayeredTimeline { current_logical_size: AtomicUsize::new(current_logical_size), current_logical_size_gauge, upload_relishes, + + write_lock: Mutex::new(()), }; Ok(timeline) } /// - /// Scan the timeline directory to populate the layer map + /// Scan the timeline directory to populate the layer map. + /// Returns all timeline-related files that were found and loaded. /// - fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { - info!( - "loading layer map for timeline {} into memory", - self.timelineid - ); + fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result> { let mut layers = self.layers.lock().unwrap(); + let mut num_layers = 0; let (imgfilenames, deltafilenames) = filename::list_files(self.conf, self.timelineid, self.tenantid)?; let timeline_path = self.conf.timeline_path(&self.timelineid, &self.tenantid); - + let mut local_layers = Vec::with_capacity(imgfilenames.len() + deltafilenames.len()); // First create ImageLayer structs for each image file. 
- for filename in imgfilenames.iter() { + for filename in &imgfilenames { if filename.lsn > disk_consistent_lsn { warn!( "found future image layer {} on timeline {}", @@ -1053,17 +992,13 @@ impl LayeredTimeline { let layer = ImageLayer::new(self.conf, self.timelineid, self.tenantid, filename); - info!( - "found layer {} {} on timeline {}", - layer.get_seg_tag(), - layer.get_start_lsn(), - self.timelineid - ); + trace!("found layer {}", layer.filename().display()); + local_layers.push(layer.path()); layers.insert_historic(Arc::new(layer)); + num_layers += 1; } - // Then for the Delta files. - for filename in deltafilenames.iter() { + for filename in &deltafilenames { ensure!(filename.start_lsn < filename.end_lsn); if filename.end_lsn > disk_consistent_lsn { warn!( @@ -1077,15 +1012,14 @@ impl LayeredTimeline { let layer = DeltaLayer::new(self.conf, self.timelineid, self.tenantid, filename); - info!( - "found layer {} on timeline {}", - layer.filename().display(), - self.timelineid, - ); + trace!("found layer {}", layer.filename().display()); + local_layers.push(layer.path()); layers.insert_historic(Arc::new(layer)); + num_layers += 1; } + info!("loaded layer map with {} layers", num_layers); - Ok(()) + Ok(local_layers) } /// @@ -1134,12 +1068,7 @@ impl LayeredTimeline { lsn: Lsn, self_layers: &MutexGuard, ) -> Result, Lsn)>> { - trace!( - "get_layer_for_read called for {} at {}/{}", - seg, - self.timelineid, - lsn - ); + trace!("get_layer_for_read called for {} at {}", seg, lsn); // If you requested a page at an older LSN, before the branch point, dig into // the right ancestor timeline. 
This can only happen if you launch a read-only @@ -1257,17 +1186,15 @@ impl LayeredTimeline { // First modification on this timeline start_lsn = self.ancestor_lsn + 1; trace!( - "creating layer for write for {} at branch point {}/{}", + "creating layer for write for {} at branch point {}", seg, - self.timelineid, start_lsn ); } else { start_lsn = prev_layer.get_end_lsn(); trace!( - "creating layer for write for {} after previous layer {}/{}", + "creating layer for write for {} after previous layer {}", seg, - self.timelineid, start_lsn ); } @@ -1308,31 +1235,20 @@ impl LayeredTimeline { /// Flush to disk all data that was written with the put_* functions /// /// NOTE: This has nothing to do with checkpoint in PostgreSQL. - fn checkpoint_internal(&self, checkpoint_distance: u64) -> Result<()> { - // Grab lock on the layer map. - // - // TODO: We hold it locked throughout the checkpoint operation. That's bad, - // the checkpointing could take many seconds, and any incoming get_page_at_lsn() - // requests will block. + fn checkpoint_internal(&self, checkpoint_distance: u64, forced: bool) -> Result<()> { + let mut write_guard = self.write_lock.lock().unwrap(); let mut layers = self.layers.lock().unwrap(); // Bump the generation number in the layer map, so that we can distinguish // entries inserted after the checkpoint started let current_generation = layers.increment_generation(); - // Read 'last_record_lsn'. That becomes the cutoff LSN for frozen layers. let RecordLsn { last: last_record_lsn, prev: prev_record_lsn, } = self.last_record_lsn.load(); - trace!( - "checkpointing timeline {} at {}", - self.timelineid, - last_record_lsn - ); - - let timeline_dir = File::open(self.conf.timeline_path(&self.timelineid, &self.tenantid))?; + trace!("checkpoint starting at {}", last_record_lsn); // Take the in-memory layer with the oldest WAL record. If it's older // than the threshold, write it out to disk as a new image and delta file. 
@@ -1346,10 +1262,14 @@ impl LayeredTimeline { let mut disk_consistent_lsn = last_record_lsn; let mut created_historics = false; - + let mut layer_uploads = Vec::new(); while let Some((oldest_layer, oldest_generation)) = layers.peek_oldest_open() { let oldest_pending_lsn = oldest_layer.get_oldest_pending_lsn(); + if tenant_mgr::shutdown_requested() && !forced { + return Ok(()); + } + // Does this layer need freezing? // // Write out all in-memory layers that contain WAL older than CHECKPOINT_DISTANCE. @@ -1372,32 +1292,24 @@ impl LayeredTimeline { break; } - // Freeze the layer. - // - // This is a two-step process. First, we "freeze" the in-memory - // layer, to close it for new writes, and replace the original - // layer with the new frozen in-memory layer (and possibly a new - // open layer to hold changes newer than the cutoff.) Then we write - // the frozen layer to disk, and replace the in-memory frozen layer - // with the new on-disk layers. - let FreezeLayers { - frozen, - open: maybe_new_open, - } = oldest_layer.freeze(last_record_lsn)?; + // Mark the layer as no longer accepting writes and record the end_lsn. + // This happens in-place, no new layers are created now. + // We call `get_last_record_lsn` again, which may be different from the + // original load, as we may have released the write lock since then. + oldest_layer.freeze(self.get_last_record_lsn()); - // replace this layer with the new layers that 'freeze' returned + // The layer is no longer open, update the layer map to reflect this. + // We will replace it with on-disk historics below. layers.pop_oldest_open(); - if let Some(new_open) = maybe_new_open.clone() { - layers.insert_open(new_open); - } - - // We temporarily insert InMemory layer into historic list here. - // TODO: check that all possible concurrent users of 'historic' treat it right - layers.insert_historic(frozen.clone()); + layers.insert_historic(oldest_layer.clone()); // Write the now-frozen layer to disk. 
That could take a while, so release the lock while do it drop(layers); - let new_historics = frozen.write_to_disk(self)?; + drop(write_guard); + + let new_historics = oldest_layer.write_to_disk(self)?; + + write_guard = self.write_lock.lock().unwrap(); layers = self.layers.lock().unwrap(); if !new_historics.is_empty() { @@ -1405,11 +1317,16 @@ impl LayeredTimeline { } // Finally, replace the frozen in-memory layer with the new on-disk layers - layers.remove_historic(frozen.clone()); + layers.remove_historic(oldest_layer); // Add the historics to the LayerMap - for n in new_historics { - layers.insert_historic(n); + for delta_layer in new_historics.delta_layers { + layer_uploads.push(delta_layer.path()); + layers.insert_historic(Arc::new(delta_layer)); + } + for image_layer in new_historics.image_layers { + layer_uploads.push(image_layer.path()); + layers.insert_historic(Arc::new(image_layer)); } } @@ -1421,59 +1338,64 @@ impl LayeredTimeline { } drop(layers); + drop(write_guard); if created_historics { // We must fsync the timeline dir to ensure the directory entries for // new layer files are durable + let timeline_dir = + File::open(self.conf.timeline_path(&self.timelineid, &self.tenantid))?; timeline_dir.sync_all()?; } - // Save the metadata, with updated 'disk_consistent_lsn', to a - // file in the timeline dir. After crash, we will restart WAL - // streaming and processing from that point. + // If we were able to advance 'disk_consistent_lsn', save it the metadata file. + // After crash, we will restart WAL streaming and processing from that point. + let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); + if disk_consistent_lsn != old_disk_consistent_lsn { + assert!(disk_consistent_lsn > old_disk_consistent_lsn); - // We can only save a valid 'prev_record_lsn' value on disk if we - // flushed *all* in-memory changes to disk. 
We only track - // 'prev_record_lsn' in memory for the latest processed record, so we - // don't remember what the correct value that corresponds to some old - // LSN is. But if we flush everything, then the value corresponding - // current 'last_record_lsn' is correct and we can store it on disk. - let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { - Some(prev_record_lsn) - } else { - None - }; + // We can only save a valid 'prev_record_lsn' value on disk if we + // flushed *all* in-memory changes to disk. We only track + // 'prev_record_lsn' in memory for the latest processed record, so we + // don't remember what the correct value that corresponds to some old + // LSN is. But if we flush everything, then the value corresponding + // current 'last_record_lsn' is correct and we can store it on disk. + let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { + Some(prev_record_lsn) + } else { + None + }; - let ancestor_timelineid = self.ancestor_timeline.as_ref().map(|x| x.timelineid); + let ancestor_timelineid = self.ancestor_timeline.as_ref().map(|x| x.timelineid); - let metadata = TimelineMetadata { - disk_consistent_lsn, - prev_record_lsn: ondisk_prev_record_lsn, - ancestor_timeline: ancestor_timelineid, - ancestor_lsn: self.ancestor_lsn, - }; - let _metadata_path = LayeredRepository::save_metadata( - self.conf, - self.timelineid, - self.tenantid, - &metadata, - false, - )?; - if self.upload_relishes { - schedule_timeline_upload(()) - // schedule_timeline_upload(LocalTimeline { - // tenant_id: self.tenantid, - // timeline_id: self.timelineid, - // metadata_path, - // image_layers: image_layer_uploads, - // delta_layers: delta_layer_uploads, - // disk_consistent_lsn, - // }); + let metadata = TimelineMetadata { + disk_consistent_lsn, + prev_record_lsn: ondisk_prev_record_lsn, + ancestor_timeline: ancestor_timelineid, + ancestor_lsn: self.ancestor_lsn, + }; + LayeredRepository::save_metadata( + self.conf, + self.timelineid, + 
self.tenantid, + &metadata, + false, + )?; + + // Also update the in-memory copy + self.disk_consistent_lsn.store(disk_consistent_lsn); + + if self.upload_relishes { + schedule_timeline_upload(()) + // schedule_timeline_upload( + // self.tenantid, + // self.timelineid, + // layer_uploads, + // disk_consistent_lsn, + // }); + } } - // Also update the in-memory copy - self.disk_consistent_lsn.store(disk_consistent_lsn); - Ok(()) } @@ -1503,11 +1425,11 @@ impl LayeredTimeline { let now = Instant::now(); let mut result: GcResult = Default::default(); - info!( - "running GC on timeline {}, cutoff {}", - self.timelineid, cutoff - ); - info!("retain_lsns: {:?}", retain_lsns); + let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered(); + + info!("GC starting"); + + debug!("retain_lsns: {:?}", retain_lsns); let mut layers_to_remove: Vec> = Vec::new(); @@ -1769,10 +1691,9 @@ impl LayeredTimeline { if data.records.is_empty() { if let Some(img) = &data.page_img { trace!( - "found page image for blk {} in {} at {}/{}, no WAL redo required", + "found page image for blk {} in {} at {}, no WAL redo required", blknum, rel, - self.timelineid, request_lsn ); Ok(img.clone()) @@ -1786,7 +1707,7 @@ impl LayeredTimeline { // // If we don't have a base image, then the oldest WAL record better initialize // the page - if data.page_img.is_none() && !data.records.first().unwrap().will_init { + if data.page_img.is_none() && !data.records.first().unwrap().1.will_init { // FIXME: this ought to be an error? 
warn!( "Base image for page {}/{} at {} not found, but got {} WAL records", @@ -1798,9 +1719,9 @@ impl LayeredTimeline { Ok(ZERO_PAGE.clone()) } else { if data.page_img.is_some() { - trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", data.records.len(), blknum, rel, self.timelineid, request_lsn); + trace!("found {} WAL records and a base image for blk {} in {} at {}, performing WAL redo", data.records.len(), blknum, rel, request_lsn); } else { - trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", data.records.len(), blknum, rel, self.timelineid, request_lsn); + trace!("found {} WAL records that will init the page for blk {} in {} at {}, performing WAL redo", data.records.len(), blknum, rel, request_lsn); } let img = self.walredo_mgr.request_redo( rel, @@ -1848,36 +1769,163 @@ impl LayeredTimeline { self.current_logical_size_gauge .set(val as i64 - diff as i64); } +} - /// If a layer is in the process of being replaced in [`LayerMap`], write - /// operations will fail with [`NonWriteableError`]. This may happen due to - /// a race: the checkpointer thread freezes a layer just after - /// [`Self::get_layer_for_write`] returned it. To handle this error, we try - /// again getting the layer and attempt the write. 
- fn perform_write_op( - &self, - seg: SegmentTag, - lsn: Lsn, - write_op: impl Fn(&Arc) -> WriteResult, - ) -> anyhow::Result { - let mut layer = self.get_layer_for_write(seg, lsn)?; - loop { - match write_op(&layer) { - Ok(r) => return Ok(r), - Err(NonWriteableError {}) => {} - } +struct LayeredTimelineWriter<'a> { + tl: &'a LayeredTimeline, + _write_guard: MutexGuard<'a, ()>, +} - info!( - "attempted to write to non-writeable layer, retrying {} {}", - seg, lsn +impl Deref for LayeredTimelineWriter<'_> { + type Target = dyn Timeline; + + fn deref(&self) -> &Self::Target { + self.tl + } +} + +impl<'a> TimelineWriter for LayeredTimelineWriter<'a> { + fn put_wal_record(&self, lsn: Lsn, rel: RelishTag, blknum: u32, rec: WALRecord) -> Result<()> { + if !rel.is_blocky() && blknum != 0 { + bail!( + "invalid request for block {} for non-blocky relish {}", + blknum, + rel ); - - // layer was non-writeable, try again - let new_layer = self.get_layer_for_write(seg, lsn)?; - // the new layer does not have to be writeable, but it should at least be different - assert!(!Arc::ptr_eq(&layer, &new_layer)); - layer = new_layer; } + ensure!(lsn.is_aligned(), "unaligned record LSN"); + + let seg = SegmentTag::from_blknum(rel, blknum); + let layer = self.tl.get_layer_for_write(seg, lsn)?; + let delta_size = layer.put_wal_record(lsn, blknum, rec); + self.tl + .increase_current_logical_size(delta_size * BLCKSZ as u32); + Ok(()) + } + + fn put_page_image(&self, rel: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()> { + if !rel.is_blocky() && blknum != 0 { + bail!( + "invalid request for block {} for non-blocky relish {}", + blknum, + rel + ); + } + ensure!(lsn.is_aligned(), "unaligned record LSN"); + + let seg = SegmentTag::from_blknum(rel, blknum); + + let layer = self.tl.get_layer_for_write(seg, lsn)?; + let delta_size = layer.put_page_image(blknum, lsn, img); + + self.tl + .increase_current_logical_size(delta_size * BLCKSZ as u32); + Ok(()) + } + + fn put_truncation(&self, 
rel: RelishTag, lsn: Lsn, relsize: u32) -> Result<()> { + if !rel.is_blocky() { + bail!("invalid truncation for non-blocky relish {}", rel); + } + ensure!(lsn.is_aligned(), "unaligned record LSN"); + + debug!("put_truncation: {} to {} blocks at {}", rel, relsize, lsn); + + let oldsize = self + .tl + .get_relish_size(rel, self.tl.get_last_record_lsn())? + .ok_or_else(|| { + anyhow!( + "attempted to truncate non-existent relish {} at {}", + rel, + lsn + ) + })?; + + if oldsize <= relsize { + return Ok(()); + } + let old_last_seg = (oldsize - 1) / RELISH_SEG_SIZE; + + let last_remain_seg = if relsize == 0 { + 0 + } else { + (relsize - 1) / RELISH_SEG_SIZE + }; + + // Drop segments beyond the last remaining segment. + for remove_segno in (last_remain_seg + 1)..=old_last_seg { + let seg = SegmentTag { + rel, + segno: remove_segno, + }; + + let layer = self.tl.get_layer_for_write(seg, lsn)?; + layer.drop_segment(lsn); + } + + // Truncate the last remaining segment to the specified size + if relsize == 0 || relsize % RELISH_SEG_SIZE != 0 { + let seg = SegmentTag { + rel, + segno: last_remain_seg, + }; + let layer = self.tl.get_layer_for_write(seg, lsn)?; + layer.put_truncation(lsn, relsize % RELISH_SEG_SIZE) + } + self.tl + .decrease_current_logical_size((oldsize - relsize) * BLCKSZ as u32); + Ok(()) + } + + fn drop_relish(&self, rel: RelishTag, lsn: Lsn) -> Result<()> { + trace!("drop_segment: {} at {}", rel, lsn); + + if rel.is_blocky() { + if let Some(oldsize) = self + .tl + .get_relish_size(rel, self.tl.get_last_record_lsn())? 
+ { + let old_last_seg = if oldsize == 0 { + 0 + } else { + (oldsize - 1) / RELISH_SEG_SIZE + }; + + // Drop all segments of the relish + for remove_segno in 0..=old_last_seg { + let seg = SegmentTag { + rel, + segno: remove_segno, + }; + let layer = self.tl.get_layer_for_write(seg, lsn)?; + layer.drop_segment(lsn); + } + self.tl + .decrease_current_logical_size(oldsize * BLCKSZ as u32); + } else { + warn!( + "drop_segment called on non-existent relish {} at {}", + rel, lsn + ); + } + } else { + // TODO handle TwoPhase relishes + let seg = SegmentTag::from_blknum(rel, 0); + let layer = self.tl.get_layer_for_write(seg, lsn)?; + layer.drop_segment(lsn); + } + + Ok(()) + } + + /// + /// Remember the (end of) last valid WAL record remembered in the timeline. + /// + fn advance_last_record_lsn(&self, new_lsn: Lsn) { + assert!(new_lsn.is_aligned()); + + self.tl.last_record_lsn.advance(new_lsn); } } @@ -1899,6 +1947,15 @@ pub fn dump_layerfile_from_path(path: &Path) -> Result<()> { Ok(()) } +fn metadata_path( + conf: &'static PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, +) -> PathBuf { + conf.timeline_path(&timelineid, &tenantid) + .join(METADATA_FILE_NAME) +} + /// Add a suffix to a layer file's name: .{num}.old /// Uses the first available num (starts at 0) fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index ad16a86030..24ed9d6e69 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -48,7 +48,7 @@ use crate::{ZTenantId, ZTimelineId}; use anyhow::{bail, ensure, Result}; use log::*; use serde::{Deserialize, Serialize}; -use std::collections::BTreeMap; +use zenith_utils::vec_map::VecMap; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods use std::fmt::Write as _; @@ -141,10 +141,10 @@ pub struct 
DeltaLayerInner { /// All versions of all pages in the file are are kept here. /// Indexed by block number and LSN. - page_version_metas: BTreeMap<(u32, Lsn), BlobRange>, + page_version_metas: VecMap<(u32, Lsn), BlobRange>, /// `relsizes` tracks the size of the relation at different points in time. - relsizes: BTreeMap, + relsizes: VecMap, } impl Layer for DeltaLayer { @@ -169,29 +169,7 @@ impl Layer for DeltaLayer { } fn filename(&self) -> PathBuf { - PathBuf::from( - DeltaFileName { - seg: self.seg, - start_lsn: self.start_lsn, - end_lsn: self.end_lsn, - dropped: self.dropped, - } - .to_string(), - ) - } - - fn path(&self) -> Option { - Some(Self::path_for( - &self.path_or_conf, - self.timelineid, - self.tenantid, - &DeltaFileName { - seg: self.seg, - start_lsn: self.start_lsn, - end_lsn: self.end_lsn, - dropped: self.dropped, - }, - )) + PathBuf::from(self.layer_name().to_string()) } /// Look up given page in the cache. @@ -215,10 +193,12 @@ impl Layer for DeltaLayer { // Scan the metadata BTreeMap backwards, starting from the given entry. let minkey = (blknum, Lsn(0)); let maxkey = (blknum, lsn); - let mut iter = inner + let iter = inner .page_version_metas - .range((Included(&minkey), Included(&maxkey))); - while let Some(((_blknum, _entry_lsn), blob_range)) = iter.next_back() { + .slice_range((Included(&minkey), Included(&maxkey))) + .iter() + .rev(); + for ((_blknum, pv_lsn), blob_range) in iter { let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?; if let Some(img) = pv.page_image { @@ -228,7 +208,7 @@ impl Layer for DeltaLayer { break; } else if let Some(rec) = pv.record { let will_init = rec.will_init; - reconstruct_data.records.push(rec); + reconstruct_data.records.push((*pv_lsn, rec)); if will_init { // This WAL record initializes the page, so no need to go further back need_image = false; @@ -262,15 +242,15 @@ impl Layer for DeltaLayer { // Scan the BTreeMap backwards, starting from the given entry. 
let inner = self.load()?; - let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn))); + let slice = inner + .relsizes + .slice_range((Included(&Lsn(0)), Included(&lsn))); - let result; - if let Some((_entry_lsn, entry)) = iter.next_back() { - result = *entry; + if let Some((_entry_lsn, entry)) = slice.last() { + Ok(*entry) } else { - bail!("could not find seg size in delta layer"); + Err(anyhow::anyhow!("could not find seg size in delta layer")) } - Ok(result) } /// Does this segment exist at given LSN? @@ -290,17 +270,15 @@ impl Layer for DeltaLayer { /// fn unload(&self) -> Result<()> { let mut inner = self.inner.lock().unwrap(); - inner.page_version_metas = BTreeMap::new(); - inner.relsizes = BTreeMap::new(); + inner.page_version_metas = VecMap::default(); + inner.relsizes = VecMap::default(); inner.loaded = false; Ok(()) } fn delete(&self) -> Result<()> { // delete underlying file - if let Some(path) = self.path() { - fs::remove_file(path)?; - } + fs::remove_file(self.path())?; Ok(()) } @@ -317,13 +295,13 @@ impl Layer for DeltaLayer { println!("--- relsizes ---"); let inner = self.load()?; - for (k, v) in inner.relsizes.iter() { + for (k, v) in inner.relsizes.as_slice() { println!(" {}: {}", k, v); } println!("--- page versions ---"); let (_path, book) = self.open_book()?; let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?; - for ((blk, lsn), blob_range) in inner.page_version_metas.iter() { + for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() { let mut desc = String::new(); let buf = read_blob(&chapter, blob_range)?; @@ -380,8 +358,8 @@ impl DeltaLayer { start_lsn: Lsn, end_lsn: Lsn, dropped: bool, - page_versions: impl Iterator, - relsizes: BTreeMap, + page_versions: impl Iterator, + relsizes: VecMap, ) -> Result { if seg.rel.is_blocky() { assert!(!relsizes.is_empty()); @@ -397,16 +375,14 @@ impl DeltaLayer { dropped, inner: Mutex::new(DeltaLayerInner { loaded: true, - page_version_metas: BTreeMap::new(), + 
page_version_metas: VecMap::default(), relsizes, }), }; let mut inner = delta_layer.inner.lock().unwrap(); // Write the in-memory btreemaps into a file - let path = delta_layer - .path() - .expect("DeltaLayer is supposed to have a layer path on disk"); + let path = delta_layer.path(); // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? @@ -416,26 +392,27 @@ impl DeltaLayer { let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER); - for (key, page_version) in page_versions { + for (blknum, lsn, page_version) in page_versions { let buf = PageVersion::ser(page_version)?; let blob_range = page_version_writer.write_blob(&buf)?; - let old = inner.page_version_metas.insert(*key, blob_range); - - assert!(old.is_none()); + inner + .page_version_metas + .append((blknum, lsn), blob_range) + .unwrap(); } let book = page_version_writer.close()?; // Write out page versions let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER); - let buf = BTreeMap::ser(&inner.page_version_metas)?; + let buf = VecMap::ser(&inner.page_version_metas)?; chapter.write_all(&buf)?; let book = chapter.close()?; // and relsizes to separate chapter let mut chapter = book.new_chapter(REL_SIZES_CHAPTER); - let buf = BTreeMap::ser(&inner.relsizes)?; + let buf = VecMap::ser(&inner.relsizes)?; chapter.write_all(&buf)?; let book = chapter.close()?; @@ -469,12 +446,7 @@ impl DeltaLayer { &self.path_or_conf, self.timelineid, self.tenantid, - &DeltaFileName { - seg: self.seg, - start_lsn: self.start_lsn, - end_lsn: self.end_lsn, - dropped: self.dropped, - }, + &self.layer_name(), ); let file = File::open(&path)?; @@ -522,10 +494,10 @@ impl DeltaLayer { } let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?; - let page_version_metas = BTreeMap::des(&chapter)?; + let page_version_metas = VecMap::des(&chapter)?; let chapter = book.read_chapter(REL_SIZES_CHAPTER)?; - let relsizes = BTreeMap::des(&chapter)?; + let relsizes = 
VecMap::des(&chapter)?; debug!("loaded from {}", &path.display()); @@ -555,8 +527,8 @@ impl DeltaLayer { dropped: filename.dropped, inner: Mutex::new(DeltaLayerInner { loaded: false, - page_version_metas: BTreeMap::new(), - relsizes: BTreeMap::new(), + page_version_metas: VecMap::default(), + relsizes: VecMap::default(), }), } } @@ -578,9 +550,28 @@ impl DeltaLayer { dropped: summary.dropped, inner: Mutex::new(DeltaLayerInner { loaded: false, - page_version_metas: BTreeMap::new(), - relsizes: BTreeMap::new(), + page_version_metas: VecMap::default(), + relsizes: VecMap::default(), }), }) } + + fn layer_name(&self) -> DeltaFileName { + DeltaFileName { + seg: self.seg, + start_lsn: self.start_lsn, + end_lsn: self.end_lsn, + dropped: self.dropped, + } + } + + /// Path to the layer file in pageserver workdir. + pub fn path(&self) -> PathBuf { + Self::path_for( + &self.path_or_conf, + self.timelineid, + self.tenantid, + &self.layer_name(), + ) + } } diff --git a/pageserver/src/layered_repository/filename.rs b/pageserver/src/layered_repository/filename.rs index 50bfe2977e..afa106f939 100644 --- a/pageserver/src/layered_repository/filename.rs +++ b/pageserver/src/layered_repository/filename.rs @@ -13,6 +13,8 @@ use anyhow::Result; use log::*; use zenith_utils::lsn::Lsn; +use super::METADATA_FILE_NAME; + // Note: LayeredTimeline::load_layer_map() relies on this sort order #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] pub struct DeltaFileName { @@ -35,7 +37,7 @@ impl DeltaFileName { /// Parse a string as a delta file name. Returns None if the filename does not /// match the expected pattern. /// - pub fn from_str(fname: &str) -> Option { + pub fn parse_str(fname: &str) -> Option { let rel; let mut parts; if let Some(rest) = fname.strip_prefix("rel_") { @@ -168,7 +170,7 @@ impl ImageFileName { /// Parse a string as an image file name. Returns None if the filename does not /// match the expected pattern. 
/// - pub fn from_str(fname: &str) -> Option { + pub fn parse_str(fname: &str) -> Option { let rel; let mut parts; if let Some(rest) = fname.strip_prefix("rel_") { @@ -286,11 +288,11 @@ pub fn list_files( let fname = direntry?.file_name(); let fname = fname.to_str().unwrap(); - if let Some(deltafilename) = DeltaFileName::from_str(fname) { + if let Some(deltafilename) = DeltaFileName::parse_str(fname) { deltafiles.push(deltafilename); - } else if let Some(imgfilename) = ImageFileName::from_str(fname) { + } else if let Some(imgfilename) = ImageFileName::parse_str(fname) { imgfiles.push(imgfilename); - } else if fname == "metadata" || fname == "ancestor" || fname.ends_with(".old") { + } else if fname == METADATA_FILE_NAME || fname == "ancestor" || fname.ends_with(".old") { // ignore these } else { warn!("unrecognized filename in timeline dir: {}", fname); diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index a9487a02d4..744f793558 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -114,25 +114,7 @@ pub struct ImageLayerInner { impl Layer for ImageLayer { fn filename(&self) -> PathBuf { - PathBuf::from( - ImageFileName { - seg: self.seg, - lsn: self.lsn, - } - .to_string(), - ) - } - - fn path(&self) -> Option { - Some(Self::path_for( - &self.path_or_conf, - self.timelineid, - self.tenantid, - &ImageFileName { - seg: self.seg, - lsn: self.lsn, - }, - )) + PathBuf::from(self.layer_name().to_string()) } fn get_timeline_id(&self) -> ZTimelineId { @@ -222,9 +204,7 @@ impl Layer for ImageLayer { fn delete(&self) -> Result<()> { // delete underlying file - if let Some(path) = self.path() { - fs::remove_file(path)?; - } + fs::remove_file(self.path())?; Ok(()) } @@ -300,9 +280,7 @@ impl ImageLayer { let inner = layer.inner.lock().unwrap(); // Write the images into a file - let path = layer - .path() - .expect("ImageLayer is supposed to have a 
layer path on disk"); + let path = layer.path(); // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? let file = File::create(&path)?; @@ -340,7 +318,7 @@ impl ImageLayer { let writer = book.close()?; writer.get_ref().sync_all()?; - trace!("saved {}", &path.display()); + trace!("saved {}", path.display()); drop(inner); @@ -445,15 +423,7 @@ impl ImageLayer { } fn open_book(&self) -> Result<(PathBuf, Book)> { - let path = Self::path_for( - &self.path_or_conf, - self.timelineid, - self.tenantid, - &ImageFileName { - seg: self.seg, - lsn: self.lsn, - }, - ); + let path = self.path(); let file = File::open(&path)?; let book = Book::new(file)?; @@ -500,4 +470,21 @@ impl ImageLayer { }), }) } + + fn layer_name(&self) -> ImageFileName { + ImageFileName { + seg: self.seg, + lsn: self.lsn, + } + } + + /// Path to the layer file in pageserver workdir. + pub fn path(&self) -> PathBuf { + Self::path_for( + &self.path_or_conf, + self.timelineid, + self.tenantid, + &self.layer_name(), + ) + } } diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index f96b5e71d1..474eef09c4 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -15,15 +15,14 @@ use crate::{ZTenantId, ZTimelineId}; use anyhow::{bail, ensure, Result}; use bytes::Bytes; use log::*; -use std::cmp::min; -use std::collections::BTreeMap; -use std::ops::Bound::Included; use std::path::PathBuf; use std::sync::{Arc, RwLock}; +use zenith_utils::vec_map::VecMap; -use zenith_utils::accum::Accum; use zenith_utils::lsn::Lsn; +use super::page_versions::PageVersions; + pub struct InMemoryLayer { conf: &'static PageServerConf, tenantid: ZTenantId, @@ -36,9 +35,6 @@ pub struct InMemoryLayer { /// start_lsn: Lsn, - /// Frozen in-memory layers have an inclusive end LSN. 
- end_lsn: Option, - /// LSN of the oldest page version stored in this layer oldest_pending_lsn: Lsn, @@ -51,14 +47,19 @@ pub struct InMemoryLayer { } pub struct InMemoryLayerInner { + /// Frozen in-memory layers have an exclusive end LSN. + /// Writes are only allowed when this is None + end_lsn: Option, + /// If this relation was dropped, remember when that happened. - drop_lsn: Option, + /// The drop LSN is recorded in [`end_lsn`]. + dropped: bool, /// /// All versions of all pages in the layer are are kept here. /// Indexed by block number and LSN. /// - page_versions: BTreeMap<(u32, Lsn), PageVersion>, + page_versions: PageVersions, /// /// `segsizes` tracks the size of the segment at different points in time. @@ -67,28 +68,20 @@ pub struct InMemoryLayerInner { /// so that determining the size never depends on the predecessor layer. For /// a non-blocky rel, 'segsizes' is not used and is always empty. /// - segsizes: BTreeMap, - - /// Writes are only allowed when true. - /// Set to false when this layer is in the process of being replaced. - writeable: bool, + segsizes: VecMap, } impl InMemoryLayerInner { - fn check_writeable(&self) -> WriteResult<()> { - if self.writeable { - Ok(()) - } else { - Err(NonWriteableError) - } + fn assert_writeable(&self) { + assert!(self.end_lsn.is_none()); } fn get_seg_size(&self, lsn: Lsn) -> u32 { // Scan the BTreeMap backwards, starting from the given entry. 
- let mut iter = self.segsizes.range((Included(&Lsn(0)), Included(&lsn))); + let slice = self.segsizes.slice_range(..=lsn); // We make sure there is always at least one entry - if let Some((_entry_lsn, entry)) = iter.next_back() { + if let Some((_entry_lsn, entry)) = slice.last() { *entry } else { panic!("could not find seg size in in-memory layer"); @@ -103,30 +96,23 @@ impl Layer for InMemoryLayer { let inner = self.inner.read().unwrap(); let end_lsn; - let dropped; - if let Some(drop_lsn) = inner.drop_lsn { + if let Some(drop_lsn) = inner.end_lsn { end_lsn = drop_lsn; - dropped = true; } else { end_lsn = Lsn(u64::MAX); - dropped = false; } let delta_filename = DeltaFileName { seg: self.seg, start_lsn: self.start_lsn, end_lsn, - dropped, + dropped: inner.dropped, } .to_string(); PathBuf::from(format!("inmem-{}", delta_filename)) } - fn path(&self) -> Option { - None - } - fn get_timeline_id(&self) -> ZTimelineId { self.timelineid } @@ -140,14 +126,10 @@ impl Layer for InMemoryLayer { } fn get_end_lsn(&self) -> Lsn { - if let Some(end_lsn) = self.end_lsn { - return Lsn(end_lsn.0 + 1); - } - let inner = self.inner.read().unwrap(); - if let Some(drop_lsn) = inner.drop_lsn { - drop_lsn + if let Some(end_lsn) = inner.end_lsn { + end_lsn } else { Lsn(u64::MAX) } @@ -155,7 +137,7 @@ impl Layer for InMemoryLayer { fn is_dropped(&self) -> bool { let inner = self.inner.read().unwrap(); - inner.drop_lsn.is_some() + inner.dropped } /// Look up given page in the cache. @@ -172,19 +154,19 @@ impl Layer for InMemoryLayer { { let inner = self.inner.read().unwrap(); - // Scan the BTreeMap backwards, starting from reconstruct_data.lsn. - let minkey = (blknum, Lsn(0)); - let maxkey = (blknum, lsn); - let mut iter = inner + // Scan the page versions backwards, starting from `lsn`. 
+ let iter = inner .page_versions - .range((Included(&minkey), Included(&maxkey))); - while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() { + .get_block_lsn_range(blknum, ..=lsn) + .iter() + .rev(); + for (entry_lsn, entry) in iter { if let Some(img) = &entry.page_image { reconstruct_data.page_img = Some(img.clone()); need_image = false; break; } else if let Some(rec) = &entry.record { - reconstruct_data.records.push(rec.clone()); + reconstruct_data.records.push((*entry_lsn, rec.clone())); if rec.will_init { // This WAL record initializes the page, so no need to go further back need_image = false; @@ -233,8 +215,8 @@ impl Layer for InMemoryLayer { assert!(lsn >= self.start_lsn); // Is the requested LSN after the segment was dropped? - if let Some(drop_lsn) = inner.drop_lsn { - if lsn >= drop_lsn { + if let Some(end_lsn) = inner.end_lsn { + if lsn >= end_lsn { return Ok(false); } } @@ -265,27 +247,27 @@ impl Layer for InMemoryLayer { let inner = self.inner.read().unwrap(); let end_str = inner - .drop_lsn + .end_lsn .as_ref() - .map(|drop_lsn| drop_lsn.to_string()) + .map(Lsn::to_string) .unwrap_or_default(); println!( - "----- in-memory layer for tli {} seg {} {}-{} ----", - self.timelineid, self.seg, self.start_lsn, end_str + "----- in-memory layer for tli {} seg {} {}-{} {} ----", + self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped, ); - for (k, v) in inner.segsizes.iter() { + for (k, v) in inner.segsizes.as_slice() { println!("segsizes {}: {}", k, v); } - for (k, v) in inner.page_versions.iter() { + for (blknum, lsn, pv) in inner.page_versions.ordered_page_version_iter(None) { println!( "blk {} at {}: {}/{}\n", - k.0, - k.1, - v.page_image.is_some(), - v.record.is_some() + blknum, + lsn, + pv.page_image.is_some(), + pv.record.is_some() ); } @@ -293,26 +275,19 @@ impl Layer for InMemoryLayer { } } -/// Write failed because the layer is in process of being replaced. 
-/// See [`LayeredTimeline::perform_write_op`] for how to handle this error. -#[derive(Debug)] -pub struct NonWriteableError; +/// A result of an inmemory layer data being written to disk. +pub struct LayersOnDisk { + pub delta_layers: Vec, + pub image_layers: Vec, +} -pub type WriteResult = std::result::Result; - -/// Helper struct to cleanup `InMemoryLayer::freeze` return signature. -pub struct FreezeLayers { - /// Replacement layer for the layer which freeze was called on. - pub frozen: Arc, - /// New open layer containing leftover data. - pub open: Option>, +impl LayersOnDisk { + pub fn is_empty(&self) -> bool { + self.delta_layers.is_empty() && self.image_layers.is_empty() + } } impl InMemoryLayer { - fn assert_not_frozen(&self) { - assert!(self.end_lsn.is_none()); - } - /// Return the oldest page version that's stored in this layer pub fn get_oldest_pending_lsn(&self) -> Lsn { self.oldest_pending_lsn @@ -337,9 +312,9 @@ impl InMemoryLayer { ); // The segment is initially empty, so initialize 'segsizes' with 0. 
- let mut segsizes = BTreeMap::new(); + let mut segsizes = VecMap::default(); if seg.rel.is_blocky() { - segsizes.insert(start_lsn, 0); + segsizes.append(start_lsn, 0).unwrap(); } Ok(InMemoryLayer { @@ -348,14 +323,13 @@ impl InMemoryLayer { tenantid, seg, start_lsn, - end_lsn: None, oldest_pending_lsn, incremental: false, inner: RwLock::new(InMemoryLayerInner { - drop_lsn: None, - page_versions: BTreeMap::new(), + end_lsn: None, + dropped: false, + page_versions: PageVersions::default(), segsizes, - writeable: true, }), }) } @@ -363,10 +337,10 @@ impl InMemoryLayer { // Write operations /// Remember new page version, as a WAL record over previous version - pub fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> WriteResult { + pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> u32 { self.put_page_version( blknum, - rec.lsn, + lsn, PageVersion { page_image: None, record: Some(rec), @@ -375,7 +349,7 @@ impl InMemoryLayer { } /// Remember new page version, as a full page image - pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> WriteResult { + pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> u32 { self.put_page_version( blknum, lsn, @@ -388,8 +362,7 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> WriteResult { - self.assert_not_frozen(); + pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> u32 { assert!(self.seg.blknum_in_seg(blknum)); trace!( @@ -401,9 +374,9 @@ impl InMemoryLayer { ); let mut inner = self.inner.write().unwrap(); - inner.check_writeable()?; + inner.assert_writeable(); - let old = inner.page_versions.insert((blknum, lsn), pv); + let old = inner.page_versions.append_or_update_last(blknum, lsn, pv); if old.is_some() { // We already had an entry for this LSN. That's odd.. 
@@ -448,7 +421,9 @@ impl InMemoryLayer { gapblknum, blknum ); - let old = inner.page_versions.insert((gapblknum, lsn), zeropv); + let old = inner + .page_versions + .append_or_update_last(gapblknum, lsn, zeropv); // We already had an entry for this LSN. That's odd.. if old.is_some() { @@ -459,53 +434,47 @@ impl InMemoryLayer { } } - inner.segsizes.insert(lsn, newsize); - return Ok(newsize - oldsize); + inner.segsizes.append_or_update_last(lsn, newsize).unwrap(); + return newsize - oldsize; } } - Ok(0) + + 0 } /// Remember that the relation was truncated at given LSN - pub fn put_truncation(&self, lsn: Lsn, segsize: u32) -> WriteResult<()> { + pub fn put_truncation(&self, lsn: Lsn, segsize: u32) { assert!( self.seg.rel.is_blocky(), "put_truncation() called on a non-blocky rel" ); - self.assert_not_frozen(); let mut inner = self.inner.write().unwrap(); - inner.check_writeable()?; + inner.assert_writeable(); // check that this we truncate to a smaller size than segment was before the truncation let oldsize = inner.get_seg_size(lsn); assert!(segsize < oldsize); - let old = inner.segsizes.insert(lsn, segsize); + let old = inner.segsizes.append_or_update_last(lsn, segsize).unwrap(); if old.is_some() { // We already had an entry for this LSN. That's odd.. 
warn!("Inserting truncation, but had an entry for the LSN already"); } - - Ok(()) } /// Remember that the segment was dropped at given LSN - pub fn drop_segment(&self, lsn: Lsn) -> WriteResult<()> { - self.assert_not_frozen(); - + pub fn drop_segment(&self, lsn: Lsn) { let mut inner = self.inner.write().unwrap(); - inner.check_writeable()?; - - assert!(inner.drop_lsn.is_none()); - inner.drop_lsn = Some(lsn); - inner.writeable = false; + assert!(inner.end_lsn.is_none()); + assert!(!inner.dropped); + inner.dropped = true; + assert!(self.start_lsn < lsn); + inner.end_lsn = Some(lsn); trace!("dropped segment {} at {}", self.seg, lsn); - - Ok(()) } /// @@ -533,10 +502,10 @@ impl InMemoryLayer { ); // Copy the segment size at the start LSN from the predecessor layer. - let mut segsizes = BTreeMap::new(); + let mut segsizes = VecMap::default(); if seg.rel.is_blocky() { let size = src.get_seg_size(start_lsn)?; - segsizes.insert(start_lsn, size); + segsizes.append(start_lsn, size).unwrap(); } Ok(InMemoryLayer { @@ -545,124 +514,43 @@ impl InMemoryLayer { tenantid, seg, start_lsn, - end_lsn: None, oldest_pending_lsn, incremental: true, inner: RwLock::new(InMemoryLayerInner { - drop_lsn: None, - page_versions: BTreeMap::new(), + end_lsn: None, + dropped: false, + page_versions: PageVersions::default(), segsizes, - writeable: true, }), }) } pub fn is_writeable(&self) -> bool { let inner = self.inner.read().unwrap(); - inner.writeable + inner.end_lsn.is_none() } - /// Splits `self` into two InMemoryLayers: `frozen` and `open`. - /// All data up to and including `cutoff_lsn` - /// is copied to `frozen`, while the remaining data is copied to `open`. - /// After completion, self is non-writeable, but not frozen. - pub fn freeze(self: Arc, cutoff_lsn: Lsn) -> Result { - info!( - "freezing in memory layer {} on timeline {} at {} (oldest {})", - self.filename().display(), - self.timelineid, - cutoff_lsn, - self.oldest_pending_lsn - ); + /// Make the layer non-writeable. 
Only call once. + /// Records the end_lsn for non-dropped layers. + /// `end_lsn` is inclusive + pub fn freeze(&self, end_lsn: Lsn) { + let mut inner = self.inner.write().unwrap(); - self.assert_not_frozen(); - - let self_ref = self.clone(); - let mut inner = self_ref.inner.write().unwrap(); - // Dropped layers don't need any special freeze actions, - // they are marked as non-writeable at drop and just - // written out to disk by checkpointer. - if inner.drop_lsn.is_some() { - assert!(!inner.writeable); - info!( - "freezing in memory layer for {} on timeline {} is dropped at {}", - self.seg, - self.timelineid, - inner.drop_lsn.unwrap() - ); - - // There should be no newer layer that refers this non-writeable layer, - // because layer that is created after dropped one represents a new rel. - return Ok(FreezeLayers { - frozen: self, - open: None, - }); - } - assert!(inner.writeable); - inner.writeable = false; - - // Divide all the page versions into old and new - // at the 'cutoff_lsn' point. 
- let mut before_segsizes = BTreeMap::new(); - let mut after_segsizes = BTreeMap::new(); - let mut after_oldest_lsn: Accum = Accum(None); - for (lsn, size) in inner.segsizes.iter() { - if *lsn > cutoff_lsn { - after_segsizes.insert(*lsn, *size); - after_oldest_lsn.accum(min, *lsn); - } else { - before_segsizes.insert(*lsn, *size); - } - } - - let mut before_page_versions = BTreeMap::new(); - let mut after_page_versions = BTreeMap::new(); - for ((blknum, lsn), pv) in inner.page_versions.iter() { - if *lsn > cutoff_lsn { - after_page_versions.insert((*blknum, *lsn), pv.clone()); - after_oldest_lsn.accum(min, *lsn); - } else { - before_page_versions.insert((*blknum, *lsn), pv.clone()); - } - } - - let frozen = Arc::new(InMemoryLayer { - conf: self.conf, - tenantid: self.tenantid, - timelineid: self.timelineid, - seg: self.seg, - start_lsn: self.start_lsn, - end_lsn: Some(cutoff_lsn), - oldest_pending_lsn: self.start_lsn, - incremental: self.incremental, - inner: RwLock::new(InMemoryLayerInner { - drop_lsn: inner.drop_lsn, - page_versions: before_page_versions, - segsizes: before_segsizes, - writeable: false, - }), - }); - - let open = if !after_segsizes.is_empty() || !after_page_versions.is_empty() { - let mut new_open = Self::create_successor_layer( - self.conf, - frozen.clone(), - self.timelineid, - self.tenantid, - cutoff_lsn + 1, - after_oldest_lsn.0.unwrap(), - )?; - - let new_inner = new_open.inner.get_mut().unwrap(); - new_inner.page_versions.append(&mut after_page_versions); - new_inner.segsizes.append(&mut after_segsizes); - - Some(Arc::new(new_open)) + if inner.end_lsn.is_some() { + assert!(inner.dropped); } else { - None - }; + assert!(!inner.dropped); + assert!(self.start_lsn < end_lsn + 1); + inner.end_lsn = Some(Lsn(end_lsn.0 + 1)); - Ok(FreezeLayers { frozen, open }) + if let Some((lsn, _)) = inner.segsizes.as_slice().last() { + assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn); + } + + for (_blk, lsn, _pv) in 
inner.page_versions.ordered_page_version_iter(None) { + assert!(lsn <= end_lsn); + } + } } /// Write the this frozen in-memory layer to disk. @@ -673,16 +561,15 @@ impl InMemoryLayer { /// WAL records between start and end LSN. (The delta layer is not needed /// when a new relish is created with a single LSN, so that the start and /// end LSN are the same.) - pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result>> { + pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result { trace!( - "write_to_disk {} end_lsn is {} get_end_lsn is {}", + "write_to_disk {} get_end_lsn is {}", self.filename().display(), - self.end_lsn.unwrap_or(Lsn(0)), self.get_end_lsn() ); // Grab the lock in read-mode. We hold it over the I/O, but because this - // layer is not writeable anymore, no one should be trying to aquire the + // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception // though: another thread might have grabbed a reference to this layer // in `get_layer_for_write' just before the checkpointer called @@ -691,46 +578,45 @@ impl InMemoryLayer { // would have to wait until we release it. That race condition is very // rare though, so we just accept the potential latency hit for now. 
let inner = self.inner.read().unwrap(); - assert!(!inner.writeable); + let end_lsn_exclusive = inner.end_lsn.unwrap(); - if let Some(drop_lsn) = inner.drop_lsn { + if inner.dropped { let delta_layer = DeltaLayer::create( self.conf, self.timelineid, self.tenantid, self.seg, self.start_lsn, - drop_lsn, + end_lsn_exclusive, true, - inner.page_versions.iter(), + inner.page_versions.ordered_page_version_iter(None), inner.segsizes.clone(), )?; trace!( "freeze: created delta layer for dropped segment {} {}-{}", self.seg, self.start_lsn, - drop_lsn + end_lsn_exclusive ); - return Ok(vec![Arc::new(delta_layer)]); + return Ok(LayersOnDisk { + delta_layers: vec![delta_layer], + image_layers: Vec::new(), + }); } - let end_lsn = self.end_lsn.unwrap(); + // Since `end_lsn` is inclusive, subtract 1. + // We want to make an ImageLayer for the last included LSN, + // so the DeltaLayer should exlcude that LSN. + let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1); - let mut before_segsizes = BTreeMap::new(); - for (lsn, size) in inner.segsizes.iter() { - if *lsn <= end_lsn { - before_segsizes.insert(*lsn, *size); - } - } - let mut before_page_versions = inner.page_versions.iter().filter(|tup| { - let ((_blknum, lsn), _pv) = tup; + let mut page_versions = inner + .page_versions + .ordered_page_version_iter(Some(end_lsn_inclusive)); - *lsn < end_lsn - }); + let mut delta_layers = Vec::new(); - let mut frozen_layers: Vec> = Vec::new(); - - if self.start_lsn != end_lsn { + if self.start_lsn != end_lsn_inclusive { + let (segsizes, _) = inner.segsizes.split_at(&end_lsn_exclusive); // Write the page versions before the cutoff to disk. 
let delta_layer = DeltaLayer::create( self.conf, @@ -738,29 +624,36 @@ impl InMemoryLayer { self.tenantid, self.seg, self.start_lsn, - end_lsn, + end_lsn_inclusive, false, - before_page_versions, - before_segsizes, + page_versions, + segsizes, )?; - frozen_layers.push(Arc::new(delta_layer)); + delta_layers.push(delta_layer); trace!( "freeze: created delta layer {} {}-{}", self.seg, self.start_lsn, - end_lsn + end_lsn_inclusive ); } else { - assert!(before_page_versions.next().is_none()); + assert!(page_versions.next().is_none()); } drop(inner); // Write a new base image layer at the cutoff point - let image_layer = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?; - frozen_layers.push(Arc::new(image_layer)); - trace!("freeze: created image layer {} at {}", self.seg, end_lsn); + let image_layer = + ImageLayer::create_from_src(self.conf, timeline, self, end_lsn_inclusive)?; + trace!( + "freeze: created image layer {} at {}", + self.seg, + end_lsn_inclusive + ); - Ok(frozen_layers) + Ok(LayersOnDisk { + delta_layers, + image_layers: vec![image_layer], + }) } } diff --git a/pageserver/src/layered_repository/page_versions.rs b/pageserver/src/layered_repository/page_versions.rs new file mode 100644 index 0000000000..90321f96cd --- /dev/null +++ b/pageserver/src/layered_repository/page_versions.rs @@ -0,0 +1,150 @@ +use std::{collections::HashMap, ops::RangeBounds, slice}; + +use zenith_utils::{lsn::Lsn, vec_map::VecMap}; + +use super::storage_layer::PageVersion; + +const EMPTY_SLICE: &[(Lsn, PageVersion)] = &[]; + +#[derive(Debug, Default)] +pub struct PageVersions(HashMap>); + +impl PageVersions { + pub fn append_or_update_last( + &mut self, + blknum: u32, + lsn: Lsn, + page_version: PageVersion, + ) -> Option { + let map = self.0.entry(blknum).or_insert_with(VecMap::default); + map.append_or_update_last(lsn, page_version).unwrap() + } + + /// Get all [`PageVersion`]s in a block + pub fn get_block_slice(&self, blknum: u32) -> &[(Lsn, PageVersion)] { + 
self.0 + .get(&blknum) + .map(VecMap::as_slice) + .unwrap_or(EMPTY_SLICE) + } + + /// Get a range of [`PageVersions`] in a block + pub fn get_block_lsn_range>( + &self, + blknum: u32, + range: R, + ) -> &[(Lsn, PageVersion)] { + self.0 + .get(&blknum) + .map(|vec_map| vec_map.slice_range(range)) + .unwrap_or(EMPTY_SLICE) + } + + /// Iterate through [`PageVersion`]s in (block, lsn) order. + /// If a [`cutoff_lsn`] is set, only show versions with `lsn < cutoff_lsn` + pub fn ordered_page_version_iter(&self, cutoff_lsn: Option) -> OrderedPageVersionIter<'_> { + let mut ordered_blocks: Vec = self.0.keys().cloned().collect(); + ordered_blocks.sort_unstable(); + + let slice = ordered_blocks + .first() + .map(|&blknum| self.get_block_slice(blknum)) + .unwrap_or(EMPTY_SLICE); + + OrderedPageVersionIter { + page_versions: self, + ordered_blocks, + cur_block_idx: 0, + cutoff_lsn, + cur_slice_iter: slice.iter(), + } + } +} + +pub struct OrderedPageVersionIter<'a> { + page_versions: &'a PageVersions, + + ordered_blocks: Vec, + cur_block_idx: usize, + + cutoff_lsn: Option, + + cur_slice_iter: slice::Iter<'a, (Lsn, PageVersion)>, +} + +impl OrderedPageVersionIter<'_> { + fn is_lsn_before_cutoff(&self, lsn: &Lsn) -> bool { + if let Some(cutoff_lsn) = self.cutoff_lsn.as_ref() { + lsn < cutoff_lsn + } else { + true + } + } +} + +impl<'a> Iterator for OrderedPageVersionIter<'a> { + type Item = (u32, Lsn, &'a PageVersion); + + fn next(&mut self) -> Option { + loop { + if let Some((lsn, page_version)) = self.cur_slice_iter.next() { + if self.is_lsn_before_cutoff(lsn) { + let blknum = self.ordered_blocks[self.cur_block_idx]; + return Some((blknum, *lsn, page_version)); + } + } + + let next_block_idx = self.cur_block_idx + 1; + let blknum: u32 = *self.ordered_blocks.get(next_block_idx)?; + self.cur_block_idx = next_block_idx; + self.cur_slice_iter = self.page_versions.get_block_slice(blknum).iter(); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + const EMPTY_PAGE_VERSION: 
PageVersion = PageVersion { + page_image: None, + record: None, + }; + + #[test] + fn test_ordered_iter() { + let mut page_versions = PageVersions::default(); + const BLOCKS: u32 = 1000; + const LSNS: u64 = 50; + + for blknum in 0..BLOCKS { + for lsn in 0..LSNS { + let old = page_versions.append_or_update_last(blknum, Lsn(lsn), EMPTY_PAGE_VERSION); + assert!(old.is_none()); + } + } + + let mut iter = page_versions.ordered_page_version_iter(None); + for blknum in 0..BLOCKS { + for lsn in 0..LSNS { + let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap(); + assert_eq!(actual_blknum, blknum); + assert_eq!(Lsn(lsn), actual_lsn); + } + } + assert!(iter.next().is_none()); + assert!(iter.next().is_none()); // should be robust against excessive next() calls + + const CUTOFF_LSN: Lsn = Lsn(30); + let mut iter = page_versions.ordered_page_version_iter(Some(CUTOFF_LSN)); + for blknum in 0..BLOCKS { + for lsn in 0..CUTOFF_LSN.0 { + let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap(); + assert_eq!(actual_blknum, blknum); + assert_eq!(Lsn(lsn), actual_lsn); + } + } + assert!(iter.next().is_none()); + assert!(iter.next().is_none()); // should be robust against excessive next() calls + } +} diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index a107d63b40..0a86fe407d 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -78,7 +78,7 @@ pub struct PageVersion { /// 'records' contains the records to apply over the base image. /// pub struct PageReconstructData { - pub records: Vec, + pub records: Vec<(Lsn, WALRecord)>, pub page_img: Option, } @@ -123,10 +123,6 @@ pub trait Layer: Send + Sync { /// Is the segment represented by this layer dropped by PostgreSQL? fn is_dropped(&self) -> bool; - /// Gets the physical location of the layer on disk. - /// Some layers, such as in-memory, might not have the location. 
- fn path(&self) -> Option; - /// Filename used to store this layer on disk. (Even in-memory layers /// implement this, to print a handy unique identifier for the layer for /// log messages, even though they're never not on disk.) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d592a83993..be849ce35f 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -13,7 +13,6 @@ use anyhow::{anyhow, bail, ensure, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use lazy_static::lazy_static; -use log::*; use regex::Regex; use std::net::TcpListener; use std::str; @@ -21,10 +20,12 @@ use std::str::FromStr; use std::sync::Arc; use std::thread; use std::{io, net::TcpStream}; +use tracing::*; use zenith_metrics::{register_histogram_vec, HistogramVec}; use zenith_utils::auth::{self, JwtAuth}; use zenith_utils::auth::{Claims, Scope}; use zenith_utils::lsn::Lsn; +use zenith_utils::postgres_backend::is_socket_read_timed_out; use zenith_utils::postgres_backend::PostgresBackend; use zenith_utils::postgres_backend::{self, AuthType}; use zenith_utils::pq_proto::{ @@ -187,17 +188,32 @@ pub fn thread_main( listener: TcpListener, auth_type: AuthType, ) -> anyhow::Result<()> { - loop { + let mut join_handles = Vec::new(); + + while !tenant_mgr::shutdown_requested() { let (socket, peer_addr) = listener.accept()?; debug!("accepted connection from {}", peer_addr); socket.set_nodelay(true).unwrap(); let local_auth = auth.clone(); - thread::spawn(move || { - if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) { - error!("page server thread exiting with error: {:#}", err); - } - }); + + let handle = thread::Builder::new() + .name("serving Page Service thread".into()) + .spawn(move || { + if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) { + error!(%err, "page server thread exited with error"); + } + }) + .unwrap(); + + join_handles.push(handle); } + + debug!("page_service loop 
terminated. wait for connections to cancel"); + for handle in join_handles.into_iter() { + handle.join().unwrap(); + } + + Ok(()) } fn page_service_conn_main( @@ -216,7 +232,7 @@ fn page_service_conn_main( } let mut conn_handler = PageServerHandler::new(conf, auth); - let pgbackend = PostgresBackend::new(socket, auth_type, None)?; + let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?; pgbackend.run(&mut conn_handler) } @@ -260,50 +276,66 @@ impl PageServerHandler { timelineid: ZTimelineId, tenantid: ZTenantId, ) -> anyhow::Result<()> { + let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered(); + // Check that the timeline exists let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?; /* switch client to COPYBOTH */ pgb.write_message(&BeMessage::CopyBothResponse)?; - while let Some(message) = pgb.read_message()? { - trace!("query({:?}): {:?}", timelineid, message); + while !tenant_mgr::shutdown_requested() { + match pgb.read_message() { + Ok(message) => { + if let Some(message) = message { + trace!("query: {:?}", message); - let copy_data_bytes = match message { - FeMessage::CopyData(bytes) => bytes, - _ => continue, - }; + let copy_data_bytes = match message { + FeMessage::CopyData(bytes) => bytes, + _ => continue, + }; - let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; + let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - let response = match zenith_fe_msg { - PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_exists"]) - .observe_closure_duration(|| { - self.handle_get_rel_exists_request(&*timeline, &req) - }), - PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_size"]) - .observe_closure_duration(|| self.handle_get_nblocks_request(&*timeline, &req)), - PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME - .with_label_values(&["get_page_at_lsn"]) - .observe_closure_duration(|| { - 
self.handle_get_page_at_lsn_request(&*timeline, &req) - }), - }; + let response = match zenith_fe_msg { + PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME + .with_label_values(&["get_rel_exists"]) + .observe_closure_duration(|| { + self.handle_get_rel_exists_request(&*timeline, &req) + }), + PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME + .with_label_values(&["get_rel_size"]) + .observe_closure_duration(|| { + self.handle_get_nblocks_request(&*timeline, &req) + }), + PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME + .with_label_values(&["get_page_at_lsn"]) + .observe_closure_duration(|| { + self.handle_get_page_at_lsn_request(&*timeline, &req) + }), + }; - let response = response.unwrap_or_else(|e| { - // print the all details to the log with {:#}, but for the client the - // error message is enough - error!("error reading relation or page version: {:#}", e); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) - }); + let response = response.unwrap_or_else(|e| { + // print the all details to the log with {:#}, but for the client the + // error message is enough + error!("error reading relation or page version: {:#}", e); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + }); - pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; + pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; + } else { + break; + } + } + Err(e) => { + if !is_socket_read_timed_out(&e) { + return Err(e); + } + } + } } - Ok(()) } @@ -363,6 +395,8 @@ impl PageServerHandler { timeline: &dyn Timeline, req: &PagestreamExistsRequest, ) -> Result { + let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); + let tag = RelishTag::Relation(req.rel); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?; @@ -378,6 +412,7 @@ impl PageServerHandler { timeline: &dyn Timeline, req: &PagestreamNblocksRequest, ) -> Result { + let _enter = 
info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered(); let tag = RelishTag::Relation(req.rel); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?; @@ -397,6 +432,8 @@ impl PageServerHandler { timeline: &dyn Timeline, req: &PagestreamGetPageRequest, ) -> Result { + let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) + .entered(); let tag = RelishTag::Relation(req.rel); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?; @@ -414,17 +451,20 @@ impl PageServerHandler { lsn: Option, tenantid: ZTenantId, ) -> anyhow::Result<()> { + let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty); + let _enter = span.enter(); + // check that the timeline exists let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?; - /* switch client to COPYOUT */ + // switch client to COPYOUT pgb.write_message(&BeMessage::CopyOutResponse)?; - info!("sent CopyOut"); /* Send a tarball of the latest layer on the timeline */ { let mut writer = CopyDataSink { pgb }; let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?; + span.record("lsn", &basebackup.lsn.to_string().as_str()); basebackup.send_tarball()?; } pgb.write_message(&BeMessage::CopyDone)?; @@ -529,11 +569,6 @@ impl postgres_backend::Handler for PageServerHandler { None }; - info!( - "got basebackup command. 
tenantid=\"{}\" timelineid=\"{}\" lsn=\"{:#?}\"", - tenantid, timelineid, lsn - ); - // Check that the timeline exists self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; @@ -551,6 +586,9 @@ impl postgres_backend::Handler for PageServerHandler { self.check_permission(Some(tenantid))?; + let _enter = + info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered(); + // Check that the timeline exists tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?; @@ -573,6 +611,9 @@ impl postgres_backend::Handler for PageServerHandler { self.check_permission(Some(tenantid))?; + let _enter = + info_span!("branch_create", name = %branchname, tenant = %tenantid).entered(); + let branch = branches::create_branch(self.conf, &branchname, &startpoint_str, &tenantid)?; let branch = serde_json::to_vec(&branch)?; diff --git a/pageserver/src/relish_storage.rs b/pageserver/src/relish_storage.rs index a687abe489..885ca9581f 100644 --- a/pageserver/src/relish_storage.rs +++ b/pageserver/src/relish_storage.rs @@ -12,14 +12,12 @@ mod rust_s3; /// local page server layer files with external storage. 
mod synced_storage; -use std::path::Path; -use std::thread; +use std::{path::Path, thread}; use anyhow::Context; -use self::local_fs::LocalFs; pub use self::synced_storage::schedule_timeline_upload; -use crate::relish_storage::rust_s3::RustS3; +use self::{local_fs::LocalFs, rust_s3::RustS3}; use crate::{PageServerConf, RelishStorageKind}; pub fn run_storage_sync_thread( @@ -57,15 +55,21 @@ pub trait RelishStorage: Send + Sync { async fn list_relishes(&self) -> anyhow::Result>; - async fn download_relish( + async fn download_relish( &self, from: &Self::RelishStoragePath, - to: &Path, - ) -> anyhow::Result<()>; + // rust_s3 `get_object_stream` method requires `std::io::BufWriter` for some reason, not the async counterpart + // that forces us to consume and return the writer to satisfy the blocking operation async wrapper requirements + to: std::io::BufWriter, + ) -> anyhow::Result>; async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()>; - async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()>; + async fn upload_relish( + &self, + from: &mut tokio::io::BufReader, + to: &Self::RelishStoragePath, + ) -> anyhow::Result<()>; } fn strip_workspace_prefix<'a>( diff --git a/pageserver/src/relish_storage/local_fs.rs b/pageserver/src/relish_storage/local_fs.rs index 78ee858a5b..49d656d5a6 100644 --- a/pageserver/src/relish_storage/local_fs.rs +++ b/pageserver/src/relish_storage/local_fs.rs @@ -9,11 +9,13 @@ use std::{ future::Future, + io::Write, path::{Path, PathBuf}, pin::Pin, }; use anyhow::{bail, Context}; +use tokio::{fs, io}; use super::{strip_workspace_prefix, RelishStorage}; @@ -64,16 +66,33 @@ impl RelishStorage for LocalFs { Ok(get_all_files(&self.root).await?.into_iter().collect()) } - async fn download_relish( + async fn download_relish( &self, from: &Self::RelishStoragePath, - to: &Path, - ) -> anyhow::Result<()> { + mut to: std::io::BufWriter, + ) -> anyhow::Result> { let file_path = 
self.resolve_in_storage(from)?; if file_path.exists() && file_path.is_file() { - create_target_directory(to).await?; - tokio::fs::copy(file_path, to).await?; - Ok(()) + let updated_buffer = tokio::task::spawn_blocking(move || { + let mut source = std::io::BufReader::new( + std::fs::OpenOptions::new() + .read(true) + .open(&file_path) + .with_context(|| { + format!( + "Failed to open source file '{}' to use in the download", + file_path.display() + ) + })?, + ); + std::io::copy(&mut source, &mut to) + .context("Failed to download the relish file")?; + to.flush().context("Failed to flush the download buffer")?; + Ok::<_, anyhow::Error>(to) + }) + .await + .context("Failed to spawn a blocking task")??; + Ok(updated_buffer) } else { bail!( "File '{}' either does not exist or is not a file", @@ -94,18 +113,30 @@ impl RelishStorage for LocalFs { } } - async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()> { + async fn upload_relish( + &self, + from: &mut io::BufReader, + to: &Self::RelishStoragePath, + ) -> anyhow::Result<()> { let target_file_path = self.resolve_in_storage(to)?; create_target_directory(&target_file_path).await?; + let mut destination = io::BufWriter::new( + fs::OpenOptions::new() + .write(true) + .create(true) + .open(&target_file_path) + .await + .with_context(|| { + format!( + "Failed to open target fs destination at '{}'", + target_file_path.display() + ) + })?, + ); - tokio::fs::copy(&from, &target_file_path) + io::copy_buf(from, &mut destination) .await - .with_context(|| { - format!( - "Failed to upload relish '{}' to local storage", - from.display(), - ) - })?; + .context("Failed to upload relish to local storage")?; Ok(()) } } diff --git a/pageserver/src/relish_storage/rust_s3.rs b/pageserver/src/relish_storage/rust_s3.rs index e98bf8949f..5dddaa36ca 100644 --- a/pageserver/src/relish_storage/rust_s3.rs +++ b/pageserver/src/relish_storage/rust_s3.rs @@ -1,13 +1,15 @@ //! 
A wrapper around AWS S3 client library `rust_s3` to be used a relish storage. +use std::io::Write; use std::path::Path; use anyhow::Context; use s3::{bucket::Bucket, creds::Credentials, region::Region}; -use crate::{relish_storage::strip_workspace_prefix, S3Config}; - -use super::RelishStorage; +use crate::{ + relish_storage::{strip_workspace_prefix, RelishStorage}, + S3Config, +}; const S3_FILE_SEPARATOR: char = '/'; @@ -82,18 +84,14 @@ impl RelishStorage for RustS3 { .collect()) } - async fn download_relish( + async fn download_relish( &self, from: &Self::RelishStoragePath, - to: &Path, - ) -> anyhow::Result<()> { - let mut target_file = std::fs::OpenOptions::new() - .write(true) - .open(to) - .with_context(|| format!("Failed to open target s3 destination at {}", to.display()))?; + mut to: std::io::BufWriter, + ) -> anyhow::Result> { let code = self .bucket - .get_object_stream(from.key(), &mut target_file) + .get_object_stream(from.key(), &mut to) .await .with_context(|| format!("Failed to download s3 object with key {}", from.key()))?; if code != 200 { @@ -102,7 +100,12 @@ impl RelishStorage for RustS3 { code )) } else { - Ok(()) + tokio::task::spawn_blocking(move || { + to.flush().context("Failed to fluch the downoad buffer")?; + Ok::<_, anyhow::Error>(to) + }) + .await + .context("Failed to joim the download buffer flush task")? 
} } @@ -112,9 +115,9 @@ impl RelishStorage for RustS3 { .delete_object(path.key()) .await .with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?; - if code != 200 { + if code != 204 { Err(anyhow::format_err!( - "Received non-200 exit code during deleting object with key '{}', code: {}", + "Received non-204 exit code during deleting object with key '{}', code: {}", path.key(), code )) @@ -123,12 +126,14 @@ impl RelishStorage for RustS3 { } } - async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()> { - let mut local_file = tokio::fs::OpenOptions::new().read(true).open(from).await?; - + async fn upload_relish( + &self, + from: &mut tokio::io::BufReader, + to: &Self::RelishStoragePath, + ) -> anyhow::Result<()> { let code = self .bucket - .put_object_stream(&mut local_file, to.key()) + .put_object_stream(from, to.key()) .await .with_context(|| format!("Failed to create s3 object with key {}", to.key()))?; if code != 200 { diff --git a/pageserver/src/relish_storage/synced_storage.rs b/pageserver/src/relish_storage/synced_storage.rs index f51e976a83..e9ac20ff8c 100644 --- a/pageserver/src/relish_storage/synced_storage.rs +++ b/pageserver/src/relish_storage/synced_storage.rs @@ -1,6 +1,7 @@ use std::time::Duration; use std::{collections::BinaryHeap, sync::Mutex, thread}; +use crate::tenant_mgr; use crate::{relish_storage::RelishStorage, PageServerConf}; lazy_static::lazy_static! 
{ @@ -31,22 +32,26 @@ pub fn run_storage_sync_thread< let handle = thread::Builder::new() .name("Queue based relish storage sync".to_string()) - .spawn(move || loop { - let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap(); - log::debug!("Upload queue length: {}", queue_accessor.len()); - let next_task = queue_accessor.pop(); - drop(queue_accessor); - match next_task { - Some(task) => runtime.block_on(async { - // suppress warnings - let _ = (config, task, &relish_storage, max_concurrent_sync); - todo!("omitted for brevity") - }), - None => { - thread::sleep(Duration::from_secs(1)); - continue; + .spawn(move || { + while !tenant_mgr::shutdown_requested() { + let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap(); + log::debug!("Upload queue length: {}", queue_accessor.len()); + let next_task = queue_accessor.pop(); + drop(queue_accessor); + match next_task { + Some(task) => runtime.block_on(async { + // suppress warnings + let _ = (config, task, &relish_storage, max_concurrent_sync); + todo!("omitted for brevity") + }), + None => { + thread::sleep(Duration::from_secs(1)); + continue; + } } } + log::debug!("Queue based relish storage sync thread shut down"); + Ok(()) })?; Ok(Some(handle)) } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index fa6a3e83e0..73c6f370d6 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -3,7 +3,7 @@ use anyhow::Result; use bytes::{Buf, BufMut, Bytes, BytesMut}; use serde::{Deserialize, Serialize}; use std::collections::HashSet; -use std::ops::AddAssign; +use std::ops::{AddAssign, Deref}; use std::sync::Arc; use std::time::Duration; use zenith_utils::lsn::{Lsn, RecordLsn}; @@ -13,6 +13,8 @@ use zenith_utils::zid::ZTimelineId; /// A repository corresponds to one .zenith directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. 
pub trait Repository: Send + Sync { + fn shutdown(&self) -> Result<()>; + /// Get Timeline handle for given zenith timeline ID. fn get_timeline(&self, timelineid: ZTimelineId) -> Result>; @@ -117,32 +119,15 @@ pub trait Timeline: Send + Sync { /// Get a list of all existing non-relational objects fn list_nonrels(&self, lsn: Lsn) -> Result>; + /// Get the LSN where this branch was created + fn get_ancestor_lsn(&self) -> Lsn; + //------------------------------------------------------------------------------ // Public PUT functions, to update the repository with new page versions. // // These are called by the WAL receiver to digest WAL records. //------------------------------------------------------------------------------ - /// Put a new page version that can be constructed from a WAL record - /// - /// This will implicitly extend the relation, if the page is beyond the - /// current end-of-file. - fn put_wal_record(&self, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>; - - /// Like put_wal_record, but with ready-made image of the page. - fn put_page_image(&self, tag: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()>; - - /// Truncate relation - fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>; - - /// This method is used for marking dropped relations and truncated SLRU files and aborted two phase records - fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>; - - /// Track end of the latest digested WAL record. - /// - /// Advance requires aligned LSN as an argument and would wake wait_lsn() callers. - /// Previous last record LSN is stored alongside the latest and can be read. - fn advance_last_record_lsn(&self, lsn: Lsn); /// Atomically get both last and prev. fn get_last_record_rlsn(&self) -> RecordLsn; /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. 
@@ -151,6 +136,9 @@ pub trait Timeline: Send + Sync { fn get_start_lsn(&self) -> Lsn; fn get_disk_consistent_lsn(&self) -> Lsn; + /// Mutate the timeline with a [`TimelineWriter`]. + fn writer<'a>(&'a self) -> Box; + /// /// Flush to disk all data that was written with the put_* functions /// @@ -169,9 +157,35 @@ pub trait Timeline: Send + Sync { fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result; } +/// Various functions to mutate the timeline. +// TODO Currently, Deref is used to allow easy access to read methods from this trait. +// This is probably considered a bad practice in Rust and should be fixed eventually, +// but will cause large code changes. +pub trait TimelineWriter: Deref { + /// Put a new page version that can be constructed from a WAL record + /// + /// This will implicitly extend the relation, if the page is beyond the + /// current end-of-file. + fn put_wal_record(&self, lsn: Lsn, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>; + + /// Like put_wal_record, but with ready-made image of the page. + fn put_page_image(&self, tag: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()>; + + /// Truncate relation + fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>; + + /// This method is used for marking dropped relations and truncated SLRU files and aborted two phase records + fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>; + + /// Track end of the latest digested WAL record. + /// + /// Advance requires aligned LSN as an argument and would wake wait_lsn() callers. + /// Previous last record LSN is stored alongside the latest and can be read. 
+ fn advance_last_record_lsn(&self, lsn: Lsn); +} + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct WALRecord { - pub lsn: Lsn, // LSN at the *end* of the record pub will_init: bool, pub rec: Bytes, // Remember the offset of main_data in rec, @@ -182,22 +196,19 @@ pub struct WALRecord { impl WALRecord { pub fn pack(&self, buf: &mut BytesMut) { - buf.put_u64(self.lsn.0); buf.put_u8(self.will_init as u8); buf.put_u32(self.main_data_offset); buf.put_u32(self.rec.len() as u32); buf.put_slice(&self.rec[..]); } pub fn unpack(buf: &mut Bytes) -> WALRecord { - let lsn = Lsn::from(buf.get_u64()); let will_init = buf.get_u8() != 0; let main_data_offset = buf.get_u32(); - let mut dst = vec![0u8; buf.get_u32() as usize]; - buf.copy_to_slice(&mut dst); + let rec_len = buf.get_u32() as usize; + let rec = buf.split_to(rec_len); WALRecord { - lsn, will_init, - rec: Bytes::from(dst), + rec, main_data_offset, } } @@ -210,7 +221,7 @@ impl WALRecord { #[cfg(test)] mod tests { use super::*; - use crate::layered_repository::LayeredRepository; + use crate::layered_repository::{LayeredRepository, METADATA_FILE_NAME}; use crate::walredo::{WalRedoError, WalRedoManager}; use crate::PageServerConf; use hex_literal::hex; @@ -307,14 +318,15 @@ mod tests { // Create timeline to work on let tline = repo.create_empty_timeline(TIMELINE_ID)?; + let writer = tline.writer(); - tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?; - tline.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?; - tline.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?; + writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; + writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; + writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo 
blk 0 at 3"))?; + writer.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?; + writer.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?; - tline.advance_last_record_lsn(Lsn(0x50)); + writer.advance_last_record_lsn(Lsn(0x50)); assert_current_logical_size(&tline, Lsn(0x50)); @@ -360,8 +372,8 @@ mod tests { ); // Truncate last block - tline.put_truncation(TESTREL_A, Lsn(0x60), 2)?; - tline.advance_last_record_lsn(Lsn(0x60)); + writer.put_truncation(TESTREL_A, Lsn(0x60), 2)?; + writer.advance_last_record_lsn(Lsn(0x60)); assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation @@ -383,13 +395,13 @@ mod tests { ); // Truncate to zero length - tline.put_truncation(TESTREL_A, Lsn(0x68), 0)?; - tline.advance_last_record_lsn(Lsn(0x68)); + writer.put_truncation(TESTREL_A, Lsn(0x68), 0)?; + writer.advance_last_record_lsn(Lsn(0x68)); assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x68))?.unwrap(), 0); // Extend from 0 to 2 blocks, leaving a gap - tline.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?; - tline.advance_last_record_lsn(Lsn(0x70)); + writer.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?; + writer.advance_last_record_lsn(Lsn(0x70)); assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2); assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE); assert_eq!( @@ -424,25 +436,26 @@ mod tests { // Create timeline to work on let tline = repo.create_empty_timeline(TIMELINE_ID)?; + let writer = tline.writer(); - tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - tline.advance_last_record_lsn(Lsn(0x20)); + writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; + writer.advance_last_record_lsn(Lsn(0x20)); // Check that rel exists and size is correct assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1); // Drop 
relish - tline.drop_relish(TESTREL_A, Lsn(0x30))?; - tline.advance_last_record_lsn(Lsn(0x30)); + writer.drop_relish(TESTREL_A, Lsn(0x30))?; + writer.advance_last_record_lsn(Lsn(0x30)); // Check that rel is not visible anymore assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false); assert!(tline.get_relish_size(TESTREL_A, Lsn(0x30))?.is_none()); // Extend it again - tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?; - tline.advance_last_record_lsn(Lsn(0x40)); + writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?; + writer.advance_last_record_lsn(Lsn(0x40)); // Check that rel exists and size is correct assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true); @@ -460,6 +473,7 @@ mod tests { // Create timeline to work on let tline = repo.create_empty_timeline(TIMELINE_ID)?; + let writer = tline.writer(); //from storage_layer.rs const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192; @@ -469,10 +483,10 @@ mod tests { for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); - tline.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?; + writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?; } - tline.advance_last_record_lsn(Lsn(0x20)); + writer.advance_last_record_lsn(Lsn(0x20)); // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); @@ -496,8 +510,8 @@ mod tests { // Truncate relation so that second segment was dropped // - only leave one page - tline.put_truncation(TESTREL_A, Lsn(0x60), 1)?; - tline.advance_last_record_lsn(Lsn(0x60)); + writer.put_truncation(TESTREL_A, Lsn(0x60), 1)?; + writer.advance_last_record_lsn(Lsn(0x60)); // Check reported size and contents after truncation assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 1); @@ -530,9 +544,9 @@ mod tests { for blkno in 0..relsize { let lsn = Lsn(0x80); let data = format!("foo blk {} at {}", blkno, lsn); - tline.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?; + writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?; } - tline.advance_last_record_lsn(Lsn(0x80)); + writer.advance_last_record_lsn(Lsn(0x80)); assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true); assert_eq!( @@ -558,14 +572,15 @@ mod tests { fn test_large_rel() -> Result<()> { let repo = RepoHarness::create("test_large_rel")?.load(); let tline = repo.create_empty_timeline(TIMELINE_ID)?; + let writer = tline.writer(); let mut lsn = 0x10; for blknum in 0..pg_constants::RELSEG_SIZE + 1 { let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); lsn += 0x10; - tline.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?; + writer.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?; } - tline.advance_last_record_lsn(Lsn(lsn)); + writer.advance_last_record_lsn(Lsn(lsn)); assert_current_logical_size(&tline, Lsn(lsn)); @@ -576,8 +591,8 @@ mod tests { // Truncate one block lsn += 0x10; - tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?; - tline.advance_last_record_lsn(Lsn(lsn)); + writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?; + writer.advance_last_record_lsn(Lsn(lsn)); assert_eq!( tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), pg_constants::RELSEG_SIZE @@ -586,8 +601,8 @@ mod tests { // Truncate another 
block lsn += 0x10; - tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?; - tline.advance_last_record_lsn(Lsn(lsn)); + writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?; + writer.advance_last_record_lsn(Lsn(lsn)); assert_eq!( tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), pg_constants::RELSEG_SIZE - 1 @@ -599,8 +614,8 @@ mod tests { let mut size: i32 = 3000; while size >= 0 { lsn += 0x10; - tline.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?; - tline.advance_last_record_lsn(Lsn(lsn)); + writer.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?; + writer.advance_last_record_lsn(Lsn(lsn)); assert_eq!( tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), size as u32 @@ -620,16 +635,17 @@ mod tests { fn test_list_rels_drop() -> Result<()> { let repo = RepoHarness::create("test_list_rels_drop")?.load(); let tline = repo.create_empty_timeline(TIMELINE_ID)?; + let writer = tline.writer(); const TESTDB: u32 = 111; // Import initial dummy checkpoint record, otherwise the get_timeline() call // after branching fails below - tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?; + writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?; // Create a relation on the timeline - tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; + writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - tline.advance_last_record_lsn(Lsn(0x30)); + writer.advance_last_record_lsn(Lsn(0x30)); // Check that list_rels() lists it after LSN 2, but no before it assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A)); @@ -639,14 +655,17 @@ mod tests { // Create a branch, check that the relation is visible there repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; let newtline = repo.get_timeline(NEW_TIMELINE_ID)?; + let new_writer = newtline.writer(); assert!(newtline .list_rels(0, TESTDB, Lsn(0x30))? 
.contains(&TESTREL_A)); // Drop it on the branch - newtline.drop_relish(TESTREL_A, Lsn(0x40))?; - newtline.advance_last_record_lsn(Lsn(0x40)); + new_writer.drop_relish(TESTREL_A, Lsn(0x40))?; + new_writer.advance_last_record_lsn(Lsn(0x40)); + + drop(new_writer); // Check that it's no longer listed on the branch after the point where it was dropped assert!(newtline @@ -674,28 +693,30 @@ mod tests { fn test_branch() -> Result<()> { let repo = RepoHarness::create("test_branch")?.load(); let tline = repo.create_empty_timeline(TIMELINE_ID)?; + let writer = tline.writer(); // Import initial dummy checkpoint record, otherwise the get_timeline() call // after branching fails below - tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?; + writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?; // Create a relation on the timeline - tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?; - tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?; + writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; + writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?; + writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?; // Create another relation - tline.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?; + writer.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?; - tline.advance_last_record_lsn(Lsn(0x40)); + writer.advance_last_record_lsn(Lsn(0x40)); assert_current_logical_size(&tline, Lsn(0x40)); // Branch the history, modify relation differently on the new timeline repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; let newtline = repo.get_timeline(NEW_TIMELINE_ID)?; + let new_writer = newtline.writer(); - newtline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?; - 
newtline.advance_last_record_lsn(Lsn(0x40)); + new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?; + new_writer.advance_last_record_lsn(Lsn(0x40)); // Check page contents on both branches assert_eq!( @@ -729,7 +750,7 @@ mod tests { repo.create_empty_timeline(TIMELINE_ID)?; drop(repo); - let metadata_path = harness.timeline_path(&TIMELINE_ID).join("metadata"); + let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); assert!(metadata_path.is_file()); @@ -811,7 +832,7 @@ mod tests { blknum: u32, lsn: Lsn, base_img: Option, - records: Vec, + records: Vec<(Lsn, WALRecord)>, ) -> Result { let s = format!( "redo for {} blk {} to get to {}, with {} and {} records", diff --git a/pageserver/src/restore_local_repo.rs b/pageserver/src/restore_local_repo.rs index dfe3edd7ac..8afa2676e2 100644 --- a/pageserver/src/restore_local_repo.rs +++ b/pageserver/src/restore_local_repo.rs @@ -2,17 +2,17 @@ //! Import data and WAL from a PostgreSQL data directory and WAL segments into //! zenith Timeline. //! -use log::*; use postgres_ffi::nonrelfile_utils::clogpage_precedes; use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment; use std::cmp::min; use std::fs; use std::fs::File; -use std::io::Read; -use std::path::Path; +use std::io::{Read, Seek, SeekFrom}; +use std::path::{Path, PathBuf}; -use anyhow::{bail, Result}; +use anyhow::{anyhow, bail, Result}; use bytes::{Buf, Bytes}; +use tracing::*; use crate::relish::*; use crate::repository::*; @@ -34,9 +34,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); /// pub fn import_timeline_from_postgres_datadir( path: &Path, - timeline: &dyn Timeline, + writer: &dyn TimelineWriter, lsn: Lsn, ) -> Result<()> { + let mut pg_control: Option = None; + // Scan 'global' for direntry in fs::read_dir(path.join("global"))? 
{ let direntry = direntry?; @@ -44,10 +46,10 @@ pub fn import_timeline_from_postgres_datadir( None => continue, Some("pg_control") => { - import_control_file(timeline, lsn, &direntry.path())?; + pg_control = Some(import_control_file(writer, lsn, &direntry.path())?); } Some("pg_filenode.map") => import_nonrel_file( - timeline, + writer, lsn, RelishTag::FileNodeMap { spcnode: pg_constants::GLOBALTABLESPACE_OID, @@ -59,7 +61,7 @@ pub fn import_timeline_from_postgres_datadir( // Load any relation files into the page server _ => import_relfile( &direntry.path(), - timeline, + writer, lsn, pg_constants::GLOBALTABLESPACE_OID, 0, @@ -86,7 +88,7 @@ pub fn import_timeline_from_postgres_datadir( Some("PG_VERSION") => continue, Some("pg_filenode.map") => import_nonrel_file( - timeline, + writer, lsn, RelishTag::FileNodeMap { spcnode: pg_constants::DEFAULTTABLESPACE_OID, @@ -98,7 +100,7 @@ pub fn import_timeline_from_postgres_datadir( // Load any relation files into the page server _ => import_relfile( &direntry.path(), - timeline, + writer, lsn, pg_constants::DEFAULTTABLESPACE_OID, dboid, @@ -108,24 +110,36 @@ pub fn import_timeline_from_postgres_datadir( } for entry in fs::read_dir(path.join("pg_xact"))? { let entry = entry?; - import_slru_file(timeline, lsn, SlruKind::Clog, &entry.path())?; + import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?; } for entry in fs::read_dir(path.join("pg_multixact").join("members"))? { let entry = entry?; - import_slru_file(timeline, lsn, SlruKind::MultiXactMembers, &entry.path())?; + import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?; } for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? { let entry = entry?; - import_slru_file(timeline, lsn, SlruKind::MultiXactOffsets, &entry.path())?; + import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?; } for entry in fs::read_dir(path.join("pg_twophase"))? 
{ let entry = entry?; let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?; - import_nonrel_file(timeline, lsn, RelishTag::TwoPhase { xid }, &entry.path())?; + import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?; } // TODO: Scan pg_tblspc - timeline.advance_last_record_lsn(lsn); + writer.advance_last_record_lsn(lsn); + + // Import WAL. This is needed even when starting from a shutdown checkpoint, because + // this reads the checkpoint record itself, advancing the tip of the timeline to + // *after* the checkpoint record. And crucially, it initializes the 'prev_lsn' + let pg_control = pg_control.ok_or_else(|| anyhow!("pg_control file not found"))?; + import_wal( + &path.join("pg_wal"), + writer, + Lsn(pg_control.checkPointCopy.redo), + lsn, + &mut pg_control.checkPointCopy.clone(), + )?; Ok(()) } @@ -133,12 +147,13 @@ pub fn import_timeline_from_postgres_datadir( // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. fn import_relfile( path: &Path, - timeline: &dyn Timeline, + timeline: &dyn TimelineWriter, lsn: Lsn, spcoid: Oid, dboid: Oid, ) -> Result<()> { // Does it look like a relation file? + trace!("importing rel file {}", path.display()); let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap()); if let Err(e) = p { @@ -166,14 +181,14 @@ fn import_relfile( } // TODO: UnexpectedEof is expected - Err(e) => match e.kind() { + Err(err) => match err.kind() { std::io::ErrorKind::UnexpectedEof => { // reached EOF. That's expected. // FIXME: maybe check that we read the full length of the file? break; } _ => { - bail!("error reading file {}: {:#}", path.display(), e); + bail!("error reading file {}: {:#}", path.display(), err); } }, }; @@ -190,7 +205,7 @@ fn import_relfile( /// are just slurped into the repository as one blob. 
/// fn import_nonrel_file( - timeline: &dyn Timeline, + timeline: &dyn TimelineWriter, lsn: Lsn, tag: RelishTag, path: &Path, @@ -200,7 +215,7 @@ fn import_nonrel_file( // read the whole file file.read_to_end(&mut buffer)?; - info!("importing non-rel file {}", path.display()); + trace!("importing non-rel file {}", path.display()); timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?; Ok(()) @@ -211,13 +226,17 @@ fn import_nonrel_file( /// /// The control file is imported as is, but we also extract the checkpoint record /// from it and store it separated. -fn import_control_file(timeline: &dyn Timeline, lsn: Lsn, path: &Path) -> Result<()> { +fn import_control_file( + timeline: &dyn TimelineWriter, + lsn: Lsn, + path: &Path, +) -> Result { let mut file = File::open(path)?; let mut buffer = Vec::new(); // read the whole file file.read_to_end(&mut buffer)?; - info!("importing control file {}", path.display()); + trace!("importing control file {}", path.display()); // Import it as ControlFile timeline.put_page_image( @@ -232,19 +251,24 @@ fn import_control_file(timeline: &dyn Timeline, lsn: Lsn, path: &Path) -> Result let checkpoint_bytes = pg_control.checkPointCopy.encode(); timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?; - Ok(()) + Ok(pg_control) } /// /// Import an SLRU segment file /// -fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Path) -> Result<()> { +fn import_slru_file( + timeline: &dyn TimelineWriter, + lsn: Lsn, + slru: SlruKind, + path: &Path, +) -> Result<()> { // Does it look like an SLRU file? 
let mut file = File::open(path)?; let mut buf: [u8; 8192] = [0u8; 8192]; let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?; - info!("importing slru file {}", path.display()); + trace!("importing slru file {}", path.display()); let mut rpageno = 0; loop { @@ -260,14 +284,14 @@ fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Pa } // TODO: UnexpectedEof is expected - Err(e) => match e.kind() { + Err(err) => match err.kind() { std::io::ErrorKind::UnexpectedEof => { // reached EOF. That's expected. // FIXME: maybe check that we read the full length of the file? break; } _ => { - bail!("error reading file {}: {:#}", path.display(), e); + bail!("error reading file {}: {:#}", path.display(), err); } }, }; @@ -279,18 +303,119 @@ fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Pa Ok(()) } +/// Scan PostgreSQL WAL files in given directory and load all records between +/// 'startpoint' and 'endpoint' into the repository. 
+fn import_wal( + walpath: &Path, + timeline: &dyn TimelineWriter, + startpoint: Lsn, + endpoint: Lsn, + checkpoint: &mut CheckPoint, +) -> Result<()> { + let mut waldecoder = WalStreamDecoder::new(startpoint); + + let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE); + let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE); + let mut last_lsn = startpoint; + + while last_lsn <= endpoint { + // FIXME: assume postgresql tli 1 for now + let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE); + let mut buf = Vec::new(); + + // Read local file + let mut path = walpath.join(&filename); + + // It could be as .partial + if !PathBuf::from(&path).exists() { + path = walpath.join(filename + ".partial"); + } + + // Slurp the WAL file + let mut file = File::open(&path)?; + + if offset > 0 { + file.seek(SeekFrom::Start(offset as u64))?; + } + + let nread = file.read_to_end(&mut buf)?; + if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize { + // Maybe allow this for .partial files? + error!("read only {} bytes from WAL file", nread); + } + + waldecoder.feed_bytes(&buf); + + let mut nrecords = 0; + while last_lsn <= endpoint { + if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ + let mut checkpoint_modified = false; + + let decoded = decode_wal_record(recdata.clone()); + save_decoded_record( + checkpoint, + &mut checkpoint_modified, + timeline, + &decoded, + recdata, + lsn, + )?; + last_lsn = lsn; + + if checkpoint_modified { + let checkpoint_bytes = checkpoint.encode(); + timeline.put_page_image( + RelishTag::Checkpoint, + 0, + last_lsn, + checkpoint_bytes, + )?; + } + + // Now that this record has been fully handled, including updating the + // checkpoint data, let the repository know that it is up-to-date to this LSN + timeline.advance_last_record_lsn(last_lsn); + nrecords += 1; + + trace!("imported record at {} (end {})", lsn, endpoint); + } + } + + debug!("imported {} records up to {}", nrecords, last_lsn); + + segno += 1; + offset = 0; + } + + if last_lsn != startpoint { + debug!( + "reached end of WAL at {}, updating checkpoint info", + last_lsn + ); + + timeline.advance_last_record_lsn(last_lsn); + } else { + info!("no WAL to import at {}", last_lsn); + } + + Ok(()) +} + /// /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the /// relations/pages that the record affects. /// pub fn save_decoded_record( checkpoint: &mut CheckPoint, - timeline: &dyn Timeline, + checkpoint_modified: &mut bool, + timeline: &dyn TimelineWriter, decoded: &DecodedWALRecord, recdata: Bytes, lsn: Lsn, ) -> Result<()> { - checkpoint.update_next_xid(decoded.xl_xid); + if checkpoint.update_next_xid(decoded.xl_xid) { + *checkpoint_modified = true; + } // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
@@ -303,13 +428,12 @@ pub fn save_decoded_record( }); let rec = WALRecord { - lsn, will_init: blk.will_init || blk.apply_image, rec: recdata.clone(), main_data_offset: decoded.main_data_offset as u32, }; - timeline.put_wal_record(tag, blk.blkno, rec)?; + timeline.put_wal_record(lsn, tag, blk.blkno, rec)?; } let mut buf = decoded.record.clone(); @@ -374,7 +498,7 @@ pub fn save_decoded_record( } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - save_clog_truncate_record(checkpoint, timeline, lsn, &xlrec)?; + save_clog_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; @@ -443,10 +567,17 @@ pub fn save_decoded_record( )?; } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut buf); - save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?; + save_multixact_create_record( + checkpoint, + checkpoint_modified, + timeline, + lsn, + &xlrec, + decoded, + )?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - save_multixact_truncate_record(checkpoint, timeline, lsn, &xlrec)?; + save_multixact_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); @@ -455,7 +586,10 @@ pub fn save_decoded_record( let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_NEXTOID { let next_oid = buf.get_u32_le(); - checkpoint.nextOid = next_oid; + if checkpoint.nextOid != next_oid { + checkpoint.nextOid = next_oid; + *checkpoint_modified = true; + } } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { @@ -471,6 +605,7 @@ pub fn save_decoded_record( ); if 
(checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 { checkpoint.oldestXid = xlog_checkpoint.oldestXid; + *checkpoint_modified = true; } } } @@ -478,7 +613,11 @@ pub fn save_decoded_record( } /// Subroutine of save_decoded_record(), to handle an XLOG_DBASE_CREATE record. -fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatabase) -> Result<()> { +fn save_xlog_dbase_create( + timeline: &dyn TimelineWriter, + lsn: Lsn, + rec: &XlCreateDatabase, +) -> Result<()> { let db_id = rec.db_id; let tablespace_id = rec.tablespace_id; let src_db_id = rec.src_db_id; @@ -555,7 +694,11 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab /// Subroutine of save_decoded_record(), to handle an XLOG_SMGR_TRUNCATE record. /// /// This is the same logic as in PostgreSQL's smgr_redo() function. -fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTruncate) -> Result<()> { +fn save_xlog_smgr_truncate( + timeline: &dyn TimelineWriter, + lsn: Lsn, + rec: &XlSmgrTruncate, +) -> Result<()> { let spcnode = rec.rnode.spcnode; let dbnode = rec.rnode.dbnode; let relnode = rec.rnode.relnode; @@ -617,7 +760,7 @@ fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTrunca /// Subroutine of save_decoded_record(), to handle an XLOG_XACT_* records. 
/// fn save_xact_record( - timeline: &dyn Timeline, + timeline: &dyn TimelineWriter, lsn: Lsn, parsed: &XlXactParsedRecord, decoded: &DecodedWALRecord, @@ -628,12 +771,12 @@ fn save_xact_record( let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; let rec = WALRecord { - lsn, will_init: false, rec: decoded.record.clone(), main_data_offset: decoded.main_data_offset as u32, }; timeline.put_wal_record( + lsn, RelishTag::Slru { slru: SlruKind::Clog, segno, @@ -649,6 +792,7 @@ fn save_xact_record( let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; timeline.put_wal_record( + lsn, RelishTag::Slru { slru: SlruKind::Clog, segno, @@ -674,7 +818,8 @@ fn save_xact_record( fn save_clog_truncate_record( checkpoint: &mut CheckPoint, - timeline: &dyn Timeline, + checkpoint_modified: &mut bool, + timeline: &dyn TimelineWriter, lsn: Lsn, xlrec: &XlClogTruncate, ) -> Result<()> { @@ -692,6 +837,7 @@ fn save_clog_truncate_record( // TODO Figure out if there will be any issues with replica. 
checkpoint.oldestXid = xlrec.oldest_xid; checkpoint.oldestXidDB = xlrec.oldest_xid_db; + *checkpoint_modified = true; // TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it @@ -734,13 +880,13 @@ fn save_clog_truncate_record( fn save_multixact_create_record( checkpoint: &mut CheckPoint, - timeline: &dyn Timeline, + checkpoint_modified: &mut bool, + timeline: &dyn TimelineWriter, lsn: Lsn, xlrec: &XlMultiXactCreate, decoded: &DecodedWALRecord, ) -> Result<()> { let rec = WALRecord { - lsn, will_init: false, rec: decoded.record.clone(), main_data_offset: decoded.main_data_offset as u32, @@ -749,6 +895,7 @@ fn save_multixact_create_record( let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; timeline.put_wal_record( + lsn, RelishTag::Slru { slru: SlruKind::MultiXactOffsets, segno, @@ -768,6 +915,7 @@ fn save_multixact_create_record( let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; timeline.put_wal_record( + lsn, RelishTag::Slru { slru: SlruKind::MultiXactMembers, segno, @@ -790,9 +938,11 @@ fn save_multixact_create_record( } if xlrec.mid >= checkpoint.nextMulti { checkpoint.nextMulti = xlrec.mid + 1; + *checkpoint_modified = true; } if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset { checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers; + *checkpoint_modified = true; } let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| { if mbr.xid.wrapping_sub(acc) as i32 > 0 { @@ -802,18 +952,22 @@ fn save_multixact_create_record( } }); - checkpoint.update_next_xid(max_mbr_xid); + if checkpoint.update_next_xid(max_mbr_xid) { + *checkpoint_modified = true; + } Ok(()) } fn save_multixact_truncate_record( checkpoint: &mut CheckPoint, - timeline: &dyn Timeline, + checkpoint_modified: &mut bool, + timeline: &dyn TimelineWriter, lsn: Lsn, xlrec: &XlMultiXactTruncate, ) -> Result<()> { checkpoint.oldestMulti = 
xlrec.end_trunc_off; checkpoint.oldestMultiDB = xlrec.oldest_multi_db; + *checkpoint_modified = true; // PerformMembersTruncation let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET); @@ -847,7 +1001,7 @@ fn save_multixact_truncate_record( } fn save_relmap_page( - timeline: &dyn Timeline, + timeline: &dyn TimelineWriter, lsn: Lsn, xlrec: &XlRelmapUpdate, decoded: &DecodedWALRecord, diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 4eb46ba71a..be3a36fda4 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -8,35 +8,92 @@ use crate::walredo::PostgresRedoManager; use crate::PageServerConf; use anyhow::{anyhow, bail, Context, Result}; use lazy_static::lazy_static; -use log::info; -use std::collections::hash_map::Entry; +use log::{debug, info}; use std::collections::HashMap; +use std::fmt; use std::fs; use std::str::FromStr; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex, MutexGuard}; +use std::thread::JoinHandle; use zenith_utils::zid::{ZTenantId, ZTimelineId}; lazy_static! { - static ref REPOSITORY: Mutex>> = - Mutex::new(HashMap::new()); + static ref TENANTS: Mutex> = Mutex::new(HashMap::new()); } -fn access_repository() -> MutexGuard<'static, HashMap>> { - REPOSITORY.lock().unwrap() +struct Tenant { + state: TenantState, + repo: Option>, } -pub fn init(conf: &'static PageServerConf) { - let mut m = access_repository(); - for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() { - let tenantid = - ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap(); - let repo = init_repo(conf, tenantid); - info!("initialized storage for tenant: {}", &tenantid); - m.insert(tenantid, repo); +#[derive(Debug)] +enum TenantState { + // This tenant only exists in cloud storage. It cannot be accessed. + CloudOnly, + // This tenant exists in cloud storage, and we are currently downloading it to local disk. 
+ // It cannot be accessed yet, not until it's been fully downloaded to local disk. + Downloading, + // All data for this tenant is complete on local disk, but we haven't loaded the Repository, + // Timeline and Layer structs into memory yet, so it cannot be accessed yet. + //Ready, + // This tenant exists on local disk, and the layer map has been loaded into memory. + // The local disk might have some newer files that don't exist in cloud storage yet. + Active, + // This tenant exists on local disk, and the layer map has been loaded into memory. + // The local disk might have some newer files that don't exist in cloud storage yet. + // The tenant cannot be accessed anymore for any reason, but graceful shutdown. + //Stopping, +} + +impl fmt::Display for TenantState { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + TenantState::CloudOnly => f.write_str("CloudOnly"), + TenantState::Downloading => f.write_str("Downloading"), + TenantState::Active => f.write_str("Active"), + } } } -fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Arc { +fn access_tenants() -> MutexGuard<'static, HashMap> { + TENANTS.lock().unwrap() +} + +struct TenantHandleEntry { + checkpointer_handle: Option>, + gc_handle: Option>, +} + +// Logically these handles belong to Repository, +// but it's just simpler to store them separately +lazy_static! 
{ + static ref TENANT_HANDLES: Mutex> = + Mutex::new(HashMap::new()); +} + +static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false); + +pub fn init(conf: &'static PageServerConf) { + for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() { + let tenantid = + ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap(); + + { + let mut m = access_tenants(); + let tenant = Tenant { + state: TenantState::CloudOnly, + repo: None, + }; + m.insert(tenantid, tenant); + } + + init_repo(conf, tenantid); + info!("initialized storage for tenant: {}", &tenantid); + } +} + +fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) { // Set up a WAL redo manager, for applying WAL records. let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); @@ -47,9 +104,22 @@ fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Arc init_timeline(o.get().as_ref(), timeline_id), - Entry::Vacant(v) => { - log::info!("New repo initialized"); - let new_repo = init_repo(conf, tenant_id); - init_timeline(new_repo.as_ref(), timeline_id); - v.insert(new_repo); + + { + let mut m = access_tenants(); + let mut tenant = m.get_mut(&tenant_id).unwrap(); + tenant.state = TenantState::Downloading; + match &tenant.repo { + Some(repo) => init_timeline(repo.as_ref(), timeline_id), + None => { + log::info!("Initialize new repo"); + } } } + + // init repo updates Tenant state + init_repo(conf, tenant_id); + let new_repo = get_repository_for_tenant(tenant_id).unwrap(); + init_timeline(new_repo.as_ref(), timeline_id); } fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) { @@ -82,29 +160,73 @@ fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) { } } +// Check this flag in the thread loops to know when to exit +pub fn shutdown_requested() -> bool { + SHUTDOWN_REQUESTED.load(Ordering::Relaxed) +} + +pub fn stop_tenant_threads(tenantid: ZTenantId) { + let mut handles = TENANT_HANDLES.lock().unwrap(); + if let Some(h) = 
handles.get_mut(&tenantid) { + h.checkpointer_handle.take().map(JoinHandle::join); + debug!("checkpointer for tenant {} has stopped", tenantid); + h.gc_handle.take().map(JoinHandle::join); + debug!("gc for tenant {} has stopped", tenantid); + } +} + +pub fn shutdown_all_tenants() -> Result<()> { + SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed); + + let tenantids = list_tenantids()?; + for tenantid in tenantids { + stop_tenant_threads(tenantid); + let repo = get_repository_for_tenant(tenantid)?; + debug!("shutdown tenant {}", tenantid); + repo.shutdown()?; + } + + Ok(()) +} + pub fn create_repository_for_tenant( conf: &'static PageServerConf, tenantid: ZTenantId, ) -> Result<()> { - let mut m = access_repository(); - - // First check that the tenant doesn't exist already - if m.get(&tenantid).is_some() { - bail!("tenant {} already exists", tenantid); + { + let mut m = access_tenants(); + // First check that the tenant doesn't exist already + if m.get(&tenantid).is_some() { + bail!("tenant {} already exists", tenantid); + } + let tenant = Tenant { + state: TenantState::CloudOnly, + repo: None, + }; + m.insert(tenantid, tenant); } + let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid)); let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?; - m.insert(tenantid, repo); + let mut m = access_tenants(); + let tenant = m.get_mut(&tenantid).unwrap(); + tenant.repo = Some(repo); + tenant.state = TenantState::Active; Ok(()) } pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result> { - access_repository() + let m = access_tenants(); + let tenant = m .get(&tenantid) - .map(Arc::clone) - .ok_or_else(|| anyhow!("repository not found for tenant name {}", tenantid)) + .ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid)); + + match &tenant.unwrap().repo { + Some(repo) => Ok(Arc::clone(repo)), + None => anyhow::bail!("Repository for tenant {} is not yet valid", tenantid), + } } pub fn get_timeline_for_tenant( @@ -115,3 
+237,13 @@ pub fn get_timeline_for_tenant( .get_timeline(timelineid) .with_context(|| format!("cannot fetch timeline {}", timelineid)) } + +fn list_tenantids() -> Result> { + let m = access_tenants(); + m.iter() + .map(|v| { + let (tenantid, _) = v; + Ok(*tenantid) + }) + .collect() +} diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs index cb94b9248b..b1e8e3b54f 100644 --- a/pageserver/src/waldecoder.rs +++ b/pageserver/src/waldecoder.rs @@ -72,6 +72,10 @@ impl WalStreamDecoder { /// Err(WalDecodeError): an error occured while decoding, meaning the input was invalid. /// pub fn poll_decode(&mut self) -> Result, WalDecodeError> { + let recordbuf; + + // Run state machine that validates page headers, and reassembles records + // that cross page boundaries. loop { // parse and verify page boundaries as we go if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 { @@ -120,29 +124,41 @@ impl WalStreamDecoder { self.lsn += self.padlen as u64; self.padlen = 0; } else if self.contlen == 0 { - // need to have at least the xl_tot_len field + assert!(self.recordbuf.is_empty()); + // need to have at least the xl_tot_len field if self.inputbuf.remaining() < 4 { return Ok(None); } - // read xl_tot_len FIXME: assumes little-endian + // peek xl_tot_len at the beginning of the record. + // FIXME: assumes little-endian self.startlsn = self.lsn; - let xl_tot_len = self.inputbuf.get_u32_le(); + let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le(); if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD { return Err(WalDecodeError { msg: format!("invalid xl_tot_len {}", xl_tot_len), lsn: self.lsn, }); } - self.lsn += 4; - self.recordbuf.clear(); - self.recordbuf.reserve(xl_tot_len as usize); - self.recordbuf.put_u32_le(xl_tot_len); - - self.contlen = xl_tot_len - 4; - continue; + // Fast path for the common case that the whole record fits on the page. 
+ let pageleft = self.lsn.remaining_in_block() as u32; + if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft { + // Take the record from the 'inputbuf', and validate it. + recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize); + self.lsn += xl_tot_len as u64; + break; + } else { + // Need to assemble the record from pieces. Remember the size of the + // record, and loop back. On next iteration, we will reach the 'else' + // branch below, and copy the part of the record that was on this page + // to 'recordbuf'. Subsequent iterations will skip page headers, and + // append the continuations from the next pages to 'recordbuf'. + self.recordbuf.reserve(xl_tot_len as usize); + self.contlen = xl_tot_len; + continue; + } } else { // we're continuing a record, possibly from previous page. let pageleft = self.lsn.remaining_in_block() as u32; @@ -159,47 +175,42 @@ impl WalStreamDecoder { self.contlen -= n as u32; if self.contlen == 0 { - let recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()); - - let recordbuf = recordbuf.freeze(); - let mut buf = recordbuf.clone(); - - let xlogrec = XLogRecord::from_bytes(&mut buf); - - // XLOG_SWITCH records are special. If we see one, we need to skip - // to the next WAL segment. - if xlogrec.is_xlog_switch_record() { - trace!("saw xlog switch record at {}", self.lsn); - self.padlen = - self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32; - } else { - // Pad to an 8-byte boundary - self.padlen = self.lsn.calc_padding(8u32) as u32; - } - - let mut crc = crc32c_append(0, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]); - crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]); - if crc != xlogrec.xl_crc { - return Err(WalDecodeError { - msg: "WAL record crc mismatch".into(), - lsn: self.lsn, - }); - } - - // Always align resulting LSN on 0x8 boundary -- that is important for getPage() - // and WalReceiver integration. 
Since this code is used both for WalReceiver and - // initial WAL import let's force alignment right here. - let result = (self.lsn.align(), recordbuf); - return Ok(Some(result)); + // The record is now complete. + recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze(); + break; } continue; } } - // check record boundaries - // deal with continuation records + // We now have a record in the 'recordbuf' local variable. + let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]); - // deal with xlog_switch records + let mut crc = 0; + crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]); + crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]); + if crc != xlogrec.xl_crc { + return Err(WalDecodeError { + msg: "WAL record crc mismatch".into(), + lsn: self.lsn, + }); + } + + // XLOG_SWITCH records are special. If we see one, we need to skip + // to the next WAL segment. + if xlogrec.is_xlog_switch_record() { + trace!("saw xlog switch record at {}", self.lsn); + self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32; + } else { + // Pad to an 8-byte boundary + self.padlen = self.lsn.calc_padding(8u32) as u32; + } + + // Always align resulting LSN on 0x8 boundary -- that is important for getPage() + // and WalReceiver integration. Since this code is used both for WalReceiver and + // initial WAL import let's force alignment right here. 
+ let result = (self.lsn.align(), recordbuf); + Ok(Some(result)) } } diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index d7bdfd6f2e..65b3fa5cf6 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -12,7 +12,6 @@ use crate::waldecoder::*; use crate::PageServerConf; use anyhow::{bail, Error, Result}; use lazy_static::lazy_static; -use log::*; use postgres::fallible_iterator::FallibleIterator; use postgres::replication::ReplicationIter; use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow}; @@ -25,8 +24,10 @@ use std::str::FromStr; use std::sync::Mutex; use std::thread; use std::thread::sleep; +use std::thread::JoinHandle; use std::thread_local; use std::time::{Duration, SystemTime}; +use tracing::*; use zenith_utils::lsn::Lsn; use zenith_utils::zid::ZTenantId; use zenith_utils::zid::ZTimelineId; @@ -36,6 +37,7 @@ use zenith_utils::zid::ZTimelineId; // struct WalReceiverEntry { wal_producer_connstr: String, + wal_receiver_handle: Option>, } lazy_static! { @@ -50,6 +52,19 @@ thread_local! { pub(crate) static IS_WAL_RECEIVER: Cell = Cell::new(false); } +// Wait for walreceiver to stop +// Now it stops when pageserver shutdown is requested. +// In future we can make this more granular and send shutdown signals +// per tenant/timeline to cancel inactive walreceivers. 
+// TODO deal with blocking pg connections +pub fn stop_wal_receiver(timelineid: ZTimelineId) { + let mut receivers = WAL_RECEIVERS.lock().unwrap(); + if let Some(r) = receivers.get_mut(&timelineid) { + r.wal_receiver_handle.take(); + // r.wal_receiver_handle.take().map(JoinHandle::join); + } +} + // Launch a new WAL receiver, or tell one that's running about change in connection string pub fn launch_wal_receiver( conf: &'static PageServerConf, @@ -64,19 +79,19 @@ pub fn launch_wal_receiver( receiver.wal_producer_connstr = wal_producer_connstr.into(); } None => { - let receiver = WalReceiverEntry { - wal_producer_connstr: wal_producer_connstr.into(), - }; - receivers.insert(timelineid, receiver); - - // Also launch a new thread to handle this connection - let _walreceiver_thread = thread::Builder::new() + let wal_receiver_handle = thread::Builder::new() .name("WAL receiver thread".into()) .spawn(move || { IS_WAL_RECEIVER.with(|c| c.set(true)); thread_main(conf, timelineid, tenantid); }) .unwrap(); + + let receiver = WalReceiverEntry { + wal_producer_connstr: wal_producer_connstr.into(), + wal_receiver_handle: Some(wal_receiver_handle), + }; + receivers.insert(timelineid, receiver); } }; } @@ -96,16 +111,14 @@ fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String { // This is the entry point for the WAL receiver thread. // fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId) { - info!( - "WAL receiver thread started for timeline : '{}'", - timelineid - ); + let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered(); + info!("WAL receiver thread started"); // // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server, // and start streaming WAL from it. If the connection is lost, keep retrying. 
// - loop { + while !tenant_mgr::shutdown_requested() { // Look up the current WAL producer address let wal_producer_connstr = get_wal_producer_connstr(timelineid); @@ -119,6 +132,7 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: sleep(Duration::from_secs(1)); } } + debug!("WAL streaming shut down"); } fn walreceiver_main( @@ -169,8 +183,8 @@ fn walreceiver_main( startpoint += startpoint.calc_padding(8u32); info!( - "last_record_lsn {} starting replication from {} for timeline {}, server is at {}...", - last_rec_lsn, startpoint, timelineid, end_of_wal + "last_record_lsn {} starting replication from {}, server is at {}...", + last_rec_lsn, startpoint, end_of_wal ); let query = format!("START_REPLICATION PHYSICAL {}", startpoint); @@ -198,27 +212,32 @@ fn walreceiver_main( waldecoder.feed_bytes(data); while let Some((lsn, recdata)) = waldecoder.poll_decode()? { - // Save old checkpoint value to compare with it after decoding WAL record - let old_checkpoint_bytes = checkpoint.encode(); - let decoded = decode_wal_record(recdata.clone()); + let _enter = info_span!("processing record", lsn = %lsn).entered(); // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are // at risk of hittind a deadlock. 
assert!(lsn.is_aligned()); + let writer = timeline.writer(); + + let mut checkpoint_modified = false; + + let decoded = decode_wal_record(recdata.clone()); restore_local_repo::save_decoded_record( &mut checkpoint, - &*timeline, + &mut checkpoint_modified, + writer.as_ref(), &decoded, recdata, lsn, )?; - let new_checkpoint_bytes = checkpoint.encode(); // Check if checkpoint data was updated by save_decoded_record - if new_checkpoint_bytes != old_checkpoint_bytes { - timeline.put_page_image( + if checkpoint_modified { + let new_checkpoint_bytes = checkpoint.encode(); + + writer.put_page_image( RelishTag::Checkpoint, 0, lsn, @@ -228,7 +247,7 @@ fn walreceiver_main( // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - timeline.advance_last_record_lsn(lsn); + writer.advance_last_record_lsn(lsn); last_rec_lsn = lsn; } @@ -275,6 +294,11 @@ fn walreceiver_main( const NO_REPLY: u8 = 0; physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?; } + + if tenant_mgr::shutdown_requested() { + debug!("stop walreceiver because pageserver shutdown is requested"); + break; + } } Ok(()) } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index f233fceb3e..8cd696e8f3 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -82,7 +82,7 @@ pub trait WalRedoManager: Send + Sync { blknum: u32, lsn: Lsn, base_img: Option, - records: Vec, + records: Vec<(Lsn, WALRecord)>, ) -> Result; } @@ -99,7 +99,7 @@ impl crate::walredo::WalRedoManager for DummyRedoManager { _blknum: u32, _lsn: Lsn, _base_img: Option, - _records: Vec, + _records: Vec<(Lsn, WALRecord)>, ) -> Result { Err(WalRedoError::InvalidState) } @@ -150,7 +150,7 @@ struct WalRedoRequest { lsn: Lsn, base_img: Option, - records: Vec, + records: Vec<(Lsn, WALRecord)>, } /// An error happened in WAL redo @@ -179,7 +179,7 @@ impl WalRedoManager for PostgresRedoManager { blknum: 
u32, lsn: Lsn, base_img: Option, - records: Vec, + records: Vec<(Lsn, WALRecord)>, ) -> Result { let start_time; let lock_time; @@ -277,7 +277,7 @@ impl PostgresRedoManager { page.extend_from_slice(&ZERO_PAGE); } // Apply all collected WAL records - for record in records { + for (_lsn, record) in records { let mut buf = record.rec.clone(); WAL_REDO_RECORD_COUNTER.inc(); @@ -544,7 +544,7 @@ impl PostgresRedoProcess { &mut self, tag: BufferTag, base_img: Option, - records: &[WALRecord], + records: &[(Lsn, WALRecord)], ) -> Result { let stdout = &mut self.stdout; // Buffer the writes to avoid a lot of small syscalls. @@ -565,22 +565,16 @@ impl PostgresRedoProcess { stdin.write_all(&build_begin_redo_for_block_msg(tag)), ) .await??; - if base_img.is_some() { - timeout( - TIMEOUT, - stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())), - ) - .await??; + if let Some(img) = base_img { + timeout(TIMEOUT, stdin.write_all(&build_push_page_msg(tag, &img))).await??; } // Send WAL records. - for rec in records.iter() { - let r = rec.clone(); - + for (lsn, rec) in records.iter() { WAL_REDO_RECORD_COUNTER.inc(); stdin - .write_all(&build_apply_record_msg(r.lsn, r.rec)) + .write_all(&build_apply_record_msg(*lsn, &rec.rec)) .await?; //debug!("sent WAL record to wal redo postgres process ({:X}/{:X}", @@ -617,58 +611,41 @@ impl PostgresRedoProcess { // process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for // explanation of the protocol. -fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes { +fn build_begin_redo_for_block_msg(tag: BufferTag) -> Vec { let len = 4 + 1 + 4 * 4; - let mut buf = BytesMut::with_capacity(1 + len); + let mut buf = Vec::with_capacity(1 + len); buf.put_u8(b'B'); buf.put_u32(len as u32); - // FIXME: this is a temporary hack that should go away when we refactor - // the postgres protocol serialization + handlers. - // - // BytesMut is a dynamic growable buffer, used a lot in tokio code but - // not in the std library. 
To write to a BytesMut from a serde serializer, - // we need to either: - // - pre-allocate the required buffer space. This is annoying because we - // shouldn't care what the exact serialized size is-- that's the - // serializer's job. - // - Or, we need to create a temporary "writer" (which implements the - // `Write` trait). It's a bit awkward, because the writer consumes the - // underlying BytesMut, and we need to extract it later with - // `into_inner`. - let mut writer = buf.writer(); - tag.ser_into(&mut writer) + tag.ser_into(&mut buf) .expect("serialize BufferTag should always succeed"); - let buf = writer.into_inner(); debug_assert!(buf.len() == 1 + len); - buf.freeze() + buf } -fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes { +fn build_push_page_msg(tag: BufferTag, base_img: &[u8]) -> Vec { assert!(base_img.len() == 8192); let len = 4 + 1 + 4 * 4 + base_img.len(); - let mut buf = BytesMut::with_capacity(1 + len); + let mut buf = Vec::with_capacity(1 + len); buf.put_u8(b'P'); buf.put_u32(len as u32); - let mut writer = buf.writer(); - tag.ser_into(&mut writer) + tag.ser_into(&mut buf) .expect("serialize BufferTag should always succeed"); - let mut buf = writer.into_inner(); buf.put(base_img); debug_assert!(buf.len() == 1 + len); - buf.freeze() + buf } -fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes { +fn build_apply_record_msg(endlsn: Lsn, rec: &[u8]) -> Vec { let len = 4 + 8 + rec.len(); - let mut buf = BytesMut::with_capacity(1 + len); + let mut buf: Vec = Vec::with_capacity(1 + len); buf.put_u8(b'A'); buf.put_u32(len as u32); @@ -677,21 +654,19 @@ fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes { debug_assert!(buf.len() == 1 + len); - buf.freeze() + buf } -fn build_get_page_msg(tag: BufferTag) -> Bytes { +fn build_get_page_msg(tag: BufferTag) -> Vec { let len = 4 + 1 + 4 * 4; - let mut buf = BytesMut::with_capacity(1 + len); + let mut buf = Vec::with_capacity(1 + len); buf.put_u8(b'G'); buf.put_u32(len as 
u32); - let mut writer = buf.writer(); - tag.ser_into(&mut writer) + tag.ser_into(&mut buf) .expect("serialize BufferTag should always succeed"); - let buf = writer.into_inner(); debug_assert!(buf.len() == 1 + len); - buf.freeze() + buf } diff --git a/postgres_ffi/src/xlog_utils.rs b/postgres_ffi/src/xlog_utils.rs index c4caa18b32..7f88de4c85 100644 --- a/postgres_ffi/src/xlog_utils.rs +++ b/postgres_ffi/src/xlog_utils.rs @@ -9,7 +9,6 @@ use crate::pg_constants; use crate::CheckPoint; -use crate::ControlFileData; use crate::FullTransactionId; use crate::XLogLongPageHeaderData; use crate::XLogPageHeaderData; @@ -18,8 +17,8 @@ use crate::XLOG_PAGE_MAGIC; use anyhow::{bail, Result}; use byteorder::{ByteOrder, LittleEndian}; +use bytes::BytesMut; use bytes::{Buf, Bytes}; -use bytes::{BufMut, BytesMut}; use crc32c::*; use log::*; use std::cmp::max; @@ -329,7 +328,12 @@ pub fn main() { } impl XLogRecord { - pub fn from_bytes(buf: &mut Bytes) -> XLogRecord { + pub fn from_slice(buf: &[u8]) -> XLogRecord { + use zenith_utils::bin_ser::LeSer; + XLogRecord::des(buf).unwrap() + } + + pub fn from_bytes(buf: &mut B) -> XLogRecord { use zenith_utils::bin_ser::LeSer; XLogRecord::des_from(&mut buf.reader()).unwrap() } @@ -377,10 +381,12 @@ impl CheckPoint { Ok(CheckPoint::des(buf)?) } - // Update next XID based on provided new_xid and stored epoch. - // Next XID should be greater than new_xid. - // Also take in account 32-bit wrap-around. - pub fn update_next_xid(&mut self, xid: u32) { + /// Update next XID based on provided new_xid and stored epoch. + /// Next XID should be greater than new_xid. This handles 32-bit + /// XID wraparound correctly. + /// + /// Returns 'true' if the XID was updated. 
+ pub fn update_next_xid(&mut self, xid: u32) -> bool { let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1); let full_xid = self.nextXid.value; let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID); @@ -391,35 +397,37 @@ impl CheckPoint { // wrap-around epoch += 1; } - self.nextXid = FullTransactionId { - value: (epoch << 32) | new_xid as u64, - }; + let nextXid = (epoch << 32) | new_xid as u64; + + if nextXid != self.nextXid.value { + self.nextXid = FullTransactionId { value: nextXid }; + return true; + } } + false } } // -// Generate new WAL segment with single XLOG_CHECKPOINT_SHUTDOWN record. +// Generate new, empty WAL segment. // We need this segment to start compute node. -// In order to minimize changes in Postgres core, we prefer to -// provide WAL segment from which is can extract checkpoint record in standard way, -// rather then implement some alternative mechanism. // -pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes { +pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes { let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize); + let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE); let hdr = XLogLongPageHeaderData { std: { XLogPageHeaderData { xlp_magic: XLOG_PAGE_MAGIC as u16, xlp_info: pg_constants::XLP_LONG_HEADER, xlp_tli: 1, // FIXME: always use Postgres timeline 1 - xlp_pageaddr: pg_control.checkPoint - XLOG_SIZE_OF_XLOG_LONG_PHD as u64, + xlp_pageaddr: pageaddr, xlp_rem_len: 0, ..Default::default() // Put 0 in padding fields. 
} }, - xlp_sysid: pg_control.system_identifier, + xlp_sysid: system_id, xlp_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32, xlp_xlog_blcksz: XLOG_BLCKSZ as u32, }; @@ -427,36 +435,6 @@ pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes { let hdr_bytes = hdr.encode(); seg_buf.extend_from_slice(&hdr_bytes); - let rec_hdr = XLogRecord { - xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD - + SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT - + SIZEOF_CHECKPOINT) as u32, - xl_xid: 0, //0 is for InvalidTransactionId - xl_prev: 0, - xl_info: pg_constants::XLOG_CHECKPOINT_SHUTDOWN, - xl_rmid: pg_constants::RM_XLOG_ID, - xl_crc: 0, - ..Default::default() // Put 0 in padding fields. - }; - - let mut rec_shord_hdr_bytes = BytesMut::new(); - rec_shord_hdr_bytes.put_u8(pg_constants::XLR_BLOCK_ID_DATA_SHORT); - rec_shord_hdr_bytes.put_u8(SIZEOF_CHECKPOINT as u8); - - let rec_bytes = rec_hdr.encode(); - let checkpoint_bytes = pg_control.checkPointCopy.encode(); - - //calculate record checksum - let mut crc = 0; - crc = crc32c_append(crc, &rec_shord_hdr_bytes[..]); - crc = crc32c_append(crc, &checkpoint_bytes[..]); - crc = crc32c_append(crc, &rec_bytes[0..XLOG_RECORD_CRC_OFFS]); - - seg_buf.extend_from_slice(&rec_bytes[0..XLOG_RECORD_CRC_OFFS]); - seg_buf.put_u32_le(crc); - seg_buf.extend_from_slice(&rec_shord_hdr_bytes); - seg_buf.extend_from_slice(&checkpoint_bytes); - //zero out the rest of the file seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0); seg_buf.freeze() diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 2b3259f8ec..1f33b68a1c 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -34,7 +34,7 @@ pub fn thread_main(state: &'static ProxyState, listener: TcpListener) -> anyhow: pub fn mgmt_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> { let mut conn_handler = MgmtHandler { state }; - let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?; + let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?; 
pgbackend.run(&mut conn_handler) } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index f246d4470a..61a742cf38 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -64,6 +64,7 @@ pub fn proxy_conn_main( socket, postgres_backend::AuthType::MD5, state.conf.ssl_config.clone(), + false, )?, md5_salt: [0u8; 4], psql_session_id: "".into(), diff --git a/test_runner/Pipfile b/test_runner/Pipfile index f5ff0d7e2b..a98acc5718 100644 --- a/test_runner/Pipfile +++ b/test_runner/Pipfile @@ -11,11 +11,14 @@ pyjwt = {extras = ["crypto"], version = "*"} requests = "*" pytest-xdist = "*" asyncpg = "*" +cached-property = "*" [dev-packages] -yapf = "*" flake8 = "*" mypy = "*" +# Behavior may change slightly between versions. These are run continuously, +# so we pin exact versions to avoid suprising breaks. Update if comfortable. +yapf = "==0.31.0" [requires] # we need at least 3.6, but pipenv doesn't allow to say this directly diff --git a/test_runner/Pipfile.lock b/test_runner/Pipfile.lock index 3c68c0ff3a..75fc17ffad 100644 --- a/test_runner/Pipfile.lock +++ b/test_runner/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "3cdc048691824d0b93912b6b78a0aa01dc98f278212c1badb0cc2edbd2103c3a" + "sha256": "3645ae8d2dcf55bd2a54963c44cfeedf577f3b289d1077365214a80a7f36e643" }, "pipfile-spec": 6, "requires": { @@ -43,94 +43,108 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==21.2.0" }, + "cached-property": { + "hashes": [ + "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130", + "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0" + ], + "index": "pypi", + "version": "==1.5.2" + }, "certifi": { "hashes": [ - "sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee", - "sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8" + "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872", + 
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569" ], - "version": "==2021.5.30" + "version": "==2021.10.8" }, "cffi": { "hashes": [ - "sha256:06c54a68935738d206570b20da5ef2b6b6d92b38ef3ec45c5422c0ebaf338d4d", - "sha256:0c0591bee64e438883b0c92a7bed78f6290d40bf02e54c5bf0978eaf36061771", - "sha256:19ca0dbdeda3b2615421d54bef8985f72af6e0c47082a8d26122adac81a95872", - "sha256:22b9c3c320171c108e903d61a3723b51e37aaa8c81255b5e7ce102775bd01e2c", - "sha256:26bb2549b72708c833f5abe62b756176022a7b9a7f689b571e74c8478ead51dc", - "sha256:33791e8a2dc2953f28b8d8d300dde42dd929ac28f974c4b4c6272cb2955cb762", - "sha256:3c8d896becff2fa653dc4438b54a5a25a971d1f4110b32bd3068db3722c80202", - "sha256:4373612d59c404baeb7cbd788a18b2b2a8331abcc84c3ba40051fcd18b17a4d5", - "sha256:487d63e1454627c8e47dd230025780e91869cfba4c753a74fda196a1f6ad6548", - "sha256:48916e459c54c4a70e52745639f1db524542140433599e13911b2f329834276a", - "sha256:4922cd707b25e623b902c86188aca466d3620892db76c0bdd7b99a3d5e61d35f", - "sha256:55af55e32ae468e9946f741a5d51f9896da6b9bf0bbdd326843fec05c730eb20", - "sha256:57e555a9feb4a8460415f1aac331a2dc833b1115284f7ded7278b54afc5bd218", - "sha256:5d4b68e216fc65e9fe4f524c177b54964af043dde734807586cf5435af84045c", - "sha256:64fda793737bc4037521d4899be780534b9aea552eb673b9833b01f945904c2e", - "sha256:6d6169cb3c6c2ad50db5b868db6491a790300ade1ed5d1da29289d73bbe40b56", - "sha256:7bcac9a2b4fdbed2c16fa5681356d7121ecabf041f18d97ed5b8e0dd38a80224", - "sha256:80b06212075346b5546b0417b9f2bf467fea3bfe7352f781ffc05a8ab24ba14a", - "sha256:818014c754cd3dba7229c0f5884396264d51ffb87ec86e927ef0be140bfdb0d2", - "sha256:8eb687582ed7cd8c4bdbff3df6c0da443eb89c3c72e6e5dcdd9c81729712791a", - "sha256:99f27fefe34c37ba9875f224a8f36e31d744d8083e00f520f133cab79ad5e819", - "sha256:9f3e33c28cd39d1b655ed1ba7247133b6f7fc16fa16887b120c0c670e35ce346", - "sha256:a8661b2ce9694ca01c529bfa204dbb144b275a31685a075ce123f12331be790b", - 
"sha256:a9da7010cec5a12193d1af9872a00888f396aba3dc79186604a09ea3ee7c029e", - "sha256:aedb15f0a5a5949ecb129a82b72b19df97bbbca024081ed2ef88bd5c0a610534", - "sha256:b315d709717a99f4b27b59b021e6207c64620790ca3e0bde636a6c7f14618abb", - "sha256:ba6f2b3f452e150945d58f4badd92310449876c4c954836cfb1803bdd7b422f0", - "sha256:c33d18eb6e6bc36f09d793c0dc58b0211fccc6ae5149b808da4a62660678b156", - "sha256:c9a875ce9d7fe32887784274dd533c57909b7b1dcadcc128a2ac21331a9765dd", - "sha256:c9e005e9bd57bc987764c32a1bee4364c44fdc11a3cc20a40b93b444984f2b87", - "sha256:d2ad4d668a5c0645d281dcd17aff2be3212bc109b33814bbb15c4939f44181cc", - "sha256:d950695ae4381ecd856bcaf2b1e866720e4ab9a1498cba61c602e56630ca7195", - "sha256:e22dcb48709fc51a7b58a927391b23ab37eb3737a98ac4338e2448bef8559b33", - "sha256:e8c6a99be100371dbb046880e7a282152aa5d6127ae01783e37662ef73850d8f", - "sha256:e9dc245e3ac69c92ee4c167fbdd7428ec1956d4e754223124991ef29eb57a09d", - "sha256:eb687a11f0a7a1839719edd80f41e459cc5366857ecbed383ff376c4e3cc6afd", - "sha256:eb9e2a346c5238a30a746893f23a9535e700f8192a68c07c0258e7ece6ff3728", - "sha256:ed38b924ce794e505647f7c331b22a693bee1538fdf46b0222c4717b42f744e7", - "sha256:f0010c6f9d1a4011e429109fda55a225921e3206e7f62a0c22a35344bfd13cca", - "sha256:f0c5d1acbfca6ebdd6b1e3eded8d261affb6ddcf2186205518f1428b8569bb99", - "sha256:f10afb1004f102c7868ebfe91c28f4a712227fe4cb24974350ace1f90e1febbf", - "sha256:f174135f5609428cc6e1b9090f9268f5c8935fddb1b25ccb8255a2d50de6789e", - "sha256:f3ebe6e73c319340830a9b2825d32eb6d8475c1dac020b4f0aa774ee3b898d1c", - "sha256:f627688813d0a4140153ff532537fbe4afea5a3dffce1f9deb7f91f848a832b5", - "sha256:fd4305f86f53dfd8cd3522269ed7fc34856a8ee3709a5e28b2836b2db9d4cd69" + "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3", + "sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2", + "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636", + "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20", 
+ "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728", + "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27", + "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66", + "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443", + "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0", + "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7", + "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39", + "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605", + "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a", + "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37", + "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029", + "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139", + "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc", + "sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df", + "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14", + "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880", + "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2", + "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a", + "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e", + "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474", + "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024", + "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8", + "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0", + "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e", + "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a", + 
"sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e", + "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032", + "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6", + "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e", + "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b", + "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e", + "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954", + "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962", + "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c", + "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4", + "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55", + "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962", + "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023", + "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c", + "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6", + "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8", + "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382", + "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7", + "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc", + "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997", + "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796" ], - "version": "==1.14.6" + "version": "==1.15.0" }, "charset-normalizer": { "hashes": [ - "sha256:5d209c0a931f215cee683b6445e2d77677e7e75e159f78def0db09d68fafcaa6", - "sha256:5ec46d183433dcbd0ab716f2d7f29d8dee50505b3fdb40c6b985c7c4f5a3591f" + "sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0", + 
"sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b" ], "markers": "python_version >= '3'", - "version": "==2.0.6" + "version": "==2.0.7" }, "cryptography": { "hashes": [ - "sha256:0a7dcbcd3f1913f664aca35d47c1331fce738d44ec34b7be8b9d332151b0b01e", - "sha256:1eb7bb0df6f6f583dd8e054689def236255161ebbcf62b226454ab9ec663746b", - "sha256:21ca464b3a4b8d8e86ba0ee5045e103a1fcfac3b39319727bc0fc58c09c6aff7", - "sha256:34dae04a0dce5730d8eb7894eab617d8a70d0c97da76b905de9efb7128ad7085", - "sha256:3520667fda779eb788ea00080124875be18f2d8f0848ec00733c0ec3bb8219fc", - "sha256:3c4129fc3fdc0fa8e40861b5ac0c673315b3c902bbdc05fc176764815b43dd1d", - "sha256:3fa3a7ccf96e826affdf1a0a9432be74dc73423125c8f96a909e3835a5ef194a", - "sha256:5b0fbfae7ff7febdb74b574055c7466da334a5371f253732d7e2e7525d570498", - "sha256:695104a9223a7239d155d7627ad912953b540929ef97ae0c34c7b8bf30857e89", - "sha256:8695456444f277af73a4877db9fc979849cd3ee74c198d04fc0776ebc3db52b9", - "sha256:94cc5ed4ceaefcbe5bf38c8fba6a21fc1d365bb8fb826ea1688e3370b2e24a1c", - "sha256:94fff993ee9bc1b2440d3b7243d488c6a3d9724cc2b09cdb297f6a886d040ef7", - "sha256:9965c46c674ba8cc572bc09a03f4c649292ee73e1b683adb1ce81e82e9a6a0fb", - "sha256:a00cf305f07b26c351d8d4e1af84ad7501eca8a342dedf24a7acb0e7b7406e14", - "sha256:a305600e7a6b7b855cd798e00278161b681ad6e9b7eca94c721d5f588ab212af", - "sha256:cd65b60cfe004790c795cc35f272e41a3df4631e2fb6b35aa7ac6ef2859d554e", - "sha256:d2a6e5ef66503da51d2110edf6c403dc6b494cc0082f85db12f54e9c5d4c3ec5", - "sha256:d9ec0e67a14f9d1d48dd87a2531009a9b251c02ea42851c060b25c782516ff06", - "sha256:f44d141b8c4ea5eb4dbc9b3ad992d45580c1d22bf5e24363f2fbf50c2d7ae8a7" + "sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6", + "sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6", + "sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c", + "sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999", + 
"sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e", + "sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992", + "sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d", + "sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588", + "sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa", + "sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d", + "sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd", + "sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d", + "sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953", + "sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2", + "sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8", + "sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6", + "sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9", + "sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6", + "sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad", + "sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76" ], - "version": "==3.4.8" + "version": "==35.0.0" }, "execnet": { "hashes": [ @@ -142,11 +156,11 @@ }, "idna": { "hashes": [ - "sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a", - "sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3" + "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", + "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" ], "markers": "python_version >= '3'", - "version": "==3.2" + "version": "==3.3" }, "iniconfig": { "hashes": [ @@ -207,11 +221,11 @@ "crypto" ], "hashes": [ - "sha256:934d73fbba91b0483d3857d1aff50e96b2a892384ee2c17417ed3203f173fca1", - "sha256:fba44e7898bbca160a2b2b501f492824fc8382485d3a6f11ba5d0c1937ce6130" + 
"sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41", + "sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f" ], "index": "pypi", - "version": "==2.1.0" + "version": "==2.3.0" }, "pyparsing": { "hashes": [ @@ -272,21 +286,21 @@ }, "urllib3": { "hashes": [ - "sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4", - "sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f" + "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece", + "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", - "version": "==1.26.6" + "version": "==1.26.7" } }, "develop": { "flake8": { "hashes": [ - "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b", - "sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907" + "sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d", + "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d" ], "index": "pypi", - "version": "==3.9.2" + "version": "==4.0.1" }, "mccabe": { "hashes": [ @@ -333,19 +347,19 @@ }, "pycodestyle": { "hashes": [ - "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068", - "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef" + "sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20", + "sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==2.7.0" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==2.8.0" }, "pyflakes": { "hashes": [ - "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3", - "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db" + 
"sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c", + "sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==2.3.1" + "version": "==2.4.0" }, "toml": { "hashes": [ diff --git a/test_runner/README.md b/test_runner/README.md index 62a95350aa..cdbf7e988d 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -53,8 +53,8 @@ Useful environment variables: should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. -Let stdout and stderr go to the terminal instead of capturing them: -`pytest -s ...` +Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them: +`pytest -s --log-cli-level=INFO ...` (Note many tests capture subprocess outputs separately, so this may not show much.) @@ -95,11 +95,13 @@ Python destructors, e.g. `__del__()` aren't recommended for cleanup. ### Code quality +We force code formatting via yapf: + +1. Install `yapf` and other tools (`flake8`, `mypy`) with `pipenv install --dev`. +1. Reformat all your code by running `pipenv run yapf -ri .` in the `test_runner/` directory. + Before submitting a patch, please consider: * Writing a couple of docstrings to clarify the reasoning behind a new test. * Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any. -* Formatting the code with `yapf -r -i .` (TODO: implement an opt-in pre-commit hook for that). * (Optional) Typechecking the code with `mypy .`. Currently this mostly affects `fixtures/zenith_fixtures.py`. - -The tools can be installed with `pipenv install --dev`. 
diff --git a/test_runner/batch_others/test_auth.py b/test_runner/batch_others/test_auth.py index 614883d4b8..9fe7567902 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/batch_others/test_auth.py @@ -1,4 +1,3 @@ - from contextlib import closing from typing import Iterator from uuid import uuid4 @@ -6,7 +5,6 @@ import psycopg2 from fixtures.zenith_fixtures import PortDistributor, Postgres, ZenithCli, ZenithPageserver, PgBin import pytest - pytest_plugins = ("fixtures.zenith_fixtures") @@ -35,7 +33,9 @@ def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver): ps.safe_psql(f"tenant_create {uuid4().hex}", password=management_token) # fail to create tenant using tenant token - with pytest.raises(psycopg2.DatabaseError, match='Attempt to access management api with tenant scope. Permission denied'): + with pytest.raises( + psycopg2.DatabaseError, + match='Attempt to access management api with tenant scope. Permission denied'): ps.safe_psql(f"tenant_create {uuid4().hex}", password=tenant_token) @@ -60,14 +60,14 @@ def test_compute_auth_to_pageserver( wa_factory.start_n_new(3, management_token) with Postgres( - zenith_cli=zenith_cli, - repo_dir=repo_dir, - pg_bin=pg_bin, - tenant_id=ps.initial_tenant, - port=port_distributor.get_port(), + zenith_cli=zenith_cli, + repo_dir=repo_dir, + pg_bin=pg_bin, + tenant_id=ps.initial_tenant, + port=port_distributor.get_port(), ).create_start( - branch, - wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None, + branch, + wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None, ) as pg: with closing(pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 9189017050..887671bf99 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -1,6 +1,6 @@ import subprocess from fixtures.zenith_fixtures import 
PostgresFactory, ZenithPageserver - +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures") @@ -13,7 +13,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg zenith_cli.run(["branch", "test_branch_behind", "empty"]) pgmain = postgres.create_start('test_branch_behind') - print("postgres is running on 'test_branch_behind' branch") + log.info("postgres is running on 'test_branch_behind' branch") main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() @@ -27,7 +27,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg ''') main_cur.execute('SELECT pg_current_wal_insert_lsn()') lsn_a = main_cur.fetchone()[0] - print('LSN after 100 rows: ' + lsn_a) + log.info(f'LSN after 100 rows: {lsn_a}') # Insert some more rows. (This generates enough WAL to fill a few segments.) main_cur.execute(''' @@ -37,7 +37,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg ''') main_cur.execute('SELECT pg_current_wal_insert_lsn()') lsn_b = main_cur.fetchone()[0] - print('LSN after 200100 rows: ' + lsn_b) + log.info(f'LSN after 200100 rows: {lsn_b}') # Branch at the point where only 100 rows were inserted zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a]) @@ -52,7 +52,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg main_cur.execute('SELECT pg_current_wal_insert_lsn()') lsn_c = main_cur.fetchone()[0] - print('LSN after 400100 rows: ' + lsn_c) + log.info(f'LSN after 400100 rows: {lsn_c}') # Branch at the point where only 200100 rows were inserted zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b]) @@ -86,7 +86,10 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg assert cur.fetchone() == (1, ) # branch at pre-initdb lsn + # + # FIXME: This works currently, but probably shouldn't be allowed try: zenith_cli.run(["branch", 
"test_branch_preinitdb", "test_branch_behind@0/42"]) + # FIXME: assert false, "branch with invalid LSN should have failed" except subprocess.CalledProcessError: - print("Branch creation with pre-initdb LSN failed (as expected)") + log.info("Branch creation with pre-initdb LSN failed (as expected)") diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py index e9233986e4..a70e14d9a9 100644 --- a/test_runner/batch_others/test_clog_truncate.py +++ b/test_runner/batch_others/test_clog_truncate.py @@ -4,6 +4,7 @@ import os from contextlib import closing from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures") @@ -17,14 +18,17 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg # set agressive autovacuum to make sure that truncation will happen config = [ - 'autovacuum_max_workers=10', 'autovacuum_vacuum_threshold=0', - 'autovacuum_vacuum_insert_threshold=0', 'autovacuum_vacuum_cost_delay=0', - 'autovacuum_vacuum_cost_limit=10000', 'autovacuum_naptime =1s', + 'autovacuum_max_workers=10', + 'autovacuum_vacuum_threshold=0', + 'autovacuum_vacuum_insert_threshold=0', + 'autovacuum_vacuum_cost_delay=0', + 'autovacuum_vacuum_cost_limit=10000', + 'autovacuum_naptime =1s', 'autovacuum_freeze_max_age=100000' ] pg = postgres.create_start('test_clog_truncate', config_lines=config) - print('postgres is running on test_clog_truncate branch') + log.info('postgres is running on test_clog_truncate branch') # Install extension containing function needed for test pg.safe_psql('CREATE EXTENSION zenith_test_utils') @@ -33,22 +37,22 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute('select test_consume_xids(1000*1000*10);') - print('xids consumed') + log.info('xids consumed') # call a checkpoint to trigger 
TruncateSubtrans cur.execute('CHECKPOINT;') # ensure WAL flush cur.execute('select txid_current()') - print(cur.fetchone()) + log.info(cur.fetchone()) # wait for autovacuum to truncate the pg_xact # XXX Is it worth to add a timeout here? pg_xact_0000_path = os.path.join(pg.pg_xact_dir_path(), '0000') - print("pg_xact_0000_path = " + pg_xact_0000_path) + log.info(f"pg_xact_0000_path = {pg_xact_0000_path}") while os.path.isfile(pg_xact_0000_path): - print("file exists. wait for truncation. " "pg_xact_0000_path = " + pg_xact_0000_path) + log.info(f"file exists. wait for truncation. pg_xact_0000_path = {pg_xact_0000_path}") time.sleep(5) # checkpoint to advance latest lsn @@ -59,14 +63,14 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg lsn_after_truncation = cur.fetchone()[0] # create new branch after clog truncation and start a compute node on it - print('create branch at lsn_after_truncation ' + lsn_after_truncation) + log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}') zenith_cli.run( ["branch", "test_clog_truncate_new", "test_clog_truncate@" + lsn_after_truncation]) pg2 = postgres.create_start('test_clog_truncate_new') - print('postgres is running on test_clog_truncate_new branch') + log.info('postgres is running on test_clog_truncate_new branch') # check that new node doesn't contain truncated segment pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), '0000') - print("pg_xact_0000_path_new = " + pg_xact_0000_path_new) + log.info(f"pg_xact_0000_path_new = {pg_xact_0000_path_new}") assert os.path.isfile(pg_xact_0000_path_new) is False diff --git a/test_runner/batch_others/test_config.py b/test_runner/batch_others/test_config.py index d8cc798839..d7c59c4e77 100644 --- a/test_runner/batch_others/test_config.py +++ b/test_runner/batch_others/test_config.py @@ -1,6 +1,7 @@ from contextlib import closing from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver +from fixtures.log_helper
import log pytest_plugins = ("fixtures.zenith_fixtures") @@ -14,7 +15,7 @@ def test_config(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFact # change config pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1']) - print('postgres is running on test_config branch') + log.info('postgres is running on test_config branch') with closing(pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py index cbe89a77cb..5fe103496d 100644 --- a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/batch_others/test_createdropdb.py @@ -3,6 +3,7 @@ import pathlib from contextlib import closing from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures") @@ -19,7 +20,7 @@ def test_createdb( zenith_cli.run(["branch", "test_createdb", "empty"]) pg = postgres.create_start('test_createdb') - print("postgres is running on 'test_createdb' branch") + log.info("postgres is running on 'test_createdb' branch") with closing(pg.connect()) as conn: with conn.cursor() as cur: @@ -40,6 +41,7 @@ def test_createdb( for db in (pg, pg2): db.connect(dbname='foodb').close() + # # Test DROP DATABASE # @@ -48,12 +50,12 @@ def test_dropdb( pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, - test_output_dir + test_output_dir, ): zenith_cli.run(["branch", "test_dropdb", "empty"]) pg = postgres.create_start('test_dropdb') - print("postgres is running on 'test_dropdb' branch") + log.info("postgres is running on 'test_dropdb' branch") with closing(pg.connect()) as conn: with conn.cursor() as cur: @@ -65,7 +67,6 @@ def test_dropdb( cur.execute("SELECT oid FROM pg_database WHERE datname='foodb';") dboid = cur.fetchone()[0] - with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute('DROP DATABASE 
foodb') @@ -75,7 +76,6 @@ def test_dropdb( cur.execute('SELECT pg_current_wal_insert_lsn()') lsn_after_drop = cur.fetchone()[0] - # Create two branches before and after database drop. zenith_cli.run(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop]) pg_before = postgres.create_start('test_before_dropdb') @@ -88,13 +88,13 @@ def test_dropdb( # Test that database subdir exists on the branch before drop dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid) - print(dbpath) + log.info(dbpath) assert os.path.isdir(dbpath) == True # Test that database subdir doesn't exist on the branch after drop dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid) - print(dbpath) + log.info(dbpath) assert os.path.isdir(dbpath) == False diff --git a/test_runner/batch_others/test_createuser.py b/test_runner/batch_others/test_createuser.py index f44df91c3c..57cc610f55 100644 --- a/test_runner/batch_others/test_createuser.py +++ b/test_runner/batch_others/test_createuser.py @@ -1,6 +1,7 @@ from contextlib import closing from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures") @@ -12,7 +13,7 @@ def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: Postgres zenith_cli.run(["branch", "test_createuser", "empty"]) pg = postgres.create_start('test_createuser') - print("postgres is running on 'test_createuser' branch") + log.info("postgres is running on 'test_createuser' branch") with closing(pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/batch_others/test_multixact.py b/test_runner/batch_others/test_multixact.py index aaa9e7f58d..78504b95ed 100644 --- a/test_runner/batch_others/test_multixact.py +++ b/test_runner/batch_others/test_multixact.py @@ -1,4 +1,5 @@ from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content +from fixtures.log_helper import log pytest_plugins = 
("fixtures.zenith_fixtures") @@ -9,13 +10,17 @@ pytest_plugins = ("fixtures.zenith_fixtures") # it only checks next_multixact_id field in restored pg_control, # since we don't have functions to check multixact internals. # -def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory, - pg_bin, zenith_cli, base_dir, test_output_dir): +def test_multixact(pageserver: ZenithPageserver, + postgres: PostgresFactory, + pg_bin, + zenith_cli, + base_dir, + test_output_dir): # Create a branch for us zenith_cli.run(["branch", "test_multixact", "empty"]) pg = postgres.create_start('test_multixact') - print("postgres is running on 'test_multixact' branch") + log.info("postgres is running on 'test_multixact' branch") pg_conn = pg.connect() cur = pg_conn.cursor() @@ -55,7 +60,7 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory, zenith_cli.run(["branch", "test_multixact_new", "test_multixact@" + lsn]) pg_new = postgres.create_start('test_multixact_new') - print("postgres is running on 'test_multixact_new' branch") + log.info("postgres is running on 'test_multixact_new' branch") pg_new_conn = pg_new.connect() cur_new = pg_new_conn.cursor() diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index bb28bdd83f..6cc5c01b83 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -1,9 +1,11 @@ from contextlib import closing from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures") + # # Test where Postgres generates a lot of WAL, and it's garbage collected away, but # no pages are evicted so that Postgres uses an old LSN in a GetPage request. 
@@ -14,11 +16,14 @@ pytest_plugins = ("fixtures.zenith_fixtures") # just a hint that the page hasn't been modified since that LSN, and the page # server should return the latest page version regardless of the LSN. # -def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin): +def test_old_request_lsn(zenith_cli, + pageserver: ZenithPageserver, + postgres: PostgresFactory, + pg_bin): # Create a branch for us zenith_cli.run(["branch", "test_old_request_lsn", "empty"]) pg = postgres.create_start('test_old_request_lsn') - print('postgres is running on test_old_request_lsn branch') + log.info('postgres is running on test_old_request_lsn branch') pg_conn = pg.connect() cur = pg_conn.cursor() @@ -46,20 +51,20 @@ def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: Pos from pg_settings where name = 'shared_buffers' ''') row = cur.fetchone() - print(f'shared_buffers is {row[0]}, table size {row[1]}'); + log.info(f'shared_buffers is {row[0]}, table size {row[1]}') assert int(row[0]) < int(row[1]) - cur.execute('VACUUM foo'); + cur.execute('VACUUM foo') # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. for i in range(10): pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") for j in range(100): - cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;'); + cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;') # All (or at least most of) the updates should've been on the same page, so # that we haven't had to evict any dirty pages for a long time. Now run # a query that sends GetPage@LSN requests with the old LSN. 
- cur.execute("SELECT COUNT(*), SUM(val) FROM foo"); + cur.execute("SELECT COUNT(*), SUM(val) FROM foo") assert cur.fetchone() == (100000, 101000) diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 8d0f92a263..95b0172e4c 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -63,7 +63,8 @@ def test_tenant_list_psql(pageserver: ZenithPageserver, zenith_cli): cur = conn.cursor() # check same tenant cannot be created twice - with pytest.raises(psycopg2.DatabaseError, match=f'tenant {pageserver.initial_tenant} already exists'): + with pytest.raises(psycopg2.DatabaseError, + match=f'tenant {pageserver.initial_tenant} already exists'): cur.execute(f'tenant_create {pageserver.initial_tenant}') # create one more tenant @@ -102,5 +103,6 @@ def test_pageserver_http_api_client(pageserver: ZenithPageserver): def test_pageserver_http_api_client_auth_enabled(pageserver_auth_enabled: ZenithPageserver): - client = pageserver_auth_enabled.http_client(auth_token=pageserver_auth_enabled.auth_keys.generate_management_token()) + client = pageserver_auth_enabled.http_client( + auth_token=pageserver_auth_enabled.auth_keys.generate_management_token()) check_client(client, pageserver_auth_enabled.initial_tenant) diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py index 18b17a4efb..5b4943aa27 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/batch_others/test_pageserver_restart.py @@ -5,20 +5,24 @@ import time from contextlib import closing from multiprocessing import Process, Value from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures") + # Check that dead minority doesn't prevent the commits: execute insert n_inserts # times, with fault_probability 
chance of getting a wal acceptor down or up # along the way. 2 of 3 are always alive, so the work keeps going. -def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory): +def test_pageserver_restart(zenith_cli, + pageserver: ZenithPageserver, + postgres: PostgresFactory, + wa_factory: WalAcceptorFactory): # One safekeeper is enough for this test. wa_factory.start_n_new(1) zenith_cli.run(["branch", "test_pageserver_restart", "empty"]) - pg = postgres.create_start('test_pageserver_restart', - wal_acceptors=wa_factory.get_connstrs()) + pg = postgres.create_start('test_pageserver_restart', wal_acceptors=wa_factory.get_connstrs()) pg_conn = pg.connect() cur = pg_conn.cursor() @@ -40,14 +44,14 @@ def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres: from pg_settings where name = 'shared_buffers' ''') row = cur.fetchone() - print("shared_buffers is {}, table size {}", row[0], row[1]); + log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) # Stop and restart pageserver. This is a more or less graceful shutdown, although # the page server doesn't currently have a shutdown routine so there's no difference # between stopping and crashing. - pageserver.stop(); - pageserver.start(); + pageserver.stop() + pageserver.start() # Stopping the pageserver breaks the connection from the postgres backend to # the page server, and causes the next query on the connection to fail. 
Start a new @@ -61,6 +65,5 @@ def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres: assert cur.fetchone() == (100000, ) # Stop the page server by force, and restart it - pageserver.stop(); - pageserver.start(); - + pageserver.stop() + pageserver.start() diff --git a/test_runner/batch_others/test_pgbench.py b/test_runner/batch_others/test_pgbench.py index a5423cf3d7..46633daa34 100644 --- a/test_runner/batch_others/test_pgbench.py +++ b/test_runner/batch_others/test_pgbench.py @@ -1,4 +1,5 @@ from fixtures.zenith_fixtures import PostgresFactory +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures") @@ -8,7 +9,7 @@ def test_pgbench(postgres: PostgresFactory, pg_bin, zenith_cli): zenith_cli.run(["branch", "test_pgbench", "empty"]) pg = postgres.create_start('test_pgbench') - print("postgres is running on 'test_pgbench' branch") + log.info("postgres is running on 'test_pgbench' branch") connstr = pg.connstr() diff --git a/test_runner/batch_others/test_readonly_node.py b/test_runner/batch_others/test_readonly_node.py new file mode 100644 index 0000000000..cc6c11caad --- /dev/null +++ b/test_runner/batch_others/test_readonly_node.py @@ -0,0 +1,89 @@ +import subprocess +from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver + +pytest_plugins = ("fixtures.zenith_fixtures") + + +# +# Create read-only compute nodes, anchored at historical points in time. +# +# This is very similar to the 'test_branch_behind' test, but instead of +# creating branches, creates read-only nodes. 
+# +def test_readonly_node(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin): + zenith_cli.run(["branch", "test_readonly_node", "empty"]) + + pgmain = postgres.create_start('test_readonly_node') + print("postgres is running on 'test_readonly_node' branch") + + main_pg_conn = pgmain.connect() + main_cur = main_pg_conn.cursor() + + # Create table, and insert the first 100 rows + main_cur.execute('CREATE TABLE foo (t text)') + main_cur.execute(''' + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100) g + ''') + main_cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn_a = main_cur.fetchone()[0] + print('LSN after 100 rows: ' + lsn_a) + + # Insert some more rows. (This generates enough WAL to fill a few segments.) + main_cur.execute(''' + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 200000) g + ''') + main_cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn_b = main_cur.fetchone()[0] + print('LSN after 200100 rows: ' + lsn_b) + + # Insert many more rows. This generates enough WAL to fill a few segments. 
+ main_cur.execute(''' + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 200000) g + ''') + + main_cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn_c = main_cur.fetchone()[0] + print('LSN after 400100 rows: ' + lsn_c) + + # Create first read-only node at the point where only 100 rows were inserted + pg_hundred = postgres.create_start("test_readonly_node_hundred", + branch=f'test_readonly_node@{lsn_a}') + + # And another at the point where 200100 rows were inserted + pg_more = postgres.create_start("test_readonly_node_more", branch=f'test_readonly_node@{lsn_b}') + + # On the 'hundred' node, we should see only 100 rows + hundred_pg_conn = pg_hundred.connect() + hundred_cur = hundred_pg_conn.cursor() + hundred_cur.execute('SELECT count(*) FROM foo') + assert hundred_cur.fetchone() == (100, ) + + # On the 'more' node, we should see 100200 rows + more_pg_conn = pg_more.connect() + more_cur = more_pg_conn.cursor() + more_cur.execute('SELECT count(*) FROM foo') + assert more_cur.fetchone() == (200100, ) + + # All the rows are visible on the main branch + main_cur.execute('SELECT count(*) FROM foo') + assert main_cur.fetchone() == (400100, ) + + # Check creating a node at segment boundary + pg = postgres.create_start("test_branch_segment_boundary", + branch="test_readonly_node@0/3000000") + cur = pg.connect().cursor() + cur.execute('SELECT 1') + assert cur.fetchone() == (1, ) + + # Create node at pre-initdb lsn + try: + zenith_cli.run(["pg", "start", "test_branch_preinitdb", "test_readonly_node@0/42"]) + assert false, "compute node startup with invalid LSN should have failed" + except Exception: + print("Node creation with pre-initdb LSN failed (as expected)") diff --git a/test_runner/batch_others/test_restart_compute.py b/test_runner/batch_others/test_restart_compute.py index 193b675e23..5d47d32aac 100644 --- a/test_runner/batch_others/test_restart_compute.py +++ b/test_runner/batch_others/test_restart_compute.py @@ -2,6 
+2,7 @@ import pytest from contextlib import closing from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures") @@ -11,13 +12,13 @@ pytest_plugins = ("fixtures.zenith_fixtures") # @pytest.mark.parametrize('with_wal_acceptors', [False, True]) def test_restart_compute( - zenith_cli, - pageserver: ZenithPageserver, - postgres: PostgresFactory, - pg_bin, - wa_factory, - with_wal_acceptors: bool, - ): + zenith_cli, + pageserver: ZenithPageserver, + postgres: PostgresFactory, + pg_bin, + wa_factory, + with_wal_acceptors: bool, +): wal_acceptor_connstrs = None zenith_cli.run(["branch", "test_restart_compute", "empty"]) @@ -25,9 +26,8 @@ def test_restart_compute( wa_factory.start_n_new(3) wal_acceptor_connstrs = wa_factory.get_connstrs() - pg = postgres.create_start('test_restart_compute', - wal_acceptors=wal_acceptor_connstrs) - print("postgres is running on 'test_restart_compute' branch") + pg = postgres.create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs) + log.info("postgres is running on 'test_restart_compute' branch") with closing(pg.connect()) as conn: with conn.cursor() as cur: @@ -36,12 +36,10 @@ def test_restart_compute( cur.execute('SELECT sum(key) FROM t') r = cur.fetchone() assert r == (5000050000, ) - print("res = ", r) + log.info(f"res = {r}") # Remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute', - wal_acceptors=wal_acceptor_connstrs) - + pg.stop_and_destroy().create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs) with closing(pg.connect()) as conn: with conn.cursor() as cur: @@ -49,7 +47,7 @@ def test_restart_compute( cur.execute('SELECT sum(key) FROM t') r = cur.fetchone() assert r == (5000050000, ) - print("res = ", r) + log.info(f"res = {r}") # Insert another row cur.execute("INSERT INTO t VALUES (100001, 'payload2')") @@ -57,11 +55,10 @@ def test_restart_compute( r = 
cur.fetchone() assert r == (100001, ) - print("res = ", r) + log.info(f"res = {r}") # Again remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute', - wal_acceptors=wal_acceptor_connstrs) + pg.stop_and_destroy().create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs) # That select causes lots of FPI's and increases probability of wakeepers # lagging behind after query completion @@ -72,11 +69,10 @@ def test_restart_compute( r = cur.fetchone() assert r == (100001, ) - print("res = ", r) + log.info(f"res = {r}") # And again remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute', - wal_acceptors=wal_acceptor_connstrs) + pg.stop_and_destroy().create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs) with closing(pg.connect()) as conn: with conn.cursor() as cur: @@ -85,4 +81,4 @@ def test_restart_compute( r = cur.fetchone() assert r == (100001, ) - print("res = ", r) + log.info(f"res = {r}") diff --git a/test_runner/batch_others/test_snapfiles_gc.py b/test_runner/batch_others/test_snapfiles_gc.py index e01bf7f179..a799b34aa6 100644 --- a/test_runner/batch_others/test_snapfiles_gc.py +++ b/test_runner/batch_others/test_snapfiles_gc.py @@ -1,13 +1,19 @@ from contextlib import closing import psycopg2.extras -import time; +import time +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures") + def print_gc_result(row): - print("GC duration {elapsed} ms".format_map(row)); - print(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row)) - print(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: 
{layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row)) + log.info("GC duration {elapsed} ms".format_map(row)) + log.info( + " REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}" + .format_map(row)) + log.info( + " NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}" + .format_map(row)) # @@ -23,7 +29,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin): with closing(pg.connect()) as conn: with conn.cursor() as cur: with closing(pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur: + with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: # Get the timeline ID of our branch. 
We need it for the 'do_gc' command cur.execute("SHOW zenith.zenith_timeline") @@ -33,9 +39,9 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin): cur.execute("CREATE TABLE foo(x integer)") cur.execute("INSERT INTO foo VALUES (1)") - cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass"); - row = cur.fetchone(); - print("relfilenode is {}", row[0]); + cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass") + row = cur.fetchone() + log.info(f"relfilenode is {row[0]}") # Run GC, to clear out any garbage left behind in the catalogs by # the CREATE TABLE command. We want to have a clean slate with no garbage @@ -50,22 +56,23 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin): # update to confuse our numbers either. cur.execute("DELETE FROM foo") - print("Running GC before test") + log.info("Running GC before test") pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") row = pscur.fetchone() - print_gc_result(row); + print_gc_result(row) # remember the number of files - layer_relfiles_remain = row['layer_relfiles_total'] - row['layer_relfiles_removed'] + layer_relfiles_remain = (row['layer_relfiles_total'] - + row['layer_relfiles_removed']) assert layer_relfiles_remain > 0 # Insert a row and run GC. Checkpoint should freeze the layer # so that there is only the most recent image layer left for the rel, # removing the old image and delta layer. - print("Inserting one row and running GC") + log.info("Inserting one row and running GC") cur.execute("INSERT INTO foo VALUES (1)") pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") row = pscur.fetchone() - print_gc_result(row); + print_gc_result(row) assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 assert row['layer_relfiles_removed'] == 2 assert row['layer_relfiles_dropped'] == 0 @@ -73,34 +80,34 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin): # Insert two more rows and run GC. 
# This should create new image and delta layer file with the new contents, and # then remove the old one image and the just-created delta layer. - print("Inserting two more rows and running GC") + log.info("Inserting two more rows and running GC") cur.execute("INSERT INTO foo VALUES (2)") cur.execute("INSERT INTO foo VALUES (3)") pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") row = pscur.fetchone() - print_gc_result(row); + print_gc_result(row) assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 assert row['layer_relfiles_removed'] == 2 assert row['layer_relfiles_dropped'] == 0 # Do it again. Should again create two new layer files and remove old ones. - print("Inserting two more rows and running GC") + log.info("Inserting two more rows and running GC") cur.execute("INSERT INTO foo VALUES (2)") cur.execute("INSERT INTO foo VALUES (3)") pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") row = pscur.fetchone() - print_gc_result(row); + print_gc_result(row) assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 assert row['layer_relfiles_removed'] == 2 assert row['layer_relfiles_dropped'] == 0 # Run GC again, with no changes in the database. Should not remove anything. 
- print("Run GC again, with nothing to do") + log.info("Run GC again, with nothing to do") pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") row = pscur.fetchone() - print_gc_result(row); + print_gc_result(row) assert row['layer_relfiles_total'] == layer_relfiles_remain assert row['layer_relfiles_removed'] == 0 assert row['layer_relfiles_dropped'] == 0 @@ -108,12 +115,12 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin): # # Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage # - print("Drop table and run GC again"); + log.info("Drop table and run GC again") cur.execute("DROP TABLE foo") pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") row = pscur.fetchone() - print_gc_result(row); + print_gc_result(row) # We still cannot remove the latest layers # because they serve as tombstones for earlier layers. diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index ee6bb0bfd3..d646f10666 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -21,18 +21,30 @@ def test_tenants_normal_work( tenant_1 = tenant_factory.create() tenant_2 = tenant_factory.create() - zenith_cli.run(["branch", f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", "main", f"--tenantid={tenant_1}"]) - zenith_cli.run(["branch", f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", "main", f"--tenantid={tenant_2}"]) + zenith_cli.run([ + "branch", + f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", + "main", + f"--tenantid={tenant_1}" + ]) + zenith_cli.run([ + "branch", + f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", + "main", + f"--tenantid={tenant_2}" + ]) if with_wal_acceptors: wa_factory.start_n_new(3) pg_tenant1 = postgres.create_start( f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", + None, # branch name, None means same as node 
name tenant_1, wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None, ) pg_tenant2 = postgres.create_start( f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", + None, # branch name, None means same as node name tenant_2, wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None, ) @@ -45,4 +57,4 @@ def test_tenants_normal_work( cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (5000050000,) + assert cur.fetchone() == (5000050000, ) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 45b0c98d40..819edc26b4 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -2,11 +2,10 @@ from contextlib import closing from uuid import UUID import psycopg2.extras from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver +from fixtures.log_helper import log -def test_timeline_size( - zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin -): +def test_timeline_size(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin): # Branch at the point where only 100 rows were inserted zenith_cli.run(["branch", "test_timeline_size", "empty"]) @@ -15,7 +14,7 @@ def test_timeline_size( assert res["current_logical_size"] == res["current_logical_size_non_incremental"] pgmain = postgres.create_start("test_timeline_size") - print("postgres is running on 'test_timeline_size' branch") + log.info("postgres is running on 'test_timeline_size' branch") with closing(pgmain.connect()) as conn: with conn.cursor() as cur: @@ -23,13 +22,11 @@ def test_timeline_size( # Create table, and insert the first 100 rows cur.execute("CREATE TABLE foo (t text)") - cur.execute( - """ + cur.execute(""" INSERT INTO foo SELECT 'long string to consume 
some space' || g FROM generate_series(1, 10) g - """ - ) + """) res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size") assert res["current_logical_size"] == res["current_logical_size_non_incremental"] diff --git a/test_runner/batch_others/test_twophase.py b/test_runner/batch_others/test_twophase.py index d818f04da4..bc6ee076c1 100644 --- a/test_runner/batch_others/test_twophase.py +++ b/test_runner/batch_others/test_twophase.py @@ -1,7 +1,7 @@ import os from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, PgBin - +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures") @@ -9,11 +9,14 @@ pytest_plugins = ("fixtures.zenith_fixtures") # # Test branching, when a transaction is in prepared state # -def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin: PgBin): +def test_twophase(zenith_cli, + pageserver: ZenithPageserver, + postgres: PostgresFactory, + pg_bin: PgBin): zenith_cli.run(["branch", "test_twophase", "empty"]) pg = postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5']) - print("postgres is running on 'test_twophase' branch") + log.info("postgres is running on 'test_twophase' branch") conn = pg.connect() cur = conn.cursor() @@ -45,7 +48,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa cur.execute('CHECKPOINT') twophase_files = os.listdir(pg.pg_twophase_dir_path()) - print(twophase_files) + log.info(twophase_files) assert len(twophase_files) == 4 cur.execute("COMMIT PREPARED 'insert_three'") @@ -53,7 +56,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa cur.execute('CHECKPOINT') twophase_files = os.listdir(pg.pg_twophase_dir_path()) - print(twophase_files) + log.info(twophase_files) assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state @@ -67,7 +70,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, 
postgres: PostgresFa # Check that we restored only needed twophase files twophase_files2 = os.listdir(pg2.pg_twophase_dir_path()) - print(twophase_files2) + log.info(twophase_files2) assert twophase_files2.sort() == twophase_files.sort() conn2 = pg2.connect() @@ -79,8 +82,8 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa cur2.execute("ROLLBACK PREPARED 'insert_two'") cur2.execute('SELECT * FROM foo') - assert cur2.fetchall() == [('one',), ('three',)] + assert cur2.fetchall() == [('one', ), ('three', )] # Only one committed insert is visible on the original branch cur.execute('SELECT * FROM foo') - assert cur.fetchall() == [('three',)] + assert cur.fetchall() == [('three', )] diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/batch_others/test_vm_bits.py index 92509fcbbb..6f19940f2f 100644 --- a/test_runner/batch_others/test_vm_bits.py +++ b/test_runner/batch_others/test_vm_bits.py @@ -1,17 +1,23 @@ from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures") + # # Test that the VM bit is cleared correctly at a HEAP_DELETE and # HEAP_UPDATE record. 
# -def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, base_dir): +def test_vm_bit_clear(pageserver: ZenithPageserver, + postgres: PostgresFactory, + pg_bin, + zenith_cli, + base_dir): # Create a branch for us zenith_cli.run(["branch", "test_vm_bit_clear", "empty"]) pg = postgres.create_start('test_vm_bit_clear') - print("postgres is running on 'test_vm_bit_clear' branch") + log.info("postgres is running on 'test_vm_bit_clear' branch") pg_conn = pg.connect() cur = pg_conn.cursor() @@ -48,13 +54,12 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p ''') cur.execute('SELECT * FROM vmtest_delete WHERE id = 1') - assert(cur.fetchall() == []); + assert (cur.fetchall() == []) cur.execute('SELECT * FROM vmtest_update WHERE id = 1') - assert(cur.fetchall() == []); + assert (cur.fetchall() == []) cur.close() - # Check the same thing on the branch that we created right after the DELETE # # As of this writing, the code in smgrwrite() creates a full-page image whenever @@ -63,7 +68,7 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p # server at the right point-in-time avoids that full-page image. 
pg_new = postgres.create_start('test_vm_bit_clear_new') - print("postgres is running on 'test_vm_bit_clear_new' branch") + log.info("postgres is running on 'test_vm_bit_clear_new' branch") pg_new_conn = pg_new.connect() cur_new = pg_new_conn.cursor() @@ -74,6 +79,6 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p ''') cur_new.execute('SELECT * FROM vmtest_delete WHERE id = 1') - assert(cur_new.fetchall() == []); + assert (cur_new.fetchall() == []) cur_new.execute('SELECT * FROM vmtest_update WHERE id = 1') - assert(cur_new.fetchall() == []); + assert (cur_new.fetchall() == []) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index b5577f28d0..3eaadc78a6 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -9,13 +9,17 @@ from contextlib import closing from multiprocessing import Process, Value from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory, PgBin from fixtures.utils import lsn_to_hex, mkdir_if_needed +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures") # basic test, write something in setup with wal acceptors, ensure that commits # succeed and data is written -def test_normal_work(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory): +def test_normal_work(zenith_cli, + pageserver: ZenithPageserver, + postgres: PostgresFactory, + wa_factory): zenith_cli.run(["branch", "test_wal_acceptors_normal_work", "empty"]) wa_factory.start_n_new(3) pg = postgres.create_start('test_wal_acceptors_normal_work', @@ -33,7 +37,10 @@ def test_normal_work(zenith_cli, pageserver: ZenithPageserver, postgres: Postgre # Run page server and multiple acceptors, and multiple compute nodes running # against different timelines. 
-def test_many_timelines(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory): +def test_many_timelines(zenith_cli, + pageserver: ZenithPageserver, + postgres: PostgresFactory, + wa_factory): n_timelines = 2 wa_factory.start_n_new(3) @@ -65,7 +72,10 @@ def test_many_timelines(zenith_cli, pageserver: ZenithPageserver, postgres: Post # Check that dead minority doesn't prevent the commits: execute insert n_inserts # times, with fault_probability chance of getting a wal acceptor down or up # along the way. 2 of 3 are always alive, so the work keeps going. -def test_restarts(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory): +def test_restarts(zenith_cli, + pageserver: ZenithPageserver, + postgres: PostgresFactory, + wa_factory: WalAcceptorFactory): fault_probability = 0.01 n_inserts = 1000 n_acceptors = 3 @@ -176,7 +186,11 @@ def stop_value(): # do inserts while concurrently getting up/down subsets of acceptors -def test_race_conditions(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory, stop_value): +def test_race_conditions(zenith_cli, + pageserver: ZenithPageserver, + postgres: PostgresFactory, + wa_factory, + stop_value): wa_factory.start_n_new(3) @@ -203,6 +217,7 @@ def test_race_conditions(zenith_cli, pageserver: ZenithPageserver, postgres: Pos stop_value.value = 1 proc.join() + class ProposerPostgres: """Object for running safekeepers sync with walproposer""" def __init__(self, pgdata_dir: str, pg_bin: PgBin, timeline_id: str, tenant_id: str): @@ -284,10 +299,37 @@ def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorF ) lsn_hex = lsn_to_hex(res["inserted_wal"]["end_lsn"]) lsn_after_append.append(lsn_hex) - print(f"safekeeper[{i}] lsn after append: {lsn_hex}") + log.info(f"safekeeper[{i}] lsn after append: {lsn_hex}") # run sync safekeepers lsn_after_sync = pg.sync_safekeepers() - print(f"lsn after sync = {lsn_after_sync}") + 
log.info(f"lsn after sync = {lsn_after_sync}") assert all(lsn_after_sync == lsn for lsn in lsn_after_append) + + +def test_timeline_status(zenith_cli, pageserver, postgres, wa_factory: WalAcceptorFactory): + wa_factory.start_n_new(1) + + zenith_cli.run(["branch", "test_timeline_status", "empty"]) + pg = postgres.create_start('test_timeline_status', wal_acceptors=wa_factory.get_connstrs()) + + wa = wa_factory.instances[0] + wa_http_cli = wa.http_client() + wa_http_cli.check_status() + + # learn zenith timeline from compute + tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] + timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + + # fetch something sensible from status + epoch = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch + + pg.safe_psql("create table t(i int)") + + # ensure epoch goes up after reboot + pg.stop().start() + pg.safe_psql("insert into t values(10)") + + epoch_after_reboot = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch + assert epoch_after_reboot > epoch diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index b1647a8544..a5d4191375 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -3,9 +3,10 @@ import asyncpg import random from fixtures.zenith_fixtures import WalAcceptor, WalAcceptorFactory, ZenithPageserver, PostgresFactory, Postgres +from fixtures.log_helper import getLogger from typing import List -from fixtures.utils import debug_print +log = getLogger('root.wal_acceptor_async') pytest_plugins = ("fixtures.zenith_fixtures") @@ -18,13 +19,16 @@ class BankClient(object): async def initdb(self): await self.conn.execute('DROP TABLE IF EXISTS bank_accs') await self.conn.execute('CREATE TABLE bank_accs(uid int primary key, amount int)') - await self.conn.execute(''' + await self.conn.execute( + ''' INSERT INTO bank_accs SELECT *, $1 FROM 
generate_series(0, $2) - ''', self.init_amount, self.n_accounts - 1) + ''', + self.init_amount, + self.n_accounts - 1) await self.conn.execute('DROP TABLE IF EXISTS bank_log') await self.conn.execute('CREATE TABLE bank_log(from_uid int, to_uid int, amount int)') - + # TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed await self.conn.execute('ALTER TABLE bank_accs SET (autovacuum_enabled = false)') await self.conn.execute('ALTER TABLE bank_log SET (autovacuum_enabled = false)') @@ -33,6 +37,7 @@ class BankClient(object): row = await self.conn.fetchrow('SELECT sum(amount) AS sum FROM bank_accs') assert row['sum'] == self.n_accounts * self.init_amount + async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount): # avoid deadlocks by sorting uids if from_uid > to_uid: @@ -41,16 +46,22 @@ async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount): async with conn.transaction(): await conn.execute( 'UPDATE bank_accs SET amount = amount + ($1) WHERE uid = $2', - amount, to_uid, + amount, + to_uid, ) await conn.execute( 'UPDATE bank_accs SET amount = amount - ($1) WHERE uid = $2', - amount, from_uid, + amount, + from_uid, ) - await conn.execute('INSERT INTO bank_log VALUES ($1, $2, $3)', - from_uid, to_uid, amount, + await conn.execute( + 'INSERT INTO bank_log VALUES ($1, $2, $3)', + from_uid, + to_uid, + amount, ) + class WorkerStats(object): def __init__(self, n_workers): self.counters = [0] * n_workers @@ -63,18 +74,18 @@ class WorkerStats(object): self.counters[worker_id] += 1 def check_progress(self): - debug_print("Workers progress: {}".format(self.counters)) + log.debug("Workers progress: {}".format(self.counters)) # every worker should finish at least one tx assert all(cnt > 0 for cnt in self.counters) progress = sum(self.counters) - print('All workers made {} transactions'.format(progress)) + log.info('All workers made {} transactions'.format(progress)) async def run_random_worker(stats: WorkerStats, pg: 
Postgres, worker_id, n_accounts, max_transfer): pg_conn = await pg.connect_async() - debug_print('Started worker {}'.format(worker_id)) + log.debug('Started worker {}'.format(worker_id)) while stats.running: from_uid = random.randint(0, n_accounts - 1) @@ -84,9 +95,9 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou await bank_transfer(pg_conn, from_uid, to_uid, amount) stats.inc_progress(worker_id) - debug_print('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid)) + log.debug('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid)) - debug_print('Finished worker {}'.format(worker_id)) + log.debug('Finished worker {}'.format(worker_id)) await pg_conn.close() @@ -113,7 +124,6 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_ worker = run_random_worker(stats, pg, worker_id, bank.n_accounts, max_transfer) workers.append(asyncio.create_task(worker)) - for it in range(iterations): victim = acceptors[it % len(acceptors)] victim.stop() @@ -121,10 +131,7 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_ # Wait till previous victim recovers so it is ready for the next # iteration by making any writing xact. 
conn = await pg.connect_async() - await conn.execute( - 'UPDATE bank_accs SET amount = amount WHERE uid = 1', - timeout=120 - ) + await conn.execute('UPDATE bank_accs SET amount = amount WHERE uid = 1', timeout=120) await conn.close() stats.reset() @@ -134,7 +141,7 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_ victim.start() - print('Iterations are finished, exiting coroutines...') + log.info('Iterations are finished, exiting coroutines...') stats.running = False # await all workers await asyncio.gather(*workers) @@ -144,7 +151,9 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_ # restart acceptors one by one, while executing and validating bank transactions -def test_restarts_under_load(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, +def test_restarts_under_load(zenith_cli, + pageserver: ZenithPageserver, + postgres: PostgresFactory, wa_factory: WalAcceptorFactory): wa_factory.start_n_new(3) diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py index be9e2b07fd..7379cf2981 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_zenith_cli.py @@ -23,8 +23,11 @@ def helper_compare_branch_list(page_server_cur, zenith_cli, initial_tenant: str) res = zenith_cli.run(["branch", f"--tenantid={initial_tenant}"]) res.check_returncode() - branches_cli_with_tenant_arg = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) - branches_cli_with_tenant_arg = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')] + branches_cli_with_tenant_arg = sorted( + map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) + branches_cli_with_tenant_arg = [ + b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main') + ] assert branches_api == branches_cli == branches_cli_with_tenant_arg @@ -54,6 +57,7 @@ def 
test_cli_branch_list(pageserver: ZenithPageserver, zenith_cli): assert 'test_cli_branch_list_main' in branches_cli assert 'test_cli_branch_list_nested' in branches_cli + def helper_compare_tenant_list(page_server_cur, zenith_cli: ZenithCli): page_server_cur.execute(f'tenant_list') tenants_api = sorted(json.loads(page_server_cur.fetchone()[0])) diff --git a/test_runner/batch_pg_regress/test_isolation.py b/test_runner/batch_pg_regress/test_isolation.py index ae654401cc..0f215337be 100644 --- a/test_runner/batch_pg_regress/test_isolation.py +++ b/test_runner/batch_pg_regress/test_isolation.py @@ -6,8 +6,14 @@ from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory pytest_plugins = ("fixtures.zenith_fixtures") -def test_isolation(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir, - base_dir, capsys): +def test_isolation(pageserver: ZenithPageserver, + postgres: PostgresFactory, + pg_bin, + zenith_cli, + test_output_dir, + pg_distrib_dir, + base_dir, + capsys): # Create a branch for us zenith_cli.run(["branch", "test_isolation", "empty"]) diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py index 6f61b77ebc..2fd7fee314 100644 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ b/test_runner/batch_pg_regress/test_pg_regress.py @@ -6,8 +6,14 @@ from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_re pytest_plugins = ("fixtures.zenith_fixtures") -def test_pg_regress(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir, - base_dir, capsys): +def test_pg_regress(pageserver: ZenithPageserver, + postgres: PostgresFactory, + pg_bin, + zenith_cli, + test_output_dir, + pg_distrib_dir, + base_dir, + capsys): # Create a branch for us zenith_cli.run(["branch", "test_pg_regress", "empty"]) diff --git a/test_runner/batch_pg_regress/test_zenith_regress.py 
b/test_runner/batch_pg_regress/test_zenith_regress.py index 09f5f83933..ca1422388e 100644 --- a/test_runner/batch_pg_regress/test_zenith_regress.py +++ b/test_runner/batch_pg_regress/test_zenith_regress.py @@ -2,12 +2,19 @@ import os from fixtures.utils import mkdir_if_needed from fixtures.zenith_fixtures import PageserverPort, PostgresFactory, check_restored_datadir_content +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures") -def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir, - base_dir, capsys, pageserver_port: PageserverPort): +def test_zenith_regress(postgres: PostgresFactory, + pg_bin, + zenith_cli, + test_output_dir, + pg_distrib_dir, + base_dir, + capsys, + pageserver_port: PageserverPort): # Create a branch for us zenith_cli.run(["branch", "test_zenith_regress", "empty"]) @@ -38,7 +45,7 @@ def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_outp '--inputdir={}'.format(src_path), ] - print(pg_regress_command) + log.info(pg_regress_command) env = { 'PGPORT': str(pg.port), 'PGUSER': pg.username, diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 328ebcc1f8..f41d66674d 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -1,5 +1,3 @@ -from pprint import pprint - import os import re import timeit @@ -26,7 +24,6 @@ from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast from typing_extensions import Literal from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture) - """ This file contains fixtures for micro-benchmarks. @@ -57,7 +54,6 @@ in the test initialization, or measure disk usage after the test query. 
""" - # All the results are collected in this list, as a tuple: # (test_name: str, metric_name: str, metric_value: float, unit: str) # @@ -67,6 +63,7 @@ in the test initialization, or measure disk usage after the test query. global zenbenchmark_results zenbenchmark_results = [] + class ZenithBenchmarkResults: """ An object for recording benchmark results. """ def __init__(self): @@ -79,6 +76,7 @@ class ZenithBenchmarkResults: self.results.append((test_name, metric_name, metric_value, unit)) + # Session scope fixture that initializes the results object @pytest.fixture(autouse=True, scope='session') def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]: @@ -90,6 +88,7 @@ def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]: yield zenbenchmark_results + class ZenithBenchmarker: """ An object for recording benchmark results. This is created for each test @@ -105,7 +104,6 @@ class ZenithBenchmarker: """ self.results.record(self.request.node.name, metric_name, metric_value, unit) - @contextmanager def record_duration(self, metric_name): """ @@ -136,7 +134,8 @@ class ZenithBenchmarker: # The metric should be an integer, as it's a number of bytes. But in general # all prometheus metrics are floats. So to be pedantic, read it as a float # and round to integer. 
- matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$', all_metrics, + matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$', + all_metrics, re.MULTILINE) return int(round(float(matches.group(1)))) @@ -147,8 +146,7 @@ class ZenithBenchmarker: # Fetch all the exposed prometheus metrics from page server all_metrics = pageserver.http_client().get_metrics() # See comment in get_io_writes() - matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, - re.MULTILINE) + matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, re.MULTILINE) return int(round(float(matches.group(1)))) def get_timeline_size(self, repo_dir: str, tenantid: str, timelineid: str): @@ -173,7 +171,11 @@ class ZenithBenchmarker: yield after = self.get_io_writes(pageserver) - self.results.record(self.request.node.name, metric_name, round((after - before) / (1024 * 1024)), 'MB') + self.results.record(self.request.node.name, + metric_name, + round((after - before) / (1024 * 1024)), + 'MB') + @pytest.fixture(scope='function') def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]: @@ -187,9 +189,7 @@ def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]: # Hook to print the results at the end @pytest.hookimpl(hookwrapper=True) -def pytest_terminal_summary( - terminalreporter: TerminalReporter, exitstatus: int, config: Config -): +def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, config: Config): yield global zenbenchmark_results diff --git a/test_runner/fixtures/log_helper.py b/test_runner/fixtures/log_helper.py new file mode 100644 index 0000000000..cab7462a51 --- /dev/null +++ b/test_runner/fixtures/log_helper.py @@ -0,0 +1,45 @@ +import logging +import logging.config +""" +This file configures logging to use in python tests. +Logs are automatically captured and shown in their +own section after all tests are executed. 
+ +To see logs for all (even successful) tests, run +pytest with the following command: +- `pipenv run pytest -n8 -rA` + +Other log config can be set in pytest.ini file. +You can add `log_cli = true` to it to watch +logs in real time. + +To get more info about logging with pytest, see +https://docs.pytest.org/en/6.2.x/logging.html +""" + +# this config is only used for default log levels, +# log format is specified in pytest.ini file +LOGGING = { + "version": 1, + "loggers": { + "root": { + "level": "INFO" + }, + "root.wal_acceptor_async": { + "level": "INFO" # a lot of logs on DEBUG level + } + } +} + + +def getLogger(name='root') -> logging.Logger: + """Method to get logger for tests. + + Should be used to get correctly initialized logger. """ + return logging.getLogger(name) + + +# default logger for tests +log = getLogger() + +logging.config.dictConfig(LOGGING) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 92bd25ed24..dbb1809a2b 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -2,6 +2,7 @@ import os import subprocess from typing import Any, List +from fixtures.log_helper import log def get_self_dir() -> str: @@ -39,7 +40,7 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: with open(stdout_filename, 'w') as stdout_f: with open(stderr_filename, 'w') as stderr_f: - print('(capturing output to "{}.stdout")'.format(base)) + log.info('(capturing output to "{}.stdout")'.format(base)) subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) return basepath @@ -58,13 +59,6 @@ def global_counter() -> int: _global_counter += 1 return _global_counter -def debug_print(*args, **kwargs) -> None: - """ Print to the console if TEST_DEBUG_PRINT is set in env. - - All parameters are passed to print(). - """ - if os.environ.get('TEST_DEBUG_PRINT') is not None: - print(*args, **kwargs) def lsn_to_hex(num: int) -> str: """ Convert lsn from int to standard hex notation. 
""" diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index d29d278cdd..868f14ab29 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from functools import cached_property +from cached_property import cached_property import asyncpg import os import pathlib @@ -13,9 +13,8 @@ import signal import subprocess import time import filecmp -import difflib -from contextlib import closing +from contextlib import closing, suppress from pathlib import Path from dataclasses import dataclass @@ -27,6 +26,7 @@ from typing_extensions import Literal import requests from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture) +from fixtures.log_helper import log """ This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. @@ -54,17 +54,18 @@ DEFAULT_POSTGRES_DIR = 'tmp_install' BASE_PORT = 15000 WORKER_PORT_NUM = 100 + def pytest_configure(config): """ Ensure that no unwanted daemons are running before we start testing. Check that we do not owerflow available ports range. """ numprocesses = config.getoption('numprocesses') - if numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768: # do not use ephemeral ports - raise Exception('Too many workers configured. Cannot distrubute ports for services.') + if numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768: # do not use ephemeral ports + raise Exception('Too many workers configured. Cannot distrubute ports for services.') # does not use -c as it is not supported on macOS - cmd = ['pgrep', 'pageserver|postgres|wal_acceptor'] + cmd = ['pgrep', 'pageserver|postgres|safekeeper'] result = subprocess.run(cmd, stdout=subprocess.DEVNULL) if result.returncode == 0: # returncode of 0 means it found something. 
@@ -72,7 +73,7 @@ def pytest_configure(config): # result of the test. # NOTE this shows as an internal pytest error, there might be a better way raise Exception( - 'Found interfering processes running. Stop all Zenith pageservers, nodes, WALs, as well as stand-alone Postgres.' + 'Found interfering processes running. Stop all Zenith pageservers, nodes, safekeepers, as well as stand-alone Postgres.' ) @@ -105,7 +106,11 @@ class PgProtocol: self.port = port self.username = username or "zenith_admin" - def connstr(self, *, dbname: str = 'postgres', username: Optional[str] = None, password: Optional[str] = None) -> str: + def connstr(self, + *, + dbname: str = 'postgres', + username: Optional[str] = None, + password: Optional[str] = None) -> str: """ Build a libpq connection string for the Postgres instance. """ @@ -117,7 +122,12 @@ class PgProtocol: return f'{res} password={password}' # autocommit=True here by default because that's what we need most of the time - def connect(self, *, autocommit=True, dbname: str = 'postgres', username: Optional[str] = None, password: Optional[str] = None) -> PgConnection: + def connect(self, + *, + autocommit=True, + dbname: str = 'postgres', + username: Optional[str] = None, + password: Optional[str] = None) -> PgConnection: """ Connect to the node. Returns psycopg2's connection object. @@ -133,7 +143,11 @@ class PgProtocol: conn.autocommit = autocommit return conn - async def connect_async(self, *, dbname: str = 'postgres', username: Optional[str] = None, password: Optional[str] = None) -> asyncpg.Connection: + async def connect_async(self, + *, + dbname: str = 'postgres', + username: Optional[str] = None, + password: Optional[str] = None) -> asyncpg.Connection: """ Connect to the node from async python. Returns asyncpg's connection object. @@ -188,22 +202,22 @@ class ZenithCli: >>> result = zenith_cli.run(...) 
>>> assert result.stderr == "" - >>> print(result.stdout) + >>> log.info(result.stdout) """ assert type(arguments) == list args = [self.bin_zenith] + arguments - print('Running command "{}"'.format(' '.join(args))) + log.info('Running command "{}"'.format(' '.join(args))) # Interceipt CalledProcessError and print more info try: res = subprocess.run(args, - env=self.env, - check=True, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + env=self.env, + check=True, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) except subprocess.CalledProcessError as exc: # this way command output will be in recorded and shown in CI in failure message msg = f"""\ @@ -211,7 +225,7 @@ class ZenithCli: stdout: {exc.stdout} stderr: {exc.stderr} """ - print(msg) + log.info(msg) raise Exception(msg) from exc @@ -241,21 +255,17 @@ class ZenithPageserverHttpClient(requests.Session): return res.json() def branch_create(self, tenant_id: uuid.UUID, name: str, start_point: str) -> Dict: - res = self.post( - f"http://localhost:{self.port}/v1/branch", - json={ - 'tenant_id': tenant_id.hex, - 'name': name, - 'start_point': start_point, - } - ) + res = self.post(f"http://localhost:{self.port}/v1/branch", + json={ + 'tenant_id': tenant_id.hex, + 'name': name, + 'start_point': start_point, + }) res.raise_for_status() return res.json() def branch_detail(self, tenant_id: uuid.UUID, name: str) -> Dict: - res = self.get( - f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}/{name}", - ) + res = self.get(f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}/{name}", ) res.raise_for_status() return res.json() @@ -297,7 +307,11 @@ class AuthKeys: return token def generate_tenant_token(self, tenant_id): - token = jwt.encode({"scope": "tenant", "tenant_id": tenant_id}, self.priv, algorithm="RS256") + token = jwt.encode({ + "scope": "tenant", "tenant_id": tenant_id + }, + self.priv, + algorithm="RS256") if isinstance(token, bytes): token = 
token.decode() @@ -322,6 +336,7 @@ def worker_base_port(worker_seq_no: int): # so workers have disjoint set of ports for services return BASE_PORT + worker_seq_no * WORKER_PORT_NUM + class PortDistributor: def __init__(self, base_port: int, port_number: int) -> None: self.iterator = iter(range(base_port, base_port + port_number)) @@ -330,13 +345,15 @@ class PortDistributor: try: return next(self.iterator) except StopIteration: - raise RuntimeError('port range configured for test is exhausted, consider enlarging the range') + raise RuntimeError( + 'port range configured for test is exhausted, consider enlarging the range') @zenfixture def port_distributor(worker_base_port): return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) + @dataclass class PageserverPort: pg: int @@ -351,14 +368,18 @@ class ZenithPageserver(PgProtocol): self.running = False self.initial_tenant = None self.repo_dir = repo_dir - self.service_port = port # do not shadow PgProtocol.port which is just int + self.service_port = port # do not shadow PgProtocol.port which is just int def init(self, enable_auth: bool = False) -> 'ZenithPageserver': """ Initialize the repository, i.e. run "zenith init". Returns self. """ - cmd = ['init', f'--pageserver-pg-port={self.service_port.pg}', f'--pageserver-http-port={self.service_port.http}'] + cmd = [ + 'init', + f'--pageserver-pg-port={self.service_port.pg}', + f'--pageserver-http-port={self.service_port.http}' + ] if enable_auth: cmd.append('--enable-auth') self.zenith_cli.run(cmd) @@ -375,6 +396,7 @@ class ZenithPageserver(PgProtocol): Start the page server. Returns self. """ + assert self.running == False self.zenith_cli.run(['start']) self.running = True @@ -382,14 +404,18 @@ class ZenithPageserver(PgProtocol): self.initial_tenant = self.zenith_cli.run(['tenant', 'list']).stdout.strip() return self - def stop(self) -> 'ZenithPageserver': + def stop(self, immediate=False) -> 'ZenithPageserver': """ Stop the page server. 
Returns self. """ + cmd = ['stop'] + if immediate: + cmd.append('immediate') + log.info(f"Stopping pageserver with {cmd}") if self.running: - self.zenith_cli.run(['stop']) + self.zenith_cli.run(cmd) self.running = False return self @@ -398,7 +424,7 @@ class ZenithPageserver(PgProtocol): return self def __exit__(self, exc_type, exc, tb): - self.stop() + self.stop(True) @cached_property def auth_keys(self) -> AuthKeys: @@ -413,18 +439,17 @@ class ZenithPageserver(PgProtocol): ) - - @zenfixture def pageserver_port(port_distributor: PortDistributor) -> PageserverPort: pg = port_distributor.get_port() http = port_distributor.get_port() - print(f"pageserver_port: pg={pg} http={http}") + log.info(f"pageserver_port: pg={pg} http={http}") return PageserverPort(pg=pg, http=http) @zenfixture -def pageserver(zenith_cli: ZenithCli, repo_dir: str, pageserver_port: PageserverPort) -> Iterator[ZenithPageserver]: +def pageserver(zenith_cli: ZenithCli, repo_dir: str, + pageserver_port: PageserverPort) -> Iterator[ZenithPageserver]: """ The 'pageserver' fixture provides a Page Server that's up and running. @@ -436,15 +461,17 @@ def pageserver(zenith_cli: ZenithCli, repo_dir: str, pageserver_port: Pageserver By convention, the test branches are named after the tests. For example, test called 'test_foo' would create and use branches with the 'test_foo' prefix. """ - ps = ZenithPageserver(zenith_cli=zenith_cli, repo_dir=repo_dir, port=pageserver_port).init().start() + ps = ZenithPageserver(zenith_cli=zenith_cli, repo_dir=repo_dir, + port=pageserver_port).init().start() # For convenience in tests, create a branch from the freshly-initialized cluster. zenith_cli.run(["branch", "empty", "main"]) yield ps # After the yield comes any cleanup code we need. 
- print('Starting pageserver cleanup') - ps.stop() + log.info('Starting pageserver cleanup') + ps.stop(True) + class PgBin: """ A helper class for executing postgres binaries """ @@ -481,7 +508,7 @@ class PgBin: """ self._fixpath(command) - print('Running command "{}"'.format(' '.join(command))) + log.info('Running command "{}"'.format(' '.join(command))) env = self._build_env(env) subprocess.run(command, env=env, cwd=cwd, check=True) @@ -498,7 +525,7 @@ class PgBin: """ self._fixpath(command) - print('Running command "{}"'.format(' '.join(command))) + log.info('Running command "{}"'.format(' '.join(command))) env = self._build_env(env) return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs) @@ -507,9 +534,11 @@ class PgBin: def pg_bin(test_output_dir: str, pg_distrib_dir: str) -> PgBin: return PgBin(test_output_dir, pg_distrib_dir) + @pytest.fixture def pageserver_auth_enabled(zenith_cli: ZenithCli, repo_dir: str, pageserver_port: PageserverPort): - with ZenithPageserver(zenith_cli=zenith_cli, repo_dir=repo_dir, port=pageserver_port).init(enable_auth=True).start() as ps: + with ZenithPageserver(zenith_cli=zenith_cli, repo_dir=repo_dir, + port=pageserver_port).init(enable_auth=True).start() as ps: # For convenience in tests, create a branch from the freshly-initialized cluster. zenith_cli.run(["branch", "empty", "main"]) yield ps @@ -517,21 +546,27 @@ def pageserver_auth_enabled(zenith_cli: ZenithCli, repo_dir: str, pageserver_por class Postgres(PgProtocol): """ An object representing a running postgres daemon. 
""" - def __init__(self, zenith_cli: ZenithCli, repo_dir: str, pg_bin: PgBin, tenant_id: str, port: int): + def __init__(self, + zenith_cli: ZenithCli, + repo_dir: str, + pg_bin: PgBin, + tenant_id: str, + port: int): super().__init__(host='localhost', port=port) self.zenith_cli = zenith_cli self.running = False self.repo_dir = repo_dir - self.branch: Optional[str] = None # dubious, see asserts below - self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA + self.node_name: Optional[str] = None # dubious, see asserts below + self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA self.tenant_id = tenant_id self.pg_bin = pg_bin - # path to conf is /pgdatadirs/tenants///postgresql.conf + # path to conf is /pgdatadirs/tenants///postgresql.conf def create( self, - branch: str, + node_name: str, + branch: Optional[str] = None, wal_acceptors: Optional[str] = None, config_lines: Optional[List[str]] = None, ) -> 'Postgres': @@ -545,9 +580,19 @@ class Postgres(PgProtocol): if not config_lines: config_lines = [] - self.zenith_cli.run(['pg', 'create', branch, f'--tenantid={self.tenant_id}', f'--port={self.port}']) - self.branch = branch - path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.branch + if branch is None: + branch = node_name + + self.zenith_cli.run([ + 'pg', + 'create', + f'--tenantid={self.tenant_id}', + f'--port={self.port}', + node_name, + branch + ]) + self.node_name = node_name + path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.node_name self.pgdata_dir = os.path.join(self.repo_dir, path) if wal_acceptors is not None: @@ -564,20 +609,21 @@ class Postgres(PgProtocol): Returns self. 
""" - assert self.branch is not None + assert self.node_name is not None - print(f"Starting postgres on branch {self.branch}") + log.info(f"Starting postgres node {self.node_name}") - run_result = self.zenith_cli.run(['pg', 'start', self.branch, f'--tenantid={self.tenant_id}', f'--port={self.port}']) + run_result = self.zenith_cli.run( + ['pg', 'start', f'--tenantid={self.tenant_id}', f'--port={self.port}', self.node_name]) self.running = True - print(f"stdout: {run_result.stdout}") + log.info(f"stdout: {run_result.stdout}") return self def pg_data_dir_path(self) -> str: """ Path to data directory """ - path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.branch + path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.node_name return os.path.join(self.repo_dir, path) def pg_xact_dir_path(self) -> str: @@ -634,8 +680,8 @@ class Postgres(PgProtocol): """ if self.running: - assert self.branch is not None - self.zenith_cli.run(['pg', 'stop', self.branch, f'--tenantid={self.tenant_id}']) + assert self.node_name is not None + self.zenith_cli.run(['pg', 'stop', self.node_name, f'--tenantid={self.tenant_id}']) self.running = False return self @@ -646,15 +692,17 @@ class Postgres(PgProtocol): Returns self. 
""" - assert self.branch is not None + assert self.node_name is not None assert self.tenant_id is not None - self.zenith_cli.run(['pg', 'stop', '--destroy', self.branch, f'--tenantid={self.tenant_id}']) + self.zenith_cli.run( + ['pg', 'stop', '--destroy', self.node_name, f'--tenantid={self.tenant_id}']) return self def create_start( self, - branch: str, + node_name: str, + branch: Optional[str] = None, wal_acceptors: Optional[str] = None, config_lines: Optional[List[str]] = None, ) -> 'Postgres': @@ -665,6 +713,7 @@ class Postgres(PgProtocol): """ self.create( + node_name=node_name, branch=branch, wal_acceptors=wal_acceptors, config_lines=config_lines, @@ -678,9 +727,15 @@ class Postgres(PgProtocol): def __exit__(self, exc_type, exc, tb): self.stop() + class PostgresFactory: """ An object representing multiple running postgres daemons. """ - def __init__(self, zenith_cli: ZenithCli, repo_dir: str, pg_bin: PgBin, initial_tenant: str, port_distributor: PortDistributor): + def __init__(self, + zenith_cli: ZenithCli, + repo_dir: str, + pg_bin: PgBin, + initial_tenant: str, + port_distributor: PortDistributor): self.zenith_cli = zenith_cli self.repo_dir = repo_dir self.num_instances = 0 @@ -689,13 +744,13 @@ class PostgresFactory: self.port_distributor = port_distributor self.pg_bin = pg_bin - def create_start( - self, - branch: str = "main", - tenant_id: Optional[str] = None, - wal_acceptors: Optional[str] = None, - config_lines: Optional[List[str]] = None - ) -> Postgres: + def create_start(self, + node_name: str = "main", + branch: Optional[str] = None, + tenant_id: Optional[str] = None, + wal_acceptors: Optional[str] = None, + config_lines: Optional[List[str]] = None) -> Postgres: + pg = Postgres( zenith_cli=self.zenith_cli, repo_dir=self.repo_dir, @@ -707,18 +762,18 @@ class PostgresFactory: self.instances.append(pg) return pg.create_start( + node_name=node_name, branch=branch, wal_acceptors=wal_acceptors, config_lines=config_lines, ) - def create( - self, - 
branch: str = "main", - tenant_id: Optional[str] = None, - wal_acceptors: Optional[str] = None, - config_lines: Optional[List[str]] = None - ) -> Postgres: + def create(self, + node_name: str = "main", + branch: Optional[str] = None, + tenant_id: Optional[str] = None, + wal_acceptors: Optional[str] = None, + config_lines: Optional[List[str]] = None) -> Postgres: pg = Postgres( zenith_cli=self.zenith_cli, @@ -732,18 +787,17 @@ class PostgresFactory: self.instances.append(pg) return pg.create( + node_name=node_name, branch=branch, wal_acceptors=wal_acceptors, config_lines=config_lines, ) - def config( - self, - branch: str = "main", - tenant_id: Optional[str] = None, - wal_acceptors: Optional[str] = None, - config_lines: Optional[List[str]] = None - ) -> Postgres: + def config(self, + node_name: str = "main", + tenant_id: Optional[str] = None, + wal_acceptors: Optional[str] = None, + config_lines: Optional[List[str]] = None) -> Postgres: pg = Postgres( zenith_cli=self.zenith_cli, @@ -757,7 +811,7 @@ class PostgresFactory: self.instances.append(pg) return pg.config( - branch=branch, + node_name=node_name, wal_acceptors=wal_acceptors, config_lines=config_lines, ) @@ -768,13 +822,18 @@ class PostgresFactory: return self + @zenfixture def initial_tenant(pageserver: ZenithPageserver): return pageserver.initial_tenant @zenfixture -def postgres(zenith_cli: ZenithCli, initial_tenant: str, repo_dir: str, pg_bin: PgBin, port_distributor: PortDistributor) -> Iterator[PostgresFactory]: +def postgres(zenith_cli: ZenithCli, + initial_tenant: str, + repo_dir: str, + pg_bin: PgBin, + port_distributor: PortDistributor) -> Iterator[PostgresFactory]: pgfactory = PostgresFactory( zenith_cli=zenith_cli, repo_dir=repo_dir, @@ -786,53 +845,69 @@ def postgres(zenith_cli: ZenithCli, initial_tenant: str, repo_dir: str, pg_bin: yield pgfactory # After the yield comes any cleanup code we need. 
- print('Starting postgres cleanup') + log.info('Starting postgres cleanup') pgfactory.stop_all() + def read_pid(path: Path): """ Read content of file into number """ return int(path.read_text()) +@dataclass +class WalAcceptorPort: + pg: int + http: int + + @dataclass class WalAcceptor: """ An object representing a running wal acceptor daemon. """ wa_bin_path: Path data_dir: Path - port: int - num: int # identifier for logging + port: WalAcceptorPort + num: int # identifier for logging pageserver_port: int auth_token: Optional[str] = None def start(self) -> 'WalAcceptor': # create data directory if not exists self.data_dir.mkdir(parents=True, exist_ok=True) - self.pidfile.unlink(missing_ok=True) + with suppress(FileNotFoundError): + self.pidfile.unlink() cmd = [str(self.wa_bin_path)] cmd.extend(["-D", str(self.data_dir)]) - cmd.extend(["-l", f"localhost:{self.port}"]) + cmd.extend(["--listen-pg", f"localhost:{self.port.pg}"]) + cmd.extend(["--listen-http", f"localhost:{self.port.http}"]) cmd.append("--daemonize") cmd.append("--no-sync") # Tell page server it can receive WAL from this WAL safekeeper cmd.extend(["--pageserver", f"localhost:{self.pageserver_port}"]) cmd.extend(["--recall", "1 second"]) - print('Running command "{}"'.format(' '.join(cmd))) + log.info('Running command "{}"'.format(' '.join(cmd))) env = {'PAGESERVER_AUTH_TOKEN': self.auth_token} if self.auth_token else None subprocess.run(cmd, check=True, env=env) - # wait for wal acceptor start by checkking that pid is readable - for _ in range(3): - pid = self.get_pid() - if pid is not None: - return self - time.sleep(0.5) - - raise RuntimeError("cannot get wal acceptor pid") + # wait for wal acceptor start by checking its status + started_at = time.time() + while True: + try: + http_cli = self.http_client() + http_cli.check_status() + except Exception as e: + elapsed = time.time() - started_at + if elapsed > 3: + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s for wal acceptor start: {e}") + 
time.sleep(0.5) + else: + break # success + return self @property def pidfile(self) -> Path: - return self.data_dir / "wal_acceptor.pid" + return self.data_dir / "safekeeper.pid" def get_pid(self) -> Optional[int]: if not self.pidfile.exists(): @@ -846,20 +921,21 @@ class WalAcceptor: return pid def stop(self) -> 'WalAcceptor': - print('Stopping wal acceptor {}'.format(self.num)) + log.info('Stopping wal acceptor {}'.format(self.num)) pid = self.get_pid() if pid is None: - print("Wal acceptor {} is not running".format(self.num)) + log.info("Wal acceptor {} is not running".format(self.num)) return self try: os.kill(pid, signal.SIGTERM) except Exception: # TODO: cleanup pid file on exit in wal acceptor - pass # pidfile might be obsolete + pass # pidfile might be obsolete return self - def append_logical_message(self, tenant_id: str, timeline_id: str, request: Dict[str, Any]) -> Dict[str, Any]: + def append_logical_message(self, tenant_id: str, timeline_id: str, + request: Dict[str, Any]) -> Dict[str, Any]: """ Send JSON_CTRL query to append LogicalMessage to WAL and modify safekeeper state. 
It will construct LogicalMessage from provided @@ -868,23 +944,31 @@ class WalAcceptor: # "replication=0" hacks psycopg not to send additional queries # on startup, see https://github.com/psycopg/psycopg2/pull/482 - connstr = f"host=localhost port={self.port} replication=0 options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" + connstr = f"host=localhost port={self.port.pg} replication=0 options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" with closing(psycopg2.connect(connstr)) as conn: # server doesn't support transactions conn.autocommit = True with conn.cursor() as cur: request_json = json.dumps(request) - print(f"JSON_CTRL request on port {self.port}: {request_json}") + log.info(f"JSON_CTRL request on port {self.port.pg}: {request_json}") cur.execute("JSON_CTRL " + request_json) all = cur.fetchall() - print(f"JSON_CTRL response: {all[0][0]}") + log.info(f"JSON_CTRL response: {all[0][0]}") return json.loads(all[0][0]) + def http_client(self): + return WalAcceptorHttpClient(port=self.port.http) + + class WalAcceptorFactory: """ An object representing multiple running wal acceptors. 
""" - def __init__(self, zenith_binpath: Path, data_dir: Path, pageserver_port: int, port_distributor: PortDistributor): - self.wa_bin_path = zenith_binpath / 'wal_acceptor' + def __init__(self, + zenith_binpath: Path, + data_dir: Path, + pageserver_port: int, + port_distributor: PortDistributor): + self.wa_bin_path = zenith_binpath / 'safekeeper' self.data_dir = data_dir self.instances: List[WalAcceptor] = [] self.port_distributor = port_distributor @@ -898,7 +982,10 @@ class WalAcceptorFactory: wa = WalAcceptor( wa_bin_path=self.wa_bin_path, data_dir=self.data_dir / "wal_acceptor_{}".format(wa_num), - port=self.port_distributor.get_port(), + port=WalAcceptorPort( + pg=self.port_distributor.get_port(), + http=self.port_distributor.get_port(), + ), num=wa_num, pageserver_port=self.pageserver_port, auth_token=auth_token, @@ -922,11 +1009,14 @@ class WalAcceptorFactory: def get_connstrs(self) -> str: """ Get list of wal acceptor endpoints suitable for wal_acceptors GUC """ - return ','.join(["localhost:{}".format(wa.port) for wa in self.instances]) + return ','.join(["localhost:{}".format(wa.port.pg) for wa in self.instances]) @zenfixture -def wa_factory(zenith_binpath: str, repo_dir: str, pageserver_port: PageserverPort, port_distributor: PortDistributor) -> Iterator[WalAcceptorFactory]: +def wa_factory(zenith_binpath: str, + repo_dir: str, + pageserver_port: PageserverPort, + port_distributor: PortDistributor) -> Iterator[WalAcceptorFactory]: """ Gives WalAcceptorFactory providing wal acceptors. """ wafactory = WalAcceptorFactory( zenith_binpath=Path(zenith_binpath), @@ -936,16 +1026,36 @@ def wa_factory(zenith_binpath: str, repo_dir: str, pageserver_port: PageserverPo ) yield wafactory # After the yield comes any cleanup code we need. 
- print('Starting wal acceptors cleanup') + log.info('Starting wal acceptors cleanup') wafactory.stop_all() +@dataclass +class PageserverTimelineStatus: + acceptor_epoch: int + + +class WalAcceptorHttpClient(requests.Session): + def __init__(self, port: int) -> None: + super().__init__() + self.port = port + + def check_status(self): + self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + + def timeline_status(self, tenant_id: str, timeline_id: str) -> PageserverTimelineStatus: + res = self.get(f"http://localhost:{self.port}/v1/timeline/{tenant_id}/{timeline_id}") + res.raise_for_status() + resj = res.json() + return PageserverTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch']) + + @zenfixture def base_dir() -> str: """ find the base directory (currently this is the git root) """ base_dir = os.path.normpath(os.path.join(get_self_dir(), '../..')) - print('\nbase_dir is', base_dir) + log.info(f'base_dir is {base_dir}') return base_dir @@ -974,7 +1084,7 @@ def test_output_dir(request: Any, top_output_dir: str) -> str: test_name = 'shared' test_output_dir = os.path.join(top_output_dir, test_name) - print('test_output_dir is', test_output_dir) + log.info(f'test_output_dir is {test_output_dir}') shutil.rmtree(test_output_dir, ignore_errors=True) mkdir_if_needed(test_output_dir) return test_output_dir @@ -1016,7 +1126,7 @@ def pg_distrib_dir(base_dir: str) -> str: pg_dir = env_postgres_bin else: pg_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR)) - print('postgres dir is', pg_dir) + log.info(f'postgres dir is {pg_dir}') if not os.path.exists(os.path.join(pg_dir, 'bin/postgres')): raise Exception('postgres not found at "{}"'.format(pg_dir)) return pg_dir @@ -1038,6 +1148,7 @@ class TenantFactory: def tenant_factory(zenith_cli: ZenithCli): return TenantFactory(zenith_cli) + # # Test helpers # @@ -1048,18 +1159,29 @@ def list_files_to_compare(pgdata_dir: str): rel_dir = os.path.relpath(root, pgdata_dir) # Skip some dirs and 
files we don't want to compare skip_dirs = ['pg_wal', 'pg_stat', 'pg_stat_tmp', 'pg_subtrans', 'pg_logical'] - skip_files = ['pg_internal.init', 'pg.log', 'zenith.signal', 'postgresql.conf', - 'postmaster.opts', 'postmaster.pid', 'pg_control'] + skip_files = [ + 'pg_internal.init', + 'pg.log', + 'zenith.signal', + 'postgresql.conf', + 'postmaster.opts', + 'postmaster.pid', + 'pg_control' + ] if rel_dir not in skip_dirs and filename not in skip_files: rel_file = os.path.join(rel_dir, filename) pgdata_files.append(rel_file) pgdata_files.sort() - print(pgdata_files) + log.info(pgdata_files) return pgdata_files + # pg is the existing and running compute node, that we want to compare with a basebackup -def check_restored_datadir_content(zenith_cli: ZenithCli, test_output_dir: str, pg: Postgres, pageserver_pg_port: int): +def check_restored_datadir_content(zenith_cli: ZenithCli, + test_output_dir: str, + pg: Postgres, + pageserver_pg_port: int): # Get the timeline ID of our branch. We need it for the 'basebackup' command with closing(pg.connect()) as conn: @@ -1071,7 +1193,7 @@ def check_restored_datadir_content(zenith_cli: ZenithCli, test_output_dir: str, pg.stop() # Take a basebackup from pageserver - restored_dir_path = os.path.join(test_output_dir, f"{pg.branch}_restored_datadir") + restored_dir_path = os.path.join(test_output_dir, f"{pg.node_name}_restored_datadir") mkdir_if_needed(restored_dir_path) psql_path = os.path.join(pg.pg_bin.pg_bin_path, 'psql') @@ -1101,9 +1223,7 @@ def check_restored_datadir_content(zenith_cli: ZenithCli, test_output_dir: str, restored_dir_path, pgdata_files, shallow=False) - print('filecmp result mismatch and error lists:') - print(mismatch) - print(error) + log.info(f'filecmp result mismatch and error lists:\n\t mismatch={mismatch}\n\t error={error}') for f in mismatch: diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 95f1ea5e4a..cf6fa03703 100644 --- 
a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,9 +1,11 @@ import os from contextlib import closing from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") + # # Run bulk INSERT test. # @@ -14,16 +16,21 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") # 3. Disk space used # 4. Peak memory usage # -def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str): +def test_bulk_insert(postgres: PostgresFactory, + pageserver: ZenithPageserver, + pg_bin, + zenith_cli, + zenbenchmark, + repo_dir: str): # Create a branch for us zenith_cli.run(["branch", "test_bulk_insert", "empty"]) pg = postgres.create_start('test_bulk_insert') - print("postgres is running on 'test_bulk_insert' branch") + log.info("postgres is running on 'test_bulk_insert' branch") # Open a connection directly to the page server that we'll use to force # flushing the layers to disk - psconn = pageserver.connect(); + psconn = pageserver.connect() pscur = psconn.cursor() # Get the timeline ID of our branch. 
We need it for the 'do_gc' command @@ -47,5 +54,7 @@ def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB') # Report disk space used by the repository - timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline) - zenbenchmark.record('size', timeline_size / (1024*1024), 'MB') + timeline_size = zenbenchmark.get_timeline_size(repo_dir, + pageserver.initial_tenant, + timeline) + zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB') diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index e1de1dd014..1e2a17c2c9 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -37,7 +37,9 @@ def test_bulk_tenant_create( tenant = tenant_factory.create() zenith_cli.run([ - "branch", f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", "main", + "branch", + f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", + "main", f"--tenantid={tenant}" ]) @@ -46,6 +48,7 @@ def test_bulk_tenant_create( pg_tenant = postgres.create_start( f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", + None, # branch name, None means same as node name tenant, wal_acceptors=wa_factory.get_connstrs() if use_wal_acceptors == 'with_wa' else None, ) diff --git a/test_runner/performance/test_gist_build.py b/test_runner/performance/test_gist_build.py new file mode 100644 index 0000000000..5a80978cf0 --- /dev/null +++ b/test_runner/performance/test_gist_build.py @@ -0,0 +1,61 @@ +import os +from contextlib import closing +from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver +from fixtures.log_helper import log + +pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") + + +# +# Test buffering GisT build. 
It WAL-logs the whole relation, in 32-page chunks. +# As of this writing, we're duplicate those giant WAL records for each page, +# which makes the delta layer about 32x larger than it needs to be. +# +def test_gist_buffering_build(postgres: PostgresFactory, + pageserver: ZenithPageserver, + pg_bin, + zenith_cli, + zenbenchmark, + repo_dir: str): + # Create a branch for us + zenith_cli.run(["branch", "test_gist_buffering_build", "empty"]) + + pg = postgres.create_start('test_gist_buffering_build') + log.info("postgres is running on 'test_gist_buffering_build' branch") + + # Open a connection directly to the page server that we'll use to force + # flushing the layers to disk + psconn = pageserver.connect() + pscur = psconn.cursor() + + # Get the timeline ID of our branch. We need it for the 'do_gc' command + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SHOW zenith.zenith_timeline") + timeline = cur.fetchone()[0] + + # Create test table. + cur.execute("create table gist_point_tbl(id int4, p point)") + cur.execute( + "insert into gist_point_tbl select g, point(g, g) from generate_series(1, 1000000) g;" + ) + + # Build the index. + with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'): + with zenbenchmark.record_duration('build'): + cur.execute( + "create index gist_pointidx2 on gist_point_tbl using gist(p) with (buffering = on)" + ) + + # Flush the layers from memory to disk. 
This is included in the reported + # time and I/O + pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 1000000") + + # Record peak memory usage + zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB') + + # Report disk space used by the repository + timeline_size = zenbenchmark.get_timeline_size(repo_dir, + pageserver.initial_tenant, + timeline) + zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB') diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 18db78f12a..388ac4314c 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -1,9 +1,11 @@ import os from contextlib import closing from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") + # # Run a very short pgbench test. # @@ -13,16 +15,21 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") # 2. Time to run 5000 pgbench transactions # 3. Disk space used # -def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str): +def test_pgbench(postgres: PostgresFactory, + pageserver: ZenithPageserver, + pg_bin, + zenith_cli, + zenbenchmark, + repo_dir: str): # Create a branch for us zenith_cli.run(["branch", "test_pgbench_perf", "empty"]) pg = postgres.create_start('test_pgbench_perf') - print("postgres is running on 'test_pgbench_perf' branch") + log.info("postgres is running on 'test_pgbench_perf' branch") # Open a connection directly to the page server that we'll use to force # flushing the layers to disk - psconn = pageserver.connect(); + psconn = pageserver.connect() pscur = psconn.cursor() # Get the timeline ID of our branch. 
We need it for the 'do_gc' command @@ -52,4 +59,4 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin # Report disk space used by the repository timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline) - zenbenchmark.record('size', timeline_size / (1024*1024), 'MB') + zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB') diff --git a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py index 09310c702b..1a1cc7bf21 100644 --- a/test_runner/performance/test_write_amplification.py +++ b/test_runner/performance/test_write_amplification.py @@ -13,19 +13,26 @@ import os from contextlib import closing from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver +from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") -def test_write_amplification(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str): + +def test_write_amplification(postgres: PostgresFactory, + pageserver: ZenithPageserver, + pg_bin, + zenith_cli, + zenbenchmark, + repo_dir: str): # Create a branch for us zenith_cli.run(["branch", "test_write_amplification", "empty"]) pg = postgres.create_start('test_write_amplification') - print("postgres is running on 'test_write_amplification' branch") + log.info("postgres is running on 'test_write_amplification' branch") # Open a connection directly to the page server that we'll use to force # flushing the layers to disk - psconn = pageserver.connect(); + psconn = pageserver.connect() pscur = psconn.cursor() with closing(pg.connect()) as conn: @@ -70,5 +77,7 @@ def test_write_amplification(postgres: PostgresFactory, pageserver: ZenithPagese pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") # Report disk space used by the repository - timeline_size = zenbenchmark.get_timeline_size(repo_dir, 
pageserver.initial_tenant, timeline) - zenbenchmark.record('size', timeline_size / (1024*1024), 'MB') + timeline_size = zenbenchmark.get_timeline_size(repo_dir, + pageserver.initial_tenant, + timeline) + zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB') diff --git a/test_runner/pytest.ini b/test_runner/pytest.ini index 78b5304f78..e6c7013559 100644 --- a/test_runner/pytest.ini +++ b/test_runner/pytest.ini @@ -1,2 +1,4 @@ [pytest] minversion = 6.0 +log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s +log_date_format = %Y-%m-%d %H:%M:%S diff --git a/test_runner/setup.cfg b/test_runner/setup.cfg index 578cb28efc..cff4c7f86e 100644 --- a/test_runner/setup.cfg +++ b/test_runner/setup.cfg @@ -10,6 +10,7 @@ max-line-length = 100 [yapf] based_on_style = pep8 column_limit = 100 +split_all_top_level_comma_separated_values = true [mypy] # some tests don't typecheck when this flag is set diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index da715d7387..66bfe1192c 100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -1,6 +1,8 @@ import pytest import os +from fixtures.log_helper import log + pytest_plugins = ("fixtures.zenith_fixtures") """ Use this test to see what happens when tests fail. 
@@ -22,7 +24,7 @@ def test_broken(zenith_cli, pageserver, postgres, pg_bin): zenith_cli.run(["branch", "test_broken", "empty"]) postgres.create_start("test_broken") - print('postgres is running') + log.info('postgres is running') - print('THIS NEXT COMMAND WILL FAIL:') + log.info('THIS NEXT COMMAND WILL FAIL:') pg_bin.run('pgbench -i_am_a_broken_test'.split()) diff --git a/vendor/postgres b/vendor/postgres index 93b1dd0055..6b58de66ec 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 93b1dd005527f3c82aec2dbf3b220aba8c9eab2c +Subproject commit 6b58de66ec08e5dd8747353b3c33e696e5bfde81 diff --git a/walkeeper/Cargo.toml b/walkeeper/Cargo.toml index 16790ca214..2e2e435236 100644 --- a/walkeeper/Cargo.toml +++ b/walkeeper/Cargo.toml @@ -11,6 +11,8 @@ regex = "1.4.5" bincode = "1.3" bytes = "1.0.1" byteorder = "1.4.3" +hyper = "0.14" +routerify = "2" fs2 = "0.4.3" lazy_static = "1.4.0" serde_json = "1" @@ -28,9 +30,11 @@ humantime = "2.1.0" walkdir = "2" serde = { version = "1.0", features = ["derive"] } hex = "0.4.3" +const_format = "0.2.21" # FIXME: 'pageserver' is needed for ZTimelineId. Refactor pageserver = { path = "../pageserver" } postgres_ffi = { path = "../postgres_ffi" } workspace_hack = { path = "../workspace_hack" } +zenith_metrics = { path = "../zenith_metrics" } zenith_utils = { path = "../zenith_utils" } diff --git a/walkeeper/README b/walkeeper/README index 6c5a69e926..05325bafd9 100644 --- a/walkeeper/README +++ b/walkeeper/README @@ -89,12 +89,12 @@ A: Page Server is a single server which can be lost. As our primary Q: What if the compute node evicts a page, needs it back, but the page is yet to reach the Page Server? -A: If the compute node has evicted a page, all changes from that page are - already committed, i.e. they are saved on majority of WAL safekeepers. These - WAL records will eventually reach the Page Server. 
The Page Server notes - that the compute note requests pages with a very recent LSN and will not - respond to the compute node until it a corresponding WAL is received from WAL - safekeepers. +A: If the compute node has evicted a page, changes to it have been WAL-logged + (that's why it is called Write Ahead logging; there are some exceptions like + index builds, but these are exceptions). These WAL records will eventually + reach the Page Server. The Page Server notes that the compute note requests + pages with a very recent LSN and will not respond to the compute node until a + corresponding WAL is received from WAL safekeepers. Q: How long may Page Server wait for? A: Not too long, hopefully. If a page is evicted, it probably was not used for diff --git a/walkeeper/src/bin/wal_acceptor.rs b/walkeeper/src/bin/safekeeper.rs similarity index 64% rename from walkeeper/src/bin/wal_acceptor.rs rename to walkeeper/src/bin/safekeeper.rs index d8a0ab6737..7ce8765789 100644 --- a/walkeeper/src/bin/wal_acceptor.rs +++ b/walkeeper/src/bin/safekeeper.rs @@ -1,35 +1,48 @@ // -// Main entry point for the wal_acceptor executable +// Main entry point for the safekeeper executable // use anyhow::Result; use clap::{App, Arg}; +use const_format::formatcp; use daemonize::Daemonize; use log::*; use std::env; +use std::net::TcpListener; use std::path::{Path, PathBuf}; use std::thread; +use zenith_utils::http::endpoint; use zenith_utils::logging; +use walkeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR}; +use walkeeper::http; use walkeeper::s3_offload; use walkeeper::wal_service; use walkeeper::WalAcceptorConf; fn main() -> Result<()> { - let arg_matches = App::new("Zenith wal_acceptor") + zenith_metrics::set_common_metrics_prefix("safekeeper"); + let arg_matches = App::new("Zenith safekeeper") .about("Store WAL stream to local file system and push it to WAL receivers") .arg( Arg::with_name("datadir") .short("D") .long("dir") .takes_value(true) - .help("Path to the 
WAL acceptor data directory"), + .help("Path to the safekeeper data directory"), ) .arg( - Arg::with_name("listen") + Arg::with_name("listen-pg") .short("l") - .long("listen") + .long("listen-pg") + .alias("listen") // for compatibility .takes_value(true) - .help("listen for incoming connections on ip:port (default: 127.0.0.1:5454)"), + .help(formatcp!("listen for incoming WAL data connections on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")), + ) + .arg( + Arg::with_name("listen-http") + .long("listen-http") + .takes_value(true) + .help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")), ) .arg( Arg::with_name("pageserver") @@ -70,7 +83,8 @@ fn main() -> Result<()> { daemonize: false, no_sync: false, pageserver_addr: None, - listen_addr: "localhost:5454".to_string(), + listen_pg_addr: DEFAULT_PG_LISTEN_ADDR.to_string(), + listen_http_addr: DEFAULT_HTTP_LISTEN_ADDR.to_string(), ttl: None, recall_period: None, pageserver_auth_token: env::var("PAGESERVER_AUTH_TOKEN").ok(), @@ -91,8 +105,12 @@ fn main() -> Result<()> { conf.daemonize = true; } - if let Some(addr) = arg_matches.value_of("listen") { - conf.listen_addr = addr.to_owned(); + if let Some(addr) = arg_matches.value_of("listen-pg") { + conf.listen_pg_addr = addr.to_owned(); + } + + if let Some(addr) = arg_matches.value_of("listen-http") { + conf.listen_http_addr = addr.to_owned(); } if let Some(addr) = arg_matches.value_of("pageserver") { @@ -111,8 +129,19 @@ fn main() -> Result<()> { } fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> { - let log_filename = conf.data_dir.join("wal_acceptor.log"); - let (_scope_guard, log_file) = logging::init(log_filename, conf.daemonize)?; + let log_filename = conf.data_dir.join("safekeeper.log"); + let log_file = logging::init(log_filename, conf.daemonize)?; + + let http_listener = TcpListener::bind(conf.listen_http_addr.clone()).map_err(|e| { + error!("failed to bind to address {}: {}", conf.listen_http_addr, e); + e + 
})?; + + info!("Starting safekeeper on {}", conf.listen_pg_addr); + let pg_listener = TcpListener::bind(conf.listen_pg_addr.clone()).map_err(|e| { + error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); + e + })?; if conf.daemonize { info!("daemonizing..."); @@ -123,7 +152,7 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> { let stderr = log_file; let daemonize = Daemonize::new() - .pid_file("wal_acceptor.pid") + .pid_file("safekeeper.pid") .working_directory(Path::new(".")) .stdout(stdout) .stderr(stderr); @@ -136,6 +165,17 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> { let mut threads = Vec::new(); + let conf_cloned = conf.clone(); + let http_endpoint_thread = thread::Builder::new() + .name("http_endpoint_thread".into()) + .spawn(|| { + // TODO authentication + let router = http::make_router(conf_cloned); + endpoint::serve_thread_main(router, http_listener).unwrap(); + }) + .unwrap(); + threads.push(http_endpoint_thread); + if conf.ttl.is_some() { let s3_conf = conf.clone(); let s3_offload_thread = thread::Builder::new() @@ -152,7 +192,7 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> { .name("WAL acceptor thread".into()) .spawn(|| { // thread code - let thread_result = wal_service::thread_main(conf); + let thread_result = wal_service::thread_main(conf, pg_listener); if let Err(e) = thread_result { info!("wal_service thread terminated: {}", e); } diff --git a/walkeeper/src/http/mod.rs b/walkeeper/src/http/mod.rs new file mode 100644 index 0000000000..c82d1c0362 --- /dev/null +++ b/walkeeper/src/http/mod.rs @@ -0,0 +1,2 @@ +pub mod routes; +pub use routes::make_router; diff --git a/walkeeper/src/http/routes.rs b/walkeeper/src/http/routes.rs new file mode 100644 index 0000000000..8ab405508e --- /dev/null +++ b/walkeeper/src/http/routes.rs @@ -0,0 +1,88 @@ +use hyper::{Body, Request, Response, StatusCode}; +use routerify::ext::RequestExt; +use routerify::RouterBuilder; +use serde::Serialize; +use 
serde::Serializer; +use std::fmt::Display; +use std::sync::Arc; +use zenith_utils::lsn::Lsn; + +use crate::safekeeper::AcceptorState; +use crate::timeline::CreateControlFile; +use crate::timeline::GlobalTimelines; +use crate::WalAcceptorConf; +use zenith_utils::http::endpoint; +use zenith_utils::http::error::ApiError; +use zenith_utils::http::json::json_response; +use zenith_utils::http::request::parse_request_param; +use zenith_utils::zid::{ZTenantId, ZTimelineId}; + +/// Healthcheck handler. +async fn status_handler(_: Request) -> Result, ApiError> { + Ok(json_response(StatusCode::OK, "")?) +} + +fn get_conf(request: &Request) -> &WalAcceptorConf { + request + .data::>() + .expect("unknown state type") + .as_ref() +} + +fn display_serialize(z: &F, s: S) -> Result +where + S: Serializer, + F: Display, +{ + s.serialize_str(&format!("{}", z)) +} + +/// Info about timeline on safekeeper ready for reporting. +#[derive(Debug, Serialize)] +struct TimelineStatus { + #[serde(serialize_with = "display_serialize")] + tenant_id: ZTenantId, + #[serde(serialize_with = "display_serialize")] + timeline_id: ZTimelineId, + acceptor_state: AcceptorState, + #[serde(serialize_with = "display_serialize")] + commit_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + truncate_lsn: Lsn, +} + +/// Report info about timeline. +async fn timeline_status_handler(request: Request) -> Result, ApiError> { + let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + + let tli = GlobalTimelines::get( + get_conf(&request), + tenant_id, + timeline_id, + CreateControlFile::False, + ) + .map_err(ApiError::from_err)?; + let sk_state = tli.get_info(); + + let status = TimelineStatus { + tenant_id, + timeline_id, + acceptor_state: sk_state.acceptor_state, + commit_lsn: sk_state.commit_lsn, + truncate_lsn: sk_state.truncate_lsn, + }; + Ok(json_response(StatusCode::OK, status)?) 
+} + +/// Safekeeper http router. +pub fn make_router(conf: WalAcceptorConf) -> RouterBuilder { + let router = endpoint::make_router(); + router + .data(Arc::new(conf)) + .get("/v1/status", status_handler) + .get( + "/v1/timeline/:tenant_id/:timeline_id", + timeline_status_handler, + ) +} diff --git a/walkeeper/src/lib.rs b/walkeeper/src/lib.rs index fb04459c47..4406823076 100644 --- a/walkeeper/src/lib.rs +++ b/walkeeper/src/lib.rs @@ -2,6 +2,7 @@ use std::path::PathBuf; use std::time::Duration; +pub mod http; pub mod json_ctrl; pub mod receive_wal; pub mod replication; @@ -11,12 +12,23 @@ pub mod send_wal; pub mod timeline; pub mod wal_service; +pub mod defaults { + use const_format::formatcp; + + pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; + pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); + + pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; + pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +} + #[derive(Debug, Clone)] pub struct WalAcceptorConf { pub data_dir: PathBuf, pub daemonize: bool, pub no_sync: bool, - pub listen_addr: String, + pub listen_pg_addr: String, + pub listen_http_addr: String, pub pageserver_addr: Option, // TODO (create issue) this is temporary, until protocol between PG<->SK<->PS rework pub pageserver_auth_token: Option, diff --git a/walkeeper/src/receive_wal.rs b/walkeeper/src/receive_wal.rs index 4596344b76..527c8d891c 100644 --- a/walkeeper/src/receive_wal.rs +++ b/walkeeper/src/receive_wal.rs @@ -42,7 +42,7 @@ fn request_callback(conf: WalAcceptorConf, timelineid: ZTimelineId, tenantid: ZT ); // use Config parsing because SockAddr parsing doesnt allow to use host names instead of ip addresses - let me_connstr = format!("postgresql://no_user@{}/no_db", conf.listen_addr); + let me_connstr = format!("postgresql://no_user@{}/no_db", conf.listen_pg_addr); let me_conf: Config = me_connstr.parse().unwrap(); let (host, port) = connection_host_port(&me_conf); 
let callme = format!( diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs index 95f0e9e0c2..49e5945c95 100644 --- a/walkeeper/src/safekeeper.rs +++ b/walkeeper/src/safekeeper.rs @@ -15,8 +15,11 @@ use std::cmp::min; use std::io; use std::io::Read; +use lazy_static::lazy_static; + use crate::replication::HotStandbyFeedback; use postgres_ffi::xlog_utils::MAX_SEND_SIZE; +use zenith_metrics::{register_gauge_vec, Gauge, GaugeVec}; use zenith_utils::bin_ser::LeSer; use zenith_utils::lsn::Lsn; use zenith_utils::pq_proto::SystemId; @@ -281,6 +284,45 @@ pub trait Storage { fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()>; } +lazy_static! { + // The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). + // i64 is faster than f64, so update to u64 when available. + static ref FLUSH_LSN_GAUGE: GaugeVec = register_gauge_vec!( + "safekeeper_flush_lsn", + "Current flush_lsn, grouped by timeline", + &["ztli"] + ) + .expect("Failed to register safekeeper_flush_lsn gauge vec"); + static ref COMMIT_LSN_GAUGE: GaugeVec = register_gauge_vec!( + "safekeeper_commit_lsn", + "Current commit_lsn (not necessarily persisted to disk), grouped by timeline", + &["ztli"] + ) + .expect("Failed to register safekeeper_commit_lsn gauge vec"); +} + +struct SafeKeeperMetrics { + flush_lsn: Gauge, + commit_lsn: Gauge, +} + +impl SafeKeeperMetrics { + fn new(ztli: ZTimelineId) -> SafeKeeperMetrics { + let ztli_str = format!("{}", ztli); + SafeKeeperMetrics { + flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&[&ztli_str]), + commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&[&ztli_str]), + } + } + + fn new_noname() -> SafeKeeperMetrics { + SafeKeeperMetrics { + flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&["n/a"]), + commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&["n/a"]), + } + } +} + /// SafeKeeper which consumes events (messages from compute) and provides /// replies. 
pub struct SafeKeeper { @@ -288,6 +330,8 @@ pub struct SafeKeeper { /// Established by reading wal. pub flush_lsn: Lsn, pub tli: u32, + // Cached metrics so we don't have to recompute labels on each update. + metrics: SafeKeeperMetrics, /// not-yet-flushed pairs of same named fields in s.* pub commit_lsn: Lsn, pub truncate_lsn: Lsn, @@ -306,6 +350,7 @@ where SafeKeeper { flush_lsn, tli, + metrics: SafeKeeperMetrics::new_noname(), commit_lsn: state.commit_lsn, truncate_lsn: state.truncate_lsn, storage, @@ -357,6 +402,8 @@ where self.s.server.wal_seg_size = msg.wal_seg_size; self.storage.persist(&self.s, true)?; + self.metrics = SafeKeeperMetrics::new(self.s.server.ztli); + info!( "processed greeting from proposer {:?}, sending term {:?}", msg.proposer_id, self.s.acceptor_state.term @@ -481,6 +528,7 @@ where } if last_rec_lsn > self.flush_lsn { self.flush_lsn = last_rec_lsn; + self.metrics.flush_lsn.set(u64::from(self.flush_lsn) as f64); } // Advance commit_lsn taking into account what we have locally. 
xxx this @@ -498,6 +546,9 @@ where sync_control_file |= commit_lsn >= msg.h.epoch_start_lsn && self.s.commit_lsn < msg.h.epoch_start_lsn; self.commit_lsn = commit_lsn; + self.metrics + .commit_lsn + .set(u64::from(self.commit_lsn) as f64); } self.truncate_lsn = msg.h.truncate_lsn; diff --git a/walkeeper/src/send_wal.rs b/walkeeper/src/send_wal.rs index e81b6c5eac..fcd8595e15 100644 --- a/walkeeper/src/send_wal.rs +++ b/walkeeper/src/send_wal.rs @@ -13,14 +13,13 @@ use std::str::FromStr; use std::sync::Arc; use zenith_utils::postgres_backend; use zenith_utils::postgres_backend::PostgresBackend; -use zenith_utils::pq_proto::{BeMessage, FeStartupMessage, RowDescriptor}; +use zenith_utils::pq_proto::{BeMessage, FeStartupMessage, RowDescriptor, INT4_OID, TEXT_OID}; use zenith_utils::zid::{ZTenantId, ZTimelineId}; use crate::timeline::CreateControlFile; /// Handler for streaming WAL from acceptor pub struct SendWalHandler { - /// wal acceptor configuration pub conf: WalAcceptorConf, /// assigned application name pub appname: Option, @@ -72,19 +71,16 @@ impl postgres_backend::Handler for SendWalHandler { } if query_string.starts_with(b"IDENTIFY_SYSTEM") { self.handle_identify_system(pgb)?; - Ok(()) } else if query_string.starts_with(b"START_REPLICATION") { ReplicationConn::new(pgb).run(self, pgb, &query_string)?; - Ok(()) } else if query_string.starts_with(b"START_WAL_PUSH") { ReceiveWalConn::new(pgb)?.run(self)?; - Ok(()) } else if query_string.starts_with(b"JSON_CTRL") { handle_json_ctrl(self, pgb, &query_string)?; - Ok(()) } else { bail!("Unexpected command {:?}", query_string); } + Ok(()) } } @@ -114,25 +110,25 @@ impl SendWalHandler { pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor { name: b"systemid", - typoid: 25, + typoid: TEXT_OID, typlen: -1, ..Default::default() }, RowDescriptor { name: b"timeline", - typoid: 23, + typoid: INT4_OID, typlen: 4, ..Default::default() }, RowDescriptor { name: b"xlogpos", - typoid: 25, + typoid: TEXT_OID, 
typlen: -1, ..Default::default() }, RowDescriptor { name: b"dbname", - typoid: 25, + typoid: TEXT_OID, typlen: -1, ..Default::default() }, diff --git a/walkeeper/src/timeline.rs b/walkeeper/src/timeline.rs index 42e8afabb8..b30c061c9c 100644 --- a/walkeeper/src/timeline.rs +++ b/walkeeper/src/timeline.rs @@ -155,7 +155,7 @@ impl SharedState { } match opts.open(&control_file_path) { Ok(mut file) => { - // Lock file to prevent two or more active wal_acceptors + // Lock file to prevent two or more active safekeepers match file.try_lock_exclusive() { Ok(()) => {} Err(e) => { @@ -340,7 +340,7 @@ lazy_static! { } /// A zero-sized struct used to manage access to the global timelines map. -struct GlobalTimelines; +pub struct GlobalTimelines; impl GlobalTimelines { /// Get a timeline with control file loaded from the global TIMELINES map. diff --git a/walkeeper/src/wal_service.rs b/walkeeper/src/wal_service.rs index c77078560c..4a294e9c95 100644 --- a/walkeeper/src/wal_service.rs +++ b/walkeeper/src/wal_service.rs @@ -12,13 +12,7 @@ use crate::WalAcceptorConf; use zenith_utils::postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. 
-pub fn thread_main(conf: WalAcceptorConf) -> Result<()> { - info!("Starting wal acceptor on {}", conf.listen_addr); - let listener = TcpListener::bind(conf.listen_addr.clone()).map_err(|e| { - error!("failed to bind to address {}: {}", conf.listen_addr, e); - e - })?; - +pub fn thread_main(conf: WalAcceptorConf, listener: TcpListener) -> Result<()> { loop { match listener.accept() { Ok((socket, peer_addr)) => { @@ -41,8 +35,8 @@ fn handle_socket(socket: TcpStream, conf: WalAcceptorConf) -> Result<()> { socket.set_nodelay(true)?; let mut conn_handler = SendWalHandler::new(conf); - let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?; - // libpq replication protocol between wal_acceptor and replicas/pagers + let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, false)?; + // libpq replication protocol between safekeeper and replicas/pagers pgbackend.run(&mut conn_handler)?; Ok(()) diff --git a/zenith/src/main.rs b/zenith/src/main.rs index 1c04e803e6..e79d42377e 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -32,12 +32,16 @@ struct BranchTreeEl { // * Providing CLI api to the pageserver // * TODO: export/import to/from usual postgres fn main() -> Result<()> { - let timeline_arg = Arg::with_name("timeline") - .short("n") + let node_arg = Arg::with_name("node") .index(1) - .help("Timeline name") + .help("Node name") .required(true); + let timeline_arg = Arg::with_name("timeline") + .index(2) + .help("Branch name or a point-in time specification") + .required(false); + let tenantid_arg = Arg::with_name("tenantid") .long("tenantid") .help("Tenant id. 
Represented as a hexadecimal string 32 symbols length") @@ -88,7 +92,12 @@ fn main() -> Result<()> { ) .subcommand(SubCommand::with_name("status")) .subcommand(SubCommand::with_name("start").about("Start local pageserver")) - .subcommand(SubCommand::with_name("stop").about("Stop local pageserver")) + .subcommand(SubCommand::with_name("stop").about("Stop local pageserver") + .arg(Arg::with_name("immediate") + .help("Don't flush repository data at shutdown") + .required(false) + ) + ) .subcommand(SubCommand::with_name("restart").about("Restart local pageserver")) .subcommand( SubCommand::with_name("pg") @@ -97,7 +106,10 @@ fn main() -> Result<()> { .subcommand(SubCommand::with_name("list").arg(tenantid_arg.clone())) .subcommand(SubCommand::with_name("create") .about("Create a postgres compute node") - .arg(timeline_arg.clone()).arg(tenantid_arg.clone()).arg(port_arg.clone()) + .arg(node_arg.clone()) + .arg(timeline_arg.clone()) + .arg(tenantid_arg.clone()) + .arg(port_arg.clone()) .arg( Arg::with_name("config-only") .help("Don't do basebackup, create compute node with only config files") @@ -106,13 +118,13 @@ fn main() -> Result<()> { )) .subcommand(SubCommand::with_name("start") .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") - .arg( - timeline_arg.clone() - ).arg( - tenantid_arg.clone() - ).arg(port_arg.clone())) + .arg(node_arg.clone()) + .arg(timeline_arg.clone()) + .arg(tenantid_arg.clone()) + .arg(port_arg.clone())) .subcommand( SubCommand::with_name("stop") + .arg(node_arg.clone()) .arg(timeline_arg.clone()) .arg(tenantid_arg.clone()) .arg( @@ -196,10 +208,12 @@ fn main() -> Result<()> { } } - ("stop", Some(_sub_m)) => { + ("stop", Some(stop_match)) => { let pageserver = PageServerNode::from_env(&env); - if let Err(e) = pageserver.stop() { + let immediate = stop_match.is_present("immediate"); + + if let Err(e) = pageserver.stop(immediate) { eprintln!("pageserver stop failed: 
{}", e); exit(1); } @@ -208,7 +222,8 @@ fn main() -> Result<()> { ("restart", Some(_sub_m)) => { let pageserver = PageServerNode::from_env(&env); - if let Err(e) = pageserver.stop() { + //TODO what shutdown strategy should we use here? + if let Err(e) = pageserver.stop(false) { eprintln!("pageserver stop failed: {}", e); exit(1); } @@ -422,25 +437,32 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let tenantid: ZTenantId = list_match .value_of("tenantid") .map_or(Ok(env.tenantid), |value| value.parse())?; + let branch_infos = get_branch_infos(env, &tenantid).unwrap_or_else(|e| { eprintln!("Failed to load branch info: {}", e); HashMap::new() }); - println!("BRANCH\tADDRESS\t\tLSN\t\tSTATUS"); - for ((_, timeline_name), node) in cplane + println!("NODE\tADDRESS\t\tBRANCH\tLSN\t\tSTATUS"); + for ((_, node_name), node) in cplane .nodes .iter() .filter(|((node_tenantid, _), _)| node_tenantid == &tenantid) { + // FIXME: This shows the LSN at the end of the timeline. It's not the + // right thing to do for read-only nodes that might be anchored at an + // older point in time, or following but lagging behind the primary. 
+ let lsn_str = branch_infos + .get(&node.timelineid) + .map(|bi| bi.latest_valid_lsn.to_string()) + .unwrap_or_else(|| "?".to_string()); + println!( - "{}\t{}\t{}\t{}", - timeline_name, + "{}\t{}\t{}\t{}\t{}", + node_name, node.address, - branch_infos - .get(&node.timelineid) - .map(|bi| bi.latest_valid_lsn.to_string()) - .unwrap_or_else(|| "?".to_string()), + node.timelineid, // FIXME: resolve human-friendly branch name + lsn_str, node.status(), ); } @@ -449,26 +471,28 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let tenantid: ZTenantId = create_match .value_of("tenantid") .map_or(Ok(env.tenantid), |value| value.parse())?; - let timeline_name = create_match.value_of("timeline").unwrap_or("main"); + let node_name = create_match.value_of("node").unwrap_or("main"); + let timeline_name = create_match.value_of("timeline").unwrap_or(node_name); let port: Option = match create_match.value_of("port") { Some(p) => Some(p.parse()?), None => None, }; - cplane.new_node(tenantid, timeline_name, port)?; + cplane.new_node(tenantid, node_name, timeline_name, port)?; } ("start", Some(start_match)) => { let tenantid: ZTenantId = start_match .value_of("tenantid") .map_or(Ok(env.tenantid), |value| value.parse())?; - let timeline_name = start_match.value_of("timeline").unwrap_or("main"); + let node_name = start_match.value_of("node").unwrap_or("main"); + let timeline_name = start_match.value_of("timeline"); let port: Option = match start_match.value_of("port") { Some(p) => Some(p.parse()?), None => None, }; - let node = cplane.nodes.get(&(tenantid, timeline_name.to_owned())); + let node = cplane.nodes.get(&(tenantid, node_name.to_owned())); let auth_token = if matches!(env.auth_type, AuthType::ZenithJWT) { let claims = Claims::new(Some(tenantid), Scope::Tenant); @@ -477,12 +501,11 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { None }; - println!( - "Starting {} postgres on timeline {}...", - if node.is_some() { 
"existing" } else { "new" }, - timeline_name - ); if let Some(node) = node { + if timeline_name.is_some() { + println!("timeline name ignored because node exists already"); + } + println!("Starting existing postgres {}...", node_name); node.start(&auth_token)?; } else { // when used with custom port this results in non obvious behaviour @@ -490,12 +513,17 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { // start --port X // stop // start <-- will also use port X even without explicit port argument - let node = cplane.new_node(tenantid, timeline_name, port)?; + let timeline_name = timeline_name.unwrap_or(node_name); + println!( + "Starting new postgres {} on {}...", + node_name, timeline_name + ); + let node = cplane.new_node(tenantid, node_name, timeline_name, port)?; node.start(&auth_token)?; } } ("stop", Some(stop_match)) => { - let timeline_name = stop_match.value_of("timeline").unwrap_or("main"); + let node_name = stop_match.value_of("node").unwrap_or("main"); let destroy = stop_match.is_present("destroy"); let tenantid: ZTenantId = stop_match .value_of("tenantid") @@ -503,8 +531,8 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let node = cplane .nodes - .get(&(tenantid, timeline_name.to_owned())) - .ok_or_else(|| anyhow!("postgres {} is not found", timeline_name))?; + .get(&(tenantid, node_name.to_owned())) + .ok_or_else(|| anyhow!("postgres {} is not found", node_name))?; node.stop(destroy)?; } diff --git a/zenith_metrics/src/lib.rs b/zenith_metrics/src/lib.rs index e3c3c81ee7..59a8a31c9e 100644 --- a/zenith_metrics/src/lib.rs +++ b/zenith_metrics/src/lib.rs @@ -5,6 +5,8 @@ use lazy_static::lazy_static; use once_cell::race::OnceBox; pub use prometheus::{exponential_buckets, linear_buckets}; +pub use prometheus::{register_gauge, Gauge}; +pub use prometheus::{register_gauge_vec, GaugeVec}; pub use prometheus::{register_histogram, Histogram}; pub use prometheus::{register_histogram_vec, 
HistogramVec}; pub use prometheus::{register_int_counter, IntCounter}; @@ -44,7 +46,7 @@ pub fn set_common_metrics_prefix(prefix: &'static str) { } /// Prepends a prefix to a common metric name so they are distinguished between -/// different services, see https://github.com/zenithdb/zenith/pull/681 +/// different services, see /// A call to set_common_metrics_prefix() is necessary prior to calling this. pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String { // Not unwrap() because metrics may be initialized after multiple threads have been started. diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml index 22c1c9bab6..6571fae042 100644 --- a/zenith_utils/Cargo.toml +++ b/zenith_utils/Cargo.toml @@ -18,12 +18,9 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1" thiserror = "1.0" tokio = "1.11" - -slog-async = "2.6.0" -slog-stdlog = "4.1.0" -slog-scope = "4.4.0" -slog-term = "2.8.0" -slog = "2.7.0" +tracing = "0.1" +tracing-log = "0.1" +tracing-subscriber = "0.2" zenith_metrics = { path = "../zenith_metrics" } workspace_hack = { path = "../workspace_hack" } diff --git a/zenith_utils/src/http/endpoint.rs b/zenith_utils/src/http/endpoint.rs index 3c5b53b77a..30e7bfc921 100644 --- a/zenith_utils/src/http/endpoint.rs +++ b/zenith_utils/src/http/endpoint.rs @@ -12,8 +12,17 @@ use std::net::TcpListener; use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter}; use zenith_metrics::{Encoder, TextEncoder}; +use std::sync::Mutex; +use tokio::sync::oneshot::Sender; + use super::error::ApiError; +lazy_static! { + /// Channel used to send shutdown signal - wrapped in an Option to allow + /// it to be taken by value (since oneshot channels consume themselves on send) + static ref SHUTDOWN_SENDER: Mutex>> = Mutex::new(None); +} + lazy_static! 
{ static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!( new_common_metric_name("serve_metrics_count"), @@ -143,11 +152,18 @@ pub fn check_permission(req: &Request, tenantid: Option) -> Res } } +// Send shutdown signal +pub fn shutdown() { + if let Some(tx) = SHUTDOWN_SENDER.lock().unwrap().take() { + let _ = tx.send(()); + } +} + pub fn serve_thread_main( router_builder: RouterBuilder, listener: TcpListener, ) -> anyhow::Result<()> { - log::info!("Starting a http endoint at {}", listener.local_addr()?); + log::info!("Starting a http endpoint at {}", listener.local_addr()?); // Create a Service from the router above to handle incoming requests. let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap(); @@ -159,7 +175,14 @@ pub fn serve_thread_main( let _guard = runtime.enter(); - let server = Server::from_tcp(listener)?.serve(service); + let (send, recv) = tokio::sync::oneshot::channel::<()>(); + *SHUTDOWN_SENDER.lock().unwrap() = Some(send); + + let server = Server::from_tcp(listener)? 
+ .serve(service) + .with_graceful_shutdown(async { + recv.await.ok(); + }); runtime.block_on(server)?; diff --git a/zenith_utils/src/http/mod.rs b/zenith_utils/src/http/mod.rs index b6740ad543..16b7e87721 100644 --- a/zenith_utils/src/http/mod.rs +++ b/zenith_utils/src/http/mod.rs @@ -1,3 +1,4 @@ pub mod endpoint; pub mod error; pub mod json; +pub mod request; diff --git a/zenith_utils/src/http/request.rs b/zenith_utils/src/http/request.rs new file mode 100644 index 0000000000..3bc8993c26 --- /dev/null +++ b/zenith_utils/src/http/request.rs @@ -0,0 +1,33 @@ +use std::str::FromStr; + +use super::error::ApiError; +use hyper::{Body, Request}; +use routerify::ext::RequestExt; + +pub fn get_request_param<'a>( + request: &'a Request, + param_name: &str, +) -> Result<&'a str, ApiError> { + match request.param(param_name) { + Some(arg) => Ok(arg), + None => { + return Err(ApiError::BadRequest(format!( + "no {} specified in path param", + param_name + ))) + } + } +} + +pub fn parse_request_param( + request: &Request, + param_name: &str, +) -> Result { + match get_request_param(request, param_name)?.parse() { + Ok(v) => Ok(v), + Err(_) => Err(ApiError::BadRequest(format!( + "failed to parse {}", + param_name + ))), + } +} diff --git a/zenith_utils/src/lib.rs b/zenith_utils/src/lib.rs index ca26be5df2..96b3cf5066 100644 --- a/zenith_utils/src/lib.rs +++ b/zenith_utils/src/lib.rs @@ -8,6 +8,9 @@ pub mod lsn; /// SeqWait allows waiting for a future sequence number to arrive pub mod seqwait; +/// append only ordered map implemented with a Vec +pub mod vec_map; + // Async version of SeqWait. Currently unused. 
// pub mod seqwait_async; diff --git a/zenith_utils/src/logging.rs b/zenith_utils/src/logging.rs index c6ed35cbf4..53dbfc305d 100644 --- a/zenith_utils/src/logging.rs +++ b/zenith_utils/src/logging.rs @@ -1,4 +1,3 @@ -use slog::{Drain, Level}; use std::{ fs::{File, OpenOptions}, path::Path, @@ -6,10 +5,12 @@ use std::{ use anyhow::{Context, Result}; -pub fn init( - log_filename: impl AsRef, - daemonize: bool, -) -> Result<(slog_scope::GlobalLoggerGuard, File)> { +use tracing::subscriber::set_global_default; +use tracing_log::LogTracer; +use tracing_subscriber::fmt; +use tracing_subscriber::{layer::SubscriberExt, EnvFilter, Registry}; + +pub fn init(log_filename: impl AsRef, daemonize: bool) -> Result { // Don't open the same file for output multiple times; // the different fds could overwrite each other's output. let log_file = OpenOptions::new() @@ -18,30 +19,38 @@ pub fn init( .open(&log_filename) .with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?; + let default_filter_str = "info"; + + // We fall back to printing all spans at info-level or above if + // the RUST_LOG environment variable is not set. + let env_filter = + EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(default_filter_str)); + // we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it // if we do not use daemonization (e.g. 
in docker) it is better to log to stdout directly // for example to be in line with docker log command which expects logs comimg from stdout - let guard = if daemonize { - let decorator = slog_term::PlainSyncDecorator::new(log_file.try_clone()?); - let drain = slog_term::FullFormat::new(decorator) - .build() - .filter_level(Level::Info) - .fuse(); - let logger = slog::Logger::root(drain, slog::o!()); - slog_scope::set_global_logger(logger) + // + // TODO: perhaps use a more human-readable format when !daemonize + if daemonize { + let x = log_file.try_clone()?; + + let fmt_layer = fmt::layer() + .pretty() + .with_target(false) // don't include event targets + .with_ansi(false) // don't use colors in log file + .with_writer(move || x.try_clone().unwrap()); + let subscriber = Registry::default().with(env_filter).with(fmt_layer); + + set_global_default(subscriber).expect("Failed to set subscriber"); } else { - let decorator = slog_term::TermDecorator::new().build(); - let drain = slog_term::FullFormat::new(decorator) - .build() - .filter_level(Level::Info) - .fuse(); - let drain = slog_async::Async::new(drain).chan_size(1000).build().fuse(); - let logger = slog::Logger::root(drain, slog::o!()); - slog_scope::set_global_logger(logger) - }; + let fmt_layer = fmt::layer().with_target(false); // don't include event targets + let subscriber = Registry::default().with(env_filter).with(fmt_layer); - // initialise forwarding of std log calls - slog_stdlog::init()?; + set_global_default(subscriber).expect("Failed to set subscriber"); + } - Ok((guard, log_file)) + // Redirect all `log`'s events to our subscriber + LogTracer::init().expect("Failed to set logger"); + + Ok(log_file) } diff --git a/zenith_utils/src/postgres_backend.rs b/zenith_utils/src/postgres_backend.rs index b2e0a1a525..02eb330f3b 100644 --- a/zenith_utils/src/postgres_backend.rs +++ b/zenith_utils/src/postgres_backend.rs @@ -13,7 +13,11 @@ use serde::{Deserialize, Serialize}; use std::io::{self, Write}; 
use std::net::{Shutdown, SocketAddr, TcpStream}; use std::str::FromStr; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; +use std::time::Duration; + +static PGBACKEND_SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false); pub trait Handler { /// Handle single query. @@ -135,13 +139,32 @@ pub fn query_from_cstring(query_string: Bytes) -> Vec { query_string } +// Helper function for socket read loops +pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool { + for cause in error.chain() { + if let Some(io_error) = cause.downcast_ref::() { + if io_error.kind() == std::io::ErrorKind::WouldBlock { + return true; + } + } + } + false +} + impl PostgresBackend { pub fn new( socket: TcpStream, auth_type: AuthType, tls_config: Option>, + set_read_timeout: bool, ) -> io::Result { let peer_addr = socket.peer_addr()?; + if set_read_timeout { + socket + .set_read_timeout(Some(Duration::from_secs(5))) + .unwrap(); + } + Ok(Self { stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))), buf_out: BytesMut::with_capacity(10 * 1024), @@ -229,12 +252,26 @@ impl PostgresBackend { let mut unnamed_query_string = Bytes::new(); - while let Some(msg) = self.read_message()? { - trace!("got message {:?}", msg); + while !PGBACKEND_SHUTDOWN_REQUESTED.load(Ordering::Relaxed) { + match self.read_message() { + Ok(message) => { + if let Some(msg) = message { + trace!("got message {:?}", msg); - match self.process_message(handler, msg, &mut unnamed_query_string)? { - ProcessMsgResult::Continue => continue, - ProcessMsgResult::Break => break, + match self.process_message(handler, msg, &mut unnamed_query_string)? 
{ + ProcessMsgResult::Continue => continue, + ProcessMsgResult::Break => break, + } + } else { + break; + } + } + Err(e) => { + // If it is a timeout error, continue the loop + if !is_socket_read_timed_out(&e) { + return Err(e); + } + } } } @@ -427,3 +464,8 @@ impl PostgresBackend { Ok(ProcessMsgResult::Continue) } } + +// Set the flag to inform connections to cancel +pub fn set_pgbackend_shutdown_requested() { + PGBACKEND_SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed); +} diff --git a/zenith_utils/src/pq_proto.rs b/zenith_utils/src/pq_proto.rs index 12e08737bf..1941784332 100644 --- a/zenith_utils/src/pq_proto.rs +++ b/zenith_utils/src/pq_proto.rs @@ -15,8 +15,9 @@ use std::str; pub type Oid = u32; pub type SystemId = u64; -pub const TEXT_OID: Oid = 25; pub const INT8_OID: Oid = 20; +pub const INT4_OID: Oid = 23; +pub const TEXT_OID: Oid = 25; #[derive(Debug)] pub enum FeMessage { diff --git a/zenith_utils/src/vec_map.rs b/zenith_utils/src/vec_map.rs new file mode 100644 index 0000000000..4e2c827b47 --- /dev/null +++ b/zenith_utils/src/vec_map.rs @@ -0,0 +1,293 @@ +use std::{cmp::Ordering, ops::RangeBounds}; + +use serde::{Deserialize, Serialize}; + +/// Ordered map datastructure implemented in a Vec. +/// Append only - can only add keys that are larger than the +/// current max key. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct VecMap(Vec<(K, V)>); + +impl Default for VecMap { + fn default() -> Self { + VecMap(Default::default()) + } +} + +#[derive(Debug)] +pub struct InvalidKey; + +impl VecMap { + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn as_slice(&self) -> &[(K, V)] { + self.0.as_slice() + } + + /// This function may panic if given a range where the lower bound is + /// greater than the upper bound. 
+ pub fn slice_range>(&self, range: R) -> &[(K, V)] { + use std::ops::Bound::*; + + let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key); + + let start_idx = match range.start_bound() { + Unbounded => 0, + Included(k) => binary_search(k).unwrap_or_else(std::convert::identity), + Excluded(k) => match binary_search(k) { + Ok(idx) => idx + 1, + Err(idx) => idx, + }, + }; + + let end_idx = match range.end_bound() { + Unbounded => self.0.len(), + Included(k) => match binary_search(k) { + Ok(idx) => idx + 1, + Err(idx) => idx, + }, + Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity), + }; + + &self.0[start_idx..end_idx] + } + + /// Add a key value pair to the map. + /// If `key` is less than or equal to the current maximum key + /// the pair will not be added and InvalidKey error will be returned. + pub fn append(&mut self, key: K, value: V) -> Result<(), InvalidKey> { + if let Some((last_key, _last_value)) = self.0.last() { + if &key <= last_key { + return Err(InvalidKey); + } + } + + self.0.push((key, value)); + Ok(()) + } + + /// Update the maximum key value pair or add a new key value pair to the map. + /// If `key` is less than the current maximum key no updates or additions + /// will occur and InvalidKey error will be returned. + pub fn append_or_update_last(&mut self, key: K, mut value: V) -> Result, InvalidKey> { + if let Some((last_key, last_value)) = self.0.last_mut() { + match key.cmp(last_key) { + Ordering::Less => return Err(InvalidKey), + Ordering::Equal => { + std::mem::swap(last_value, &mut value); + return Ok(Some(value)); + } + Ordering::Greater => {} + } + } + + self.0.push((key, value)); + Ok(None) + } + + /// Split the map into two. + /// + /// The left map contains everything before `cutoff` (exclusive). + /// Right map contains `cutoff` and everything after (inclusive). 
+ pub fn split_at(&self, cutoff: &K) -> (Self, Self) + where + K: Clone, + V: Clone, + { + let split_idx = self + .0 + .binary_search_by_key(&cutoff, extract_key) + .unwrap_or_else(std::convert::identity); + + ( + VecMap(self.0[..split_idx].to_vec()), + VecMap(self.0[split_idx..].to_vec()), + ) + } + + /// Move items from `other` to the end of `self`, leaving `other` empty. + /// If any key in `other` is less than or equal to any key in `self`, + /// `InvalidKey` error will be returned and no mutation will occur. + pub fn extend(&mut self, other: &mut Self) -> Result<(), InvalidKey> { + let self_last_opt = self.0.last().map(extract_key); + let other_first_opt = other.0.first().map(extract_key); + + if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) { + if self_last >= other_first { + return Err(InvalidKey); + } + } + + self.0.append(&mut other.0); + + Ok(()) + } +} + +fn extract_key(entry: &(K, V)) -> &K { + &entry.0 +} + +#[cfg(test)] +mod tests { + use std::{collections::BTreeMap, ops::Bound}; + + use super::VecMap; + + #[test] + fn unbounded_range() { + let mut vec = VecMap::default(); + vec.append(0, ()).unwrap(); + + assert_eq!(vec.slice_range(0..0), &[]); + } + + #[test] + #[should_panic] + fn invalid_ordering_range() { + let mut vec = VecMap::default(); + vec.append(0, ()).unwrap(); + + #[allow(clippy::reversed_empty_ranges)] + vec.slice_range(1..0); + } + + #[test] + fn range_tests() { + let mut vec = VecMap::default(); + vec.append(0, ()).unwrap(); + vec.append(2, ()).unwrap(); + vec.append(4, ()).unwrap(); + + assert_eq!(vec.slice_range(0..0), &[]); + assert_eq!(vec.slice_range(0..1), &[(0, ())]); + assert_eq!(vec.slice_range(0..2), &[(0, ())]); + assert_eq!(vec.slice_range(0..3), &[(0, ()), (2, ())]); + + assert_eq!(vec.slice_range(..0), &[]); + assert_eq!(vec.slice_range(..1), &[(0, ())]); + + assert_eq!(vec.slice_range(..3), &[(0, ()), (2, ())]); + assert_eq!(vec.slice_range(..3), &[(0, ()), (2, ())]); + + 
assert_eq!(vec.slice_range(0..=0), &[(0, ())]); + assert_eq!(vec.slice_range(0..=1), &[(0, ())]); + assert_eq!(vec.slice_range(0..=2), &[(0, ()), (2, ())]); + assert_eq!(vec.slice_range(0..=3), &[(0, ()), (2, ())]); + + assert_eq!(vec.slice_range(..=0), &[(0, ())]); + assert_eq!(vec.slice_range(..=1), &[(0, ())]); + assert_eq!(vec.slice_range(..=2), &[(0, ()), (2, ())]); + assert_eq!(vec.slice_range(..=3), &[(0, ()), (2, ())]); + } + + struct BoundIter { + min: i32, + max: i32, + + next: Option>, + } + + impl BoundIter { + fn new(min: i32, max: i32) -> Self { + Self { + min, + max, + + next: Some(Bound::Unbounded), + } + } + } + + impl Iterator for BoundIter { + type Item = Bound; + + fn next(&mut self) -> Option { + let cur = self.next?; + + self.next = match &cur { + Bound::Unbounded => Some(Bound::Included(self.min)), + Bound::Included(x) => { + if *x >= self.max { + Some(Bound::Excluded(self.min)) + } else { + Some(Bound::Included(x + 1)) + } + } + Bound::Excluded(x) => { + if *x >= self.max { + None + } else { + Some(Bound::Excluded(x + 1)) + } + } + }; + + Some(cur) + } + } + + #[test] + fn range_exhaustive() { + let map: BTreeMap = (1..=7).step_by(2).map(|x| (x, ())).collect(); + let mut vec = VecMap::default(); + for &key in map.keys() { + vec.append(key, ()).unwrap(); + } + + const RANGE_MIN: i32 = 0; + const RANGE_MAX: i32 = 8; + for lower_bound in BoundIter::new(RANGE_MIN, RANGE_MAX) { + let ub_min = match lower_bound { + Bound::Unbounded => RANGE_MIN, + Bound::Included(x) => x, + Bound::Excluded(x) => x + 1, + }; + for upper_bound in BoundIter::new(ub_min, RANGE_MAX) { + let map_range: Vec<(i32, ())> = map + .range((lower_bound, upper_bound)) + .map(|(&x, _)| (x, ())) + .collect(); + let vec_slice = vec.slice_range((lower_bound, upper_bound)); + + assert_eq!(map_range, vec_slice); + } + } + } + + #[test] + fn extend() { + let mut left = VecMap::default(); + left.append(0, ()).unwrap(); + assert_eq!(left.as_slice(), &[(0, ())]); + + let mut empty = 
VecMap::default(); + left.extend(&mut empty).unwrap(); + assert_eq!(left.as_slice(), &[(0, ())]); + assert_eq!(empty.as_slice(), &[]); + + let mut right = VecMap::default(); + right.append(1, ()).unwrap(); + + left.extend(&mut right).unwrap(); + + assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); + assert_eq!(right.as_slice(), &[]); + + let mut zero_map = VecMap::default(); + zero_map.append(0, ()).unwrap(); + + left.extend(&mut zero_map).unwrap_err(); + assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); + assert_eq!(zero_map.as_slice(), &[(0, ())]); + + let mut one_map = VecMap::default(); + one_map.append(1, ()).unwrap(); + + left.extend(&mut one_map).unwrap_err(); + assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); + assert_eq!(one_map.as_slice(), &[(1, ())]); + } +} diff --git a/zenith_utils/tests/ssl_test.rs b/zenith_utils/tests/ssl_test.rs index ba0f63d6ec..2a597700ae 100644 --- a/zenith_utils/tests/ssl_test.rs +++ b/zenith_utils/tests/ssl_test.rs @@ -110,7 +110,7 @@ fn ssl() { .unwrap(); let tls_config = Some(Arc::new(cfg)); - let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config).unwrap(); + let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap(); pgb.run(&mut handler).unwrap(); assert!(handler.got_query); @@ -150,7 +150,7 @@ fn no_ssl() { let mut handler = TestHandler; - let pgb = PostgresBackend::new(server_sock, AuthType::Trust, None).unwrap(); + let pgb = PostgresBackend::new(server_sock, AuthType::Trust, None, true).unwrap(); pgb.run(&mut handler).unwrap(); client_jh.join().unwrap(); @@ -214,7 +214,7 @@ fn server_forces_ssl() { .unwrap(); let tls_config = Some(Arc::new(cfg)); - let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config).unwrap(); + let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap(); let res = pgb.run(&mut handler).unwrap_err(); assert_eq!("client did not connect with TLS", format!("{}", res));