Compare commits

..

27 Commits

Author SHA1 Message Date
Bojan Serafimov
d4585bcb67 Fix typo 2022-05-30 16:49:38 -04:00
Bojan Serafimov
ae7b7bcf7c Run multiple times 2022-05-30 16:21:43 -04:00
Bojan Serafimov
894721c204 check more often 2022-05-30 16:17:48 -04:00
Bojan Serafimov
d7bb3d14df WIP 2022-05-30 15:39:08 -04:00
chaitanya sharma
c584d90bb9 initial commit, renamed znodeid to nodeid. 2022-05-25 20:11:26 +03:00
Heikki Linnakangas
7997fc2932 Fix error handling with 'basebackup' command.
If the 'basebackup' command failed in the middle of building the tar
archive, the client would not report the error, but would attempt to
start up postgres with the partial contents of the data directory.
That fails because the control file is missing (it's added to the
archive last, precisely to make sure that you cannot start postgres
from a partial archive). But the client doesn't see the proper error
message that caused the basebackup to fail in the server, which is
confusing.

Two issues conspired to cause that:

1. The tar::Builder object that we use in the pageserver to construct
the tar stream has a Drop handler that automatically writes a valid
end-of-archive marker on drop. Because of that, the resulting tarball
looks complete, even if an error happens while we're building it. The
pageserver does send an ErrorResponse after the seemingly-valid
tarball, but:

2. The client stops reading the Copy stream, as soon as it sees the
tar end-of-archive marker. Therefore, it doesn't read the
ErrorResponse that comes after it.

We have two clients that call 'basebackup', one in `control_plane`
used by the `neon_local` binary, and another one in
`compute_tools`. Both had the same issue.

This PR fixes both issues, even though fixing either one would be
enough to fix the problem at hand. The pageserver now doesn't send the
end-of-archive marker on error, and the client now reads the copy
stream to the end, even if it sees an end-of-archive marker.

Fixes github issue #1715

In passing, change Basebackup to use a generic Write rather than
'dyn'.
2022-05-25 18:14:44 +03:00
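
A minimal sketch of the client-side half of the fix, matching the `compute_tools` and `control_plane` diffs further down; the wrapper function and its arguments are illustrative only:

```rust
use std::io::Read;
use std::path::Path;

// Illustrative wrapper: `copyreader` is any Read over the CopyOut stream.
fn unpack_basebackup(copyreader: impl Read, pgdata: &Path) -> anyhow::Result<()> {
    let mut ar = tar::Archive::new(copyreader);
    // Keep reading past the end-of-archive marker (two all-zero 512-byte
    // blocks), so an ErrorResponse sent after the tarball is still seen.
    ar.set_ignore_zeros(true);
    ar.unpack(pgdata)?;
    Ok(())
}
```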
Heikki Linnakangas
24d2313d0b Set --quota-backend-bytes when launching etcd in tests.
By default, etcd makes a huge 10 GB mmap() allocation when it starts up.
It doesn't actually use that much memory, just address space, but
it caused me grief when I tried to use 'rr' to debug a python test run.
Apparently, when you replay the 'rr' trace, it does allocate memory for
all that address space.

The size of the initial mmap depends on the --quota-backend-bytes setting.
Our etcd clusters are very small, so let's set --quota-backend-bytes to
keep the virtual memory size small, to make debugging with 'rr' easier.

See https://github.com/etcd-io/etcd/issues/7910 and
5e4b008106
2022-05-25 16:57:45 +03:00
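
A minimal sketch of how the flag is passed when spawning etcd, mirroring the `start_etcd_process` diff below; the binary path and URLs are placeholders:

```rust
use std::process::{Child, Command};

// Spawn etcd with a small backend quota so its initial mmap (and thus the
// process's virtual memory size) stays small. Paths and URLs are placeholders.
fn spawn_etcd(etcd_binary: &str, data_dir: &str, client_urls: &str) -> std::io::Result<Child> {
    Command::new(etcd_binary)
        .arg(format!("--data-dir={data_dir}"))
        .arg(format!("--listen-client-urls={client_urls}"))
        .arg(format!("--advertise-client-urls={client_urls}"))
        // See https://github.com/etcd-io/etcd/issues/7910
        .arg("--quota-backend-bytes=100000000")
        .spawn()
}
```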
Andrey Taranik
9ab52e2186 helm repository name fix for production proxy deploy (#1790) 2022-05-25 15:41:18 +03:00
Heikki Linnakangas
6f1f33ef42 Improve error messages on seccomp loading errors.
Bump vendor/postgres for https://github.com/neondatabase/postgres/pull/166
2022-05-25 14:33:06 +03:00
Andrey Taranik
703f691df8 production inventory update (#1779) 2022-05-25 14:30:50 +03:00
Arseny Sher
2b265fd6dc Disable restart_after_crash in neon_local.
It is pointless when the basebackup is invalid.
2022-05-25 14:48:11 +04:00
Sergey Melnikov
d32b491a53 Add zenith-us-stage-sk-6 to deploy (#1728) 2022-05-25 10:31:10 +03:00
Kirill Bulatov
541ec25875 Properly shutdown test mock S3 server 2022-05-24 19:09:31 +03:00
KlimentSerafimov
8346aa3a29 Potential fix to #1626. Fixed typo in Makefile. (#1781)
* Potential fix to #1626. Fixed typo in Makefile.
* Completed fix to #1626.

Summary:
changed 'error' to 'bail' in start_pageserver and start_safekeeper.
2022-05-24 04:55:38 -04:00
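
A toy illustration of the difference, not the actual neon_local code: `error!` only logs the failure and lets startup continue in a broken state, while `bail!` turns it into an `Err` that aborts startup.

```rust
use anyhow::{bail, Result};

// Hypothetical startup step used only to illustrate error! vs bail!.
fn start_component(started_ok: bool) -> Result<()> {
    if !started_ok {
        // Before: error!("component failed to start");  // logged, then ignored
        bail!("component failed to start"); // propagated, startup aborts
    }
    Ok(())
}
```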
Heikki Linnakangas
2aceb6a309 Fix garbage collection to not remove image layers that are still needed.
The logic would incorrectly remove an image layer if a newer image layer
existed, even though the older image layer was still needed by some
delta layers after it. See the example given in the comment this commit adds.

Without this fix, I was getting a lot of "could not find data for key
010000000000000000000000000000000000" errors from GC, with the new test
case being added in PR #1735.

Fixes #707
2022-05-23 20:58:27 +03:00
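
A toy model of the corrected retention rule, using made-up `Layer` and LSN types rather than the pageserver's real layer map: an old image layer may be dropped only if a newer image exists below the GC cutoff and no retained delta layer between the two still needs the old image as its base.

```rust
// Toy types; the real pageserver layer map also tracks key ranges.
struct Layer { lsn: u64, is_image: bool }

fn can_gc_image(old_image_lsn: u64, layers: &[Layer], gc_cutoff: u64) -> bool {
    // Find the closest newer image layer that reads could use instead.
    let replacement = layers
        .iter()
        .filter(|l| l.is_image && l.lsn > old_image_lsn && l.lsn <= gc_cutoff)
        .map(|l| l.lsn)
        .min();
    match replacement {
        None => false,
        // The old image is still needed if any delta layer sits between it
        // and the replacement image: reconstructing pages covered by that
        // delta starts from the old image.
        Some(new_image_lsn) => !layers
            .iter()
            .any(|l| !l.is_image && l.lsn > old_image_lsn && l.lsn < new_image_lsn),
    }
}

fn main() {
    let layers = vec![
        Layer { lsn: 10, is_image: true },
        Layer { lsn: 20, is_image: false }, // delta that still needs the lsn=10 image
        Layer { lsn: 30, is_image: true },
    ];
    assert!(!can_gc_image(10, &layers, 40)); // the delta at lsn=20 keeps it alive
}
```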
KlimentSerafimov
3ff5caf786 Add to readme install protobuf etcd (#1777)
* Update installation instructions
* Added libprotobuf-dev etcd to apt install

Added "brew install protobuf etcd" to OSX installation instructions.
Added "sudo apt install libprotobuf-dev etcd" to Linux installation instructions.
Without these, cargo build complains. 
Figured out in collaboration with Bojan.
2022-05-23 13:11:59 -04:00
chaitanya sharma
fbedd535c0 Replace a bunch of zenith references with neon. 2022-05-23 13:16:00 +03:00
Egor Suvorov
89e5659f3f Replace COPYRIGHT file from the root with NOTICE file
The primary reason: make GitHub detect that we use Apache License 2.0.
GitHub does this via the https://github.com/licensee/licensee Ruby library (gem).

Our COPYRIGHT file contains the part of the Apache License that is meant to
be added to a source file, rather than the license or copyright information
itself, which confuses the library.

Instead, the recommended way is to create a NOTICE file which references the
license of the code and its bundled dependencies.
2022-05-23 01:03:03 +02:00
Egor Suvorov
ef7cdb13e2 Remove unused dependencies from poetry.lock via poetry lock --no-update
There were a bunch of dependencies for Python <3.9. They are not needed
after #1254. This commit makes it easier to add/remove dependencies because
the lock file will be updated like this on any such operation.

Do not update dependencies yet, to avoid breaking anything.
2022-05-21 12:21:45 +02:00
Egor Suvorov
73187bfef1 postgres_ffi: find_end_of_wal_segment: clarify code around xl_crc retrieval
It would be better not to update xl_crc/rec_hdr at all when skipping a contrecord,
but I would prefer to keep PR #1574 small.
A better audit of `find_end_of_wal_segment` is coming anyway in #544.
2022-05-21 05:25:17 +02:00
Egor Suvorov
967eb38e81 postgres_ffi: find_end_of_wal_segment: fix contrecord skipping
Also enable the corresponding test.
2022-05-21 05:25:17 +02:00
Egor Suvorov
a124e44866 postgres_ffi: find_end_of_wal_segment: add lots of trace 2022-05-21 05:25:17 +02:00
Egor Suvorov
c4b77084af utils: add const_assert! macro 2022-05-21 05:25:17 +02:00
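
A small usage sketch of the macro (its definition appears in the `utils` diff below); the constant mirrors the alignment check added in `xlog_utils.rs`, with a concrete value chosen only for illustration.

```rust
// Same definition as the one added to utils; the assertion is evaluated at
// compile time and has no runtime cost.
macro_rules! const_assert {
    ($($args:tt)*) => {
        const _: () = assert!($($args)*);
    };
}

// Illustrative value: xl_crc sits at byte offset 20 of XLogRecord.
const XLOG_RECORD_CRC_OFFS: usize = 20;
const_assert!(XLOG_RECORD_CRC_OFFS % 8 == 4);

fn main() {}
```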
Egor Suvorov
c9efdec8db postgres_ffi: find_end_of_wal_segment: improve name of wal_crc variable
Now it reflects the field it's mirroring.
2022-05-21 05:25:17 +02:00
Egor Suvorov
12b7c793b3 postgres_ffi: find_end_of_wal_segment: remove redundant CRC operations
Previous invariant: `crc` contains an "unfinalized" CRC32 value,
its one's complement, like in postgres before FIN_CRC32C.

New invariant: `crc` always contains a "finalized" CRC32 value;
this is the semantics of crc32c_append, so we don't need to invert the CRC manually.
2022-05-21 05:25:17 +02:00
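
A minimal sketch of the new invariant, assuming the `crc32c` crate's `crc32c_append`: both its input and output are finalized CRC32C values, so a record's checksum can be accumulated chunk by chunk without inverting it by hand.

```rust
// Accumulate a finalized CRC32C over several chunks; crc32c_append handles
// the pre/post inversion internally, so no manual `!crc` is needed.
fn checksum_chunks(chunks: &[&[u8]]) -> u32 {
    let mut crc: u32 = 0; // finalized CRC of the empty input
    for chunk in chunks {
        crc = crc32c::crc32c_append(crc, chunk);
    }
    crc
}

fn main() {
    let whole = crc32c::crc32c(b"hello world");
    let parts = checksum_chunks(&[b"hello ".as_slice(), b"world".as_slice()]);
    assert_eq!(whole, parts);
}
```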
Egor Suvorov
3c6890bf1d postgres_ffi: add complex WAL tests for find_end_of_wal
* Actual generation logic is in a separate crate, `postgres_ffi/wal_generate`
* The crate also provides a binary for debugging purposes, akin to `initdb`
* Two tests currently fail and are ignored
* There is no easy way to test this directly in Safekeeper as it starts restoring from commit_lsn.
  So testing would require disconnecting Safekeeper just after it has received the WAL,
  but before it is committed.
2022-05-21 05:25:17 +02:00
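
A condensed sketch of how the new tests drive the `wal_generate` crate (compare the rewritten `test_end_of_wal` helper in the `postgres_ffi` diff below); the paths are placeholders.

```rust
use wal_generate::{generate_simple, Conf};

// Generate a small amount of WAL with a throwaway Postgres instance and
// return the expected end-of-WAL LSN. Paths are placeholders.
fn generate_test_wal() -> anyhow::Result<postgres::types::PgLsn> {
    let cfg = Conf {
        pg_distrib_dir: "tmp_install".into(),
        datadir: "test_output/wal_generate_example".into(),
    };
    cfg.initdb()?;
    let mut srv = cfg.start_server()?;
    let end_of_wal = generate_simple(&mut srv.connect_with_timeout()?)?;
    srv.kill();
    Ok(end_of_wal)
}
```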
Andrey Taranik
d97617ed3a updated proxy and proxy scram deployment for prod and stress environments (#1758) 2022-05-20 23:12:30 +03:00
48 changed files with 947 additions and 312 deletions

View File

@@ -1,5 +1,6 @@
[pageservers]
zenith-1-ps-1 console_region_id=1
#zenith-1-ps-1 console_region_id=1
zenith-1-ps-2 console_region_id=1
[safekeepers]
zenith-1-sk-1 console_region_id=1
@@ -15,4 +16,4 @@ console_mgmt_base_url = http://console-release.local
bucket_name = zenith-storage-oregon
bucket_region = us-west-2
etcd_endpoints = etcd-release.local:2379
safekeeper_enable_s3_offload = true
safekeeper_enable_s3_offload = false

View File

@@ -6,6 +6,7 @@ zenith-us-stage-ps-2 console_region_id=27
zenith-us-stage-sk-1 console_region_id=27
zenith-us-stage-sk-4 console_region_id=27
zenith-us-stage-sk-5 console_region_id=27
zenith-us-stage-sk-6 console_region_id=27
[storage:children]
pageservers

View File

@@ -293,7 +293,7 @@ jobs:
# `Too long with no output` error, if a test is running for a long time.
# In that case, tests should have internal timeouts that are less than
# no_output_timeout, specified here.
no_output_timeout: 10m
no_output_timeout: 1m
environment:
- ZENITH_BIN: /tmp/zenith/bin
- POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
@@ -354,6 +354,7 @@ jobs:
fi
fi
- run:
# TODO wait for processes to die in case of timeout?
# CircleCI artifacts are preserved one file at a time, so skipping
# this step isn't a good idea. If you want to extract the
# pageserver state, perhaps a tarball would be a better idea.
@@ -361,7 +362,7 @@ jobs:
when: always
command: |
du -sh /tmp/test_output/*
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "etcd.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
du -sh /tmp/test_output/*
- store_artifacts:
path: /tmp/test_output
@@ -640,7 +641,8 @@ jobs:
name: Re-deploy proxy
command: |
DOCKER_TAG=$(git log --oneline|wc -l)
helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
deploy-release:
docker:
@@ -684,12 +686,13 @@ jobs:
name: Setup helm v3
command: |
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
helm repo add zenithdb https://neondatabase.github.io/helm-charts
helm repo add neondatabase https://neondatabase.github.io/helm-charts
- run:
name: Re-deploy proxy
command: |
DOCKER_TAG="release-$(git log --oneline|wc -l)"
helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
# Trigger a new remote CI job
remote-ci-trigger:

View File

@@ -0,0 +1,26 @@
fullnameOverride: "neon-stress-proxy-scram"
settings:
authBackend: "console"
authEndpoint: "http://neon-stress-console.local/management/api/v2"
domain: "*.stress.neon.tech"
podLabels:
zenith_service: proxy-scram
zenith_env: staging
zenith_region: eu-west-1
zenith_region_slug: ireland
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech'
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack

View File

@@ -0,0 +1,24 @@
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.cloud.neon.tech"
podLabels:
zenith_service: proxy-scram
zenith_env: production
zenith_region: us-west-2
zenith_region_slug: oregon
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech'
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack

View File

@@ -1,9 +1,3 @@
# Helm chart values for zenith-proxy.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
uri: "https://console.neon.tech/psql_session/"
@@ -28,7 +22,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: start.zenith.tech,connect.neon.tech,pg.neon.tech
external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech
metrics:
enabled: true

View File

@@ -1,20 +0,0 @@
This software is licensed under the Apache 2.0 License:
----------------------------------------------------------------------------
Copyright 2021 Zenith Labs, Inc
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
----------------------------------------------------------------------------
The PostgreSQL submodule in vendor/postgres is licensed under the
PostgreSQL license. See vendor/postgres/COPYRIGHT.

15
Cargo.lock generated
View File

@@ -2047,15 +2047,18 @@ dependencies = [
"bytes",
"chrono",
"crc32c",
"env_logger",
"hex",
"lazy_static",
"log",
"memoffset",
"postgres",
"rand",
"regex",
"serde",
"thiserror",
"utils",
"wal_generate",
"workspace_hack",
]
@@ -3627,6 +3630,18 @@ version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "wal_generate"
version = "0.1.0"
dependencies = [
"anyhow",
"clap 3.0.14",
"env_logger",
"log",
"postgres",
"tempfile",
]
[[package]]
name = "walkdir"
version = "2.3.2"

View File

@@ -20,7 +20,13 @@ else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
PG_CFLAGS = -O0 -g3 $(CFLAGS)
else
$(error Bad build type `$(BUILD_TYPE)', see Makefile for options)
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
endif
# macOS with brew-installed openssl requires explicit paths
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
PG_CONFIGURE_OPTS += --with-includes=/usr/local/opt/openssl/include --with-libraries=/usr/local/opt/openssl/lib
endif
# Choose whether we should be silent or verbose

5
NOTICE Normal file
View File

@@ -0,0 +1,5 @@
Neon
Copyright 2022 Neon Inc.
The PostgreSQL submodule in vendor/postgres is licensed under the
PostgreSQL license. See vendor/postgres/COPYRIGHT.

View File

@@ -30,7 +30,7 @@ Pageserver consists of:
On Ubuntu or Debian this set of packages should be sufficient to build the code:
```text
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev
libssl-dev clang pkg-config libpq-dev libprotobuf-dev etcd
```
2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -52,9 +52,10 @@ make -j5
```
#### building on OSX (12.3.1)
1. Install XCode
1. Install XCode and dependencies
```
xcode-select --install
brew install protobuf etcd
```
2. [Install Rust](https://www.rust-lang.org/tools/install)

View File

@@ -146,8 +146,14 @@ impl ComputeNode {
_ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
};
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
let mut ar = tar::Archive::new(copyreader);
// Read the archive directly from the `CopyOutReader`
//
// Set `ignore_zeros` so that unpack() reads all the Copy data and
// doesn't stop at the end-of-archive marker. Otherwise, if the server
// sends an Error after finishing the tarball, we will not notice it.
let mut ar = tar::Archive::new(copyreader);
ar.set_ignore_zeros(true);
ar.unpack(&self.pgdata)?;
self.metrics.basebackup_ms.store(

View File

@@ -4,7 +4,7 @@ version = "0.1.0"
edition = "2021"
[dependencies]
tar = "0.4.33"
tar = "0.4.38"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
serde = { version = "1.0", features = ["derive"] }
serde_with = "1.12.0"

View File

@@ -231,8 +231,13 @@ impl PostgresNode {
.context("page server 'basebackup' command failed")?;
// Read the archive directly from the `CopyOutReader`
tar::Archive::new(copyreader)
.unpack(&self.pgdata())
//
// Set `ignore_zeros` so that unpack() reads all the Copy data and
// doesn't stop at the end-of-archive marker. Otherwise, if the server
// sends an Error after finishing the tarball, we will not notice it.
let mut ar = tar::Archive::new(copyreader);
ar.set_ignore_zeros(true);
ar.unpack(&self.pgdata())
.context("extracting base backup failed")?;
Ok(())
@@ -274,6 +279,8 @@ impl PostgresNode {
conf.append("listen_addresses", &self.address.ip().to_string());
conf.append("port", &self.address.port().to_string());
conf.append("wal_keep_size", "0");
// walproposer panics when basebackup is invalid, it is pointless to restart in this case.
conf.append("restart_after_crash", "off");
// Configure the node to fetch pages from pageserver
let pageserver_connstr = {

View File

@@ -48,6 +48,10 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
format!("--data-dir={}", etcd_data_dir.display()),
format!("--listen-client-urls={client_urls}"),
format!("--advertise-client-urls={client_urls}"),
// Set --quota-backend-bytes to keep the etcd virtual memory
// size smaller. Our test etcd clusters are very small.
// See https://github.com/etcd-io/etcd/issues/7910
"--quota-backend-bytes=100000000".to_string(),
])
.stdout(Stdio::from(etcd_stdout_file))
.stderr(Stdio::from(etcd_stderr_file))

View File

@@ -15,7 +15,7 @@ use std::process::{Command, Stdio};
use utils::{
auth::{encode_from_key_file, Claims, Scope},
postgres_backend::AuthType,
zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
};
use crate::safekeeper::SafekeeperNode;
@@ -136,7 +136,7 @@ impl EtcdBroker {
#[serde(default)]
pub struct PageServerConf {
// node id
pub id: ZNodeId,
pub id: NodeId,
// Pageserver connection settings
pub listen_pg_addr: String,
pub listen_http_addr: String,
@@ -151,7 +151,7 @@ pub struct PageServerConf {
impl Default for PageServerConf {
fn default() -> Self {
Self {
id: ZNodeId(0),
id: NodeId(0),
listen_pg_addr: String::new(),
listen_http_addr: String::new(),
auth_type: AuthType::Trust,
@@ -163,7 +163,7 @@ impl Default for PageServerConf {
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct SafekeeperConf {
pub id: ZNodeId,
pub id: NodeId,
pub pg_port: u16,
pub http_port: u16,
pub sync: bool,
@@ -172,7 +172,7 @@ pub struct SafekeeperConf {
impl Default for SafekeeperConf {
fn default() -> Self {
Self {
id: ZNodeId(0),
id: NodeId(0),
pg_port: 0,
http_port: 0,
sync: true,

View File

@@ -18,7 +18,7 @@ use thiserror::Error;
use utils::{
connstring::connection_address,
http::error::HttpErrorBody,
zid::{ZNodeId, ZTenantId, ZTimelineId},
zid::{NodeId, ZTenantId, ZTimelineId},
};
use crate::local_env::{LocalEnv, SafekeeperConf};
@@ -65,7 +65,7 @@ impl ResponseErrorMessageExt for Response {
//
#[derive(Debug)]
pub struct SafekeeperNode {
pub id: ZNodeId,
pub id: NodeId,
pub conf: SafekeeperConf,
@@ -100,7 +100,7 @@ impl SafekeeperNode {
.unwrap()
}
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf {
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
}
@@ -286,7 +286,7 @@ impl SafekeeperNode {
&self,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
peer_ids: Vec<ZNodeId>,
peer_ids: Vec<NodeId>,
) -> Result<()> {
Ok(self
.http_request(

View File

@@ -21,7 +21,7 @@ NOTE:It has nothing to do with PostgreSQL pg_basebackup.
### Branch
We can create branch at certain LSN using `zenith timeline branch` command.
We can create branch at certain LSN using `neon_local timeline branch` command.
Each Branch lives in a corresponding timeline[] and has an ancestor[].
@@ -91,7 +91,7 @@ The layer map tracks what layers exist in a timeline.
### Layered repository
Zenith repository implementation that keeps data in layers.
Neon repository implementation that keeps data in layers.
### LSN
The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log.
@@ -101,7 +101,7 @@ It is printed as two hexadecimal numbers of up to 8 digits each, separated by a
Check also [PostgreSQL doc about pg_lsn type](https://www.postgresql.org/docs/devel/datatype-pg-lsn.html)
Values can be compared to calculate the volume of WAL data that separates them, so they are used to measure the progress of replication and recovery.
In postgres and Zenith lsns are used to describe certain points in WAL handling.
In Postgres and Neon LSNs are used to describe certain points in WAL handling.
PostgreSQL LSNs and functions to monitor them:
* `pg_current_wal_insert_lsn()` - Returns the current write-ahead log insert location.
@@ -111,13 +111,13 @@ PostgreSQL LSNs and functions to monitor them:
* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
[source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):
Zenith safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
* `RestartLSN`: position in WAL confirmed by all safekeepers.
* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
* `VCL`: the largest LSN for which we can guarantee availability of all prior records.
Zenith pageserver LSNs:
Neon pageserver LSNs:
* `last_record_lsn` - the end of last processed WAL record.
* `disk_consistent_lsn` - data is known to be fully flushed and fsync'd to local disk on pageserver up to this LSN.
* `remote_consistent_lsn` - The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash.
@@ -132,7 +132,7 @@ This is the unit of data exchange between compute node and pageserver.
### Pageserver
Zenith storage engine: repositories + wal receiver + page service + wal redo.
Neon storage engine: repositories + wal receiver + page service + wal redo.
### Page service
@@ -184,10 +184,10 @@ relation exceeds that size, it is split into multiple segments.
SLRUs include pg_clog, pg_multixact/members, and
pg_multixact/offsets. There are other SLRUs in PostgreSQL, but
they don't need to be stored permanently (e.g. pg_subtrans),
or we do not support them in zenith yet (pg_commit_ts).
or we do not support them in neon yet (pg_commit_ts).
### Tenant (Multitenancy)
Tenant represents a single customer, interacting with Zenith.
Tenant represents a single customer, interacting with Neon.
Wal redo[] activity, timelines[], layers[] are managed for each tenant independently.
One pageserver[] can serve multiple tenants at once.
One safekeeper

View File

@@ -16,7 +16,7 @@ use tokio::{sync::mpsc, task::JoinHandle};
use tracing::*;
use utils::{
lsn::Lsn,
zid::{ZNodeId, ZTenantId, ZTenantTimelineId},
zid::{NodeId, ZTenantId, ZTenantTimelineId},
};
/// Default value to use for prefixing to all etcd keys with.
@@ -25,7 +25,7 @@ pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon";
#[derive(Debug, Deserialize, Serialize)]
struct SafekeeperTimeline {
safekeeper_id: ZNodeId,
safekeeper_id: NodeId,
info: SkTimelineInfo,
}
@@ -71,7 +71,7 @@ pub enum BrokerError {
/// A way to control the data retrieval from a certain subscription.
pub struct SkTimelineSubscription {
safekeeper_timeline_updates:
mpsc::UnboundedReceiver<HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>>>,
mpsc::UnboundedReceiver<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>>,
kind: SkTimelineSubscriptionKind,
watcher_handle: JoinHandle<Result<(), BrokerError>>,
watcher: Watcher,
@@ -81,7 +81,7 @@ impl SkTimelineSubscription {
/// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet.
pub async fn fetch_data(
&mut self,
) -> Option<HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>>> {
) -> Option<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>> {
self.safekeeper_timeline_updates.recv().await
}
@@ -221,7 +221,7 @@ pub async fn subscribe_to_safekeeper_timeline_updates(
break;
}
let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>> = HashMap::new();
let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>> = HashMap::new();
// Keep track that the timeline data updates from etcd arrive in the right order.
// https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas
// > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering.
@@ -299,18 +299,18 @@ fn parse_etcd_key_value(
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?,
),
ZNodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?),
NodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?),
),
SubscriptionKind::Tenant(tenant_id) => (
ZTenantTimelineId::new(
tenant_id,
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
),
ZNodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?),
NodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?),
),
SubscriptionKind::Timeline(zttid) => (
zttid,
ZNodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?),
NodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?),
),
};

View File

@@ -20,5 +20,10 @@ serde = { version = "1.0", features = ["derive"] }
utils = { path = "../utils" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[dev-dependencies]
env_logger = "0.9"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
wal_generate = { path = "wal_generate" }
[build-dependencies]
bindgen = "0.59.1"

View File

@@ -15,7 +15,7 @@ use crate::XLogPageHeaderData;
use crate::XLogRecord;
use crate::XLOG_PAGE_MAGIC;
use anyhow::bail;
use anyhow::{bail, ensure};
use byteorder::{ByteOrder, LittleEndian};
use bytes::BytesMut;
use bytes::{Buf, Bytes};
@@ -30,6 +30,7 @@ use std::path::{Path, PathBuf};
use std::time::SystemTime;
use utils::bin_ser::DeserializeError;
use utils::bin_ser::SerializeError;
use utils::const_assert;
use utils::lsn::Lsn;
pub const XLOG_FNAME_LEN: usize = 24;
@@ -149,8 +150,9 @@ fn find_end_of_wal_segment(
) -> anyhow::Result<u32> {
// step back to the beginning of the page to read it in...
let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ;
let mut skipping_first_contrecord: bool = false;
let mut contlen: usize = 0;
let mut wal_crc: u32 = 0;
let mut xl_crc: u32 = 0;
let mut crc: u32 = 0;
let mut rec_offs: usize = 0;
let mut buf = [0u8; XLOG_BLCKSZ];
@@ -158,11 +160,15 @@ fn find_end_of_wal_segment(
let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record
let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap();
file.seek(SeekFrom::Start(offs as u64))?;
// xl_crc is the last field in XLogRecord, will not be read into rec_hdr
const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD);
let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS];
trace!("find_end_of_wal_segment(data_dir={}, segno={}, tli={}, wal_seg_size={}, start_offset=0x{:x})", data_dir.display(), segno, tli, wal_seg_size, start_offset);
while offs < wal_seg_size {
// we are at the beginning of the page; read it in
if offs % XLOG_BLCKSZ == 0 {
trace!("offs=0x{:x}: new page", offs);
let bytes_read = file.read(&mut buf)?;
if bytes_read != buf.len() {
bail!(
@@ -176,30 +182,49 @@ fn find_end_of_wal_segment(
let xlp_magic = LittleEndian::read_u16(&buf[0..2]);
let xlp_info = LittleEndian::read_u16(&buf[2..4]);
let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]);
trace!(
" xlp_magic=0x{:x}, xlp_info=0x{:x}, xlp_rem_len={}",
xlp_magic,
xlp_info,
xlp_rem_len
);
// this is expected in current usage when valid WAL starts after page header
if xlp_magic != XLOG_PAGE_MAGIC as u16 {
trace!(
"invalid WAL file {}.partial magic {} at {:?}",
" invalid WAL file {}.partial magic {} at {:?}",
file_name,
xlp_magic,
Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)),
);
}
if offs == 0 {
offs = XLOG_SIZE_OF_XLOG_LONG_PHD;
offs += XLOG_SIZE_OF_XLOG_LONG_PHD;
if (xlp_info & XLP_FIRST_IS_CONTRECORD) != 0 {
offs += ((xlp_rem_len + 7) & !7) as usize;
trace!(" first record is contrecord");
skipping_first_contrecord = true;
contlen = xlp_rem_len as usize;
if offs < start_offset {
// Pre-condition failed: the beginning of the segment is unexpectedly corrupted.
ensure!(start_offset - offs >= contlen,
"start_offset is in the middle of the first record (which happens to be a contrecord), \
expected to be on a record boundary. Is beginning of the segment corrupted?");
contlen = 0;
// keep skipping_first_contrecord to avoid counting the contrecord as valid, we did not check it.
}
} else {
trace!(" first record is not contrecord");
}
} else {
offs += XLOG_SIZE_OF_XLOG_SHORT_PHD;
}
// ... and step forward again if asked
trace!(" skipped header to 0x{:x}", offs);
offs = max(offs, start_offset);
// beginning of the next record
} else if contlen == 0 {
let page_offs = offs % XLOG_BLCKSZ;
let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize;
trace!("offs=0x{:x}: new record, xl_tot_len={}", offs, xl_tot_len);
if xl_tot_len == 0 {
info!(
"find_end_of_wal_segment reached zeros at {:?}, last records ends at {:?}",
@@ -212,10 +237,25 @@ fn find_end_of_wal_segment(
);
break; // zeros, reached the end
}
last_valid_rec_pos = offs;
if skipping_first_contrecord {
skipping_first_contrecord = false;
trace!(" first contrecord has been just completed");
} else {
trace!(
" updating last_valid_rec_pos: 0x{:x} --> 0x{:x}",
last_valid_rec_pos,
offs
);
last_valid_rec_pos = offs;
}
offs += 4;
rec_offs = 4;
contlen = xl_tot_len - 4;
trace!(
" reading rec_hdr[0..4] <-- [0x{:x}; 0x{:x})",
page_offs,
page_offs + 4
);
rec_hdr[0..4].copy_from_slice(&buf[page_offs..page_offs + 4]);
} else {
// we're continuing a record, possibly from previous page.
@@ -224,42 +264,118 @@ fn find_end_of_wal_segment(
// read the rest of the record, or as much as fits on this page.
let n = min(contlen, pageleft);
// fill rec_hdr (header up to (but not including) xl_crc field)
trace!(
"offs=0x{:x}, record continuation, pageleft={}, contlen={}",
offs,
pageleft,
contlen
);
// fill rec_hdr header up to (but not including) xl_crc field
trace!(
" rec_offs={}, XLOG_RECORD_CRC_OFFS={}, XLOG_SIZE_OF_XLOG_RECORD={}",
rec_offs,
XLOG_RECORD_CRC_OFFS,
XLOG_SIZE_OF_XLOG_RECORD
);
if rec_offs < XLOG_RECORD_CRC_OFFS {
let len = min(XLOG_RECORD_CRC_OFFS - rec_offs, n);
trace!(
" reading rec_hdr[{}..{}] <-- [0x{:x}; 0x{:x})",
rec_offs,
rec_offs + len,
page_offs,
page_offs + len
);
rec_hdr[rec_offs..rec_offs + len].copy_from_slice(&buf[page_offs..page_offs + len]);
}
if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD {
let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS;
wal_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]);
// All records are aligned on 8-byte boundary, so their 8-byte frames
// cannot be split between pages. As xl_crc is the last field,
// its content is always on the same page.
const_assert!(XLOG_RECORD_CRC_OFFS % 8 == 4);
// We should always start reading aligned records even in incorrect WALs so if
// the condition is false it is likely a bug. However, it is localized somewhere
// in this function, hence we do not crash and just report failure instead.
ensure!(crc_offs % 8 == 4, "Record is not aligned properly (bug?)");
xl_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]);
trace!(
" reading xl_crc: [0x{:x}; 0x{:x}) = 0x{:x}",
crc_offs,
crc_offs + 4,
xl_crc
);
crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]);
} else {
crc ^= 0xFFFFFFFFu32;
trace!(
" initializing crc: [0x{:x}; 0x{:x}); crc = 0x{:x}",
crc_offs + 4,
page_offs + n,
crc
);
} else if rec_offs > XLOG_RECORD_CRC_OFFS {
// As all records are 8-byte aligned, the header is already fully read and `crc` is initialized in the branch above.
ensure!(rec_offs >= XLOG_SIZE_OF_XLOG_RECORD);
let old_crc = crc;
crc = crc32c_append(crc, &buf[page_offs..page_offs + n]);
trace!(
" appending to crc: [0x{:x}; 0x{:x}); 0x{:x} --> 0x{:x}",
page_offs,
page_offs + n,
old_crc,
crc
);
} else {
// Correct because of the way conditions are written above.
assert!(rec_offs + n < XLOG_SIZE_OF_XLOG_RECORD);
// If `skipping_first_contrecord == true`, we may be reading from a middle of a record
// which started in the previous segment. Hence there is no point in validating the header.
if !skipping_first_contrecord && rec_offs + n > XLOG_RECORD_CRC_OFFS {
info!(
"Curiously corrupted WAL: a record stops inside the header; \
offs=0x{:x}, record continuation, pageleft={}, contlen={}",
offs, pageleft, contlen
);
break;
}
// Do nothing: we are still reading the header. It's accounted in CRC in the end of the record.
}
crc = !crc;
rec_offs += n;
offs += n;
contlen -= n;
if contlen == 0 {
crc = !crc;
trace!(" record completed at 0x{:x}", offs);
crc = crc32c_append(crc, &rec_hdr);
offs = (offs + 7) & !7; // pad on 8 bytes boundary */
if crc == wal_crc {
trace!(
" padded offs to 0x{:x}, crc is {:x}, expected crc is {:x}",
offs,
crc,
xl_crc
);
if skipping_first_contrecord {
// do nothing, the flag will go down on next iteration when we're reading new record
trace!(" first conrecord has been just completed");
} else if crc == xl_crc {
// record is valid, advance the result to its end (with
// alignment to the next record taken into account)
trace!(
" updating last_valid_rec_pos: 0x{:x} --> 0x{:x}",
last_valid_rec_pos,
offs
);
last_valid_rec_pos = offs;
} else {
info!(
"CRC mismatch {} vs {} at {}",
crc, wal_crc, last_valid_rec_pos
crc, xl_crc, last_valid_rec_pos
);
break;
}
}
}
}
trace!("last_valid_rec_pos=0x{:x}", last_valid_rec_pos);
Ok(last_valid_rec_pos as u32)
}
@@ -476,78 +592,126 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, Seriali
mod tests {
use super::*;
use regex::Regex;
use std::{env, process::Command, str::FromStr};
use std::{env, str::FromStr};
// Run find_end_of_wal against file in test_wal dir
// Ensure that it finds last record correctly
#[test]
pub fn test_find_end_of_wal() {
// 1. Run initdb to generate some WAL
fn init_logging() {
let _ = env_logger::Builder::from_env(
env_logger::Env::default()
.default_filter_or("wal_generate=info,postgres_ffi::xlog_utils=trace"),
)
.is_test(true)
.try_init();
}
fn test_end_of_wal(
test_name: &str,
generate_wal: impl Fn(&mut postgres::Client) -> anyhow::Result<postgres::types::PgLsn>,
expected_end_of_wal_non_partial: Lsn,
last_segment: &str,
) {
use wal_generate::*;
// 1. Generate some WAL
let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("..")
.join("..");
let data_dir = top_path.join("test_output/test_find_end_of_wal");
let initdb_path = top_path.join("tmp_install/bin/initdb");
let lib_path = top_path.join("tmp_install/lib");
if data_dir.exists() {
fs::remove_dir_all(&data_dir).unwrap();
let cfg = Conf {
pg_distrib_dir: top_path.join("tmp_install"),
datadir: top_path.join(format!("test_output/{}", test_name)),
};
if cfg.datadir.exists() {
fs::remove_dir_all(&cfg.datadir).unwrap();
}
println!("Using initdb from '{}'", initdb_path.display());
println!("Data directory '{}'", data_dir.display());
let initdb_output = Command::new(initdb_path)
.args(&["-D", data_dir.to_str().unwrap()])
.arg("--no-instructions")
.arg("--no-sync")
.env_clear()
.env("LD_LIBRARY_PATH", &lib_path)
.env("DYLD_LIBRARY_PATH", &lib_path)
.output()
.unwrap();
assert!(
initdb_output.status.success(),
"initdb failed. Status: '{}', stdout: '{}', stderr: '{}'",
initdb_output.status,
String::from_utf8_lossy(&initdb_output.stdout),
String::from_utf8_lossy(&initdb_output.stderr),
);
cfg.initdb().unwrap();
let mut srv = cfg.start_server().unwrap();
let expected_wal_end: Lsn =
u64::from(generate_wal(&mut srv.connect_with_timeout().unwrap()).unwrap()).into();
srv.kill();
// 2. Pick WAL generated by initdb
let wal_dir = data_dir.join("pg_wal");
let wal_dir = cfg.datadir.join("pg_wal");
let wal_seg_size = 16 * 1024 * 1024;
// 3. Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap();
let wal_end = Lsn(wal_end);
println!("wal_end={}, tli={}", wal_end, tli);
assert_eq!(wal_end, "0/2000000".parse::<Lsn>().unwrap());
info!(
"find_end_of_wal returned (wal_end={}, tli={})",
wal_end, tli
);
assert_eq!(wal_end, expected_end_of_wal_non_partial);
// 4. Get the actual end of WAL by pg_waldump
let waldump_path = top_path.join("tmp_install/bin/pg_waldump");
let waldump_output = Command::new(waldump_path)
.arg(wal_dir.join("000000010000000000000001"))
.env_clear()
.env("LD_LIBRARY_PATH", &lib_path)
.env("DYLD_LIBRARY_PATH", &lib_path)
.output()
.unwrap();
let waldump_output = std::str::from_utf8(&waldump_output.stderr).unwrap();
println!("waldump_output = '{}'", &waldump_output);
let re = Regex::new(r"invalid record length at (.+):").unwrap();
let caps = re.captures(waldump_output).unwrap();
let waldump_output = cfg
.pg_waldump("000000010000000000000001", last_segment)
.unwrap()
.stderr;
let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
let caps = match Regex::new(r"invalid record length at (.+):")
.unwrap()
.captures(waldump_output)
{
Some(caps) => caps,
None => {
error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
panic!();
}
};
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
info!(
"waldump erred on {}, expected wal end at {}",
waldump_wal_end, expected_wal_end
);
assert_eq!(waldump_wal_end, expected_wal_end);
// 5. Rename file to partial to actually find last valid lsn
fs::rename(
wal_dir.join("000000010000000000000001"),
wal_dir.join("000000010000000000000001.partial"),
wal_dir.join(last_segment),
wal_dir.join(format!("{}.partial", last_segment)),
)
.unwrap();
let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap();
let wal_end = Lsn(wal_end);
println!("wal_end={}, tli={}", wal_end, tli);
info!(
"find_end_of_wal returned (wal_end={}, tli={})",
wal_end, tli
);
assert_eq!(wal_end, waldump_wal_end);
}
#[test]
pub fn test_find_end_of_wal_simple() {
init_logging();
test_end_of_wal(
"test_find_end_of_wal_simple",
wal_generate::generate_simple,
"0/2000000".parse::<Lsn>().unwrap(),
"000000010000000000000001",
);
}
#[test]
pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
init_logging();
test_end_of_wal(
"test_find_end_of_wal_crossing_segment_followed_by_small_one",
wal_generate::generate_wal_record_crossing_segment_followed_by_small_one,
"0/3000000".parse::<Lsn>().unwrap(),
"000000010000000000000002",
);
}
#[test]
#[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO
pub fn test_find_end_of_wal_last_crossing_segment() {
init_logging();
test_end_of_wal(
"test_find_end_of_wal_last_crossing_segment",
wal_generate::generate_last_wal_record_crossing_segment,
"0/3000000".parse::<Lsn>().unwrap(),
"000000010000000000000002",
);
}
/// Check the math in update_next_xid
///
/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,

View File

@@ -0,0 +1,14 @@
[package]
name = "wal_generate"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0"
clap = "3.0"
env_logger = "0.9"
log = "0.4"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
tempfile = "3.2"

View File

@@ -0,0 +1,58 @@
use anyhow::*;
use clap::{App, Arg};
use wal_generate::*;
fn main() -> Result<()> {
env_logger::Builder::from_env(
env_logger::Env::default().default_filter_or("wal_generate=info"),
)
.init();
let arg_matches = App::new("Postgres WAL generator")
.about("Generates Postgres databases with specific WAL properties")
.arg(
Arg::new("datadir")
.short('D')
.long("datadir")
.takes_value(true)
.help("Data directory for the Postgres server")
.required(true)
)
.arg(
Arg::new("pg-distrib-dir")
.long("pg-distrib-dir")
.takes_value(true)
.help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)")
.default_value("/usr/local")
)
.arg(
Arg::new("type")
.long("type")
.takes_value(true)
.help("Type of WAL to generate")
.possible_values(["simple", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one"])
.required(true)
)
.get_matches();
let cfg = Conf {
pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
datadir: arg_matches.value_of("datadir").unwrap().into(),
};
cfg.initdb()?;
let mut srv = cfg.start_server()?;
let lsn = match arg_matches.value_of("type").unwrap() {
"simple" => generate_simple(&mut srv.connect_with_timeout()?)?,
"last_wal_record_crossing_segment" => {
generate_last_wal_record_crossing_segment(&mut srv.connect_with_timeout()?)?
}
"wal_record_crossing_segment_followed_by_small_one" => {
generate_wal_record_crossing_segment_followed_by_small_one(
&mut srv.connect_with_timeout()?,
)?
}
a => panic!("Unknown --type argument: {}", a),
};
println!("end_of_wal = {}", lsn);
srv.kill();
Ok(())
}

View File

@@ -0,0 +1,278 @@
use anyhow::*;
use core::time::Duration;
use log::*;
use postgres::types::PgLsn;
use postgres::Client;
use std::cmp::Ordering;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::time::Instant;
use tempfile::{tempdir, TempDir};
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Conf {
pub pg_distrib_dir: PathBuf,
pub datadir: PathBuf,
}
pub struct PostgresServer {
process: std::process::Child,
_unix_socket_dir: TempDir,
client_config: postgres::Config,
}
impl Conf {
fn pg_bin_dir(&self) -> PathBuf {
self.pg_distrib_dir.join("bin")
}
fn pg_lib_dir(&self) -> PathBuf {
self.pg_distrib_dir.join("lib")
}
fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
let path = self.pg_bin_dir().join(command);
ensure!(path.exists(), "Command {:?} does not exist", path);
let mut cmd = Command::new(path);
cmd.env_clear()
.env("LD_LIBRARY_PATH", self.pg_lib_dir())
.env("DYLD_LIBRARY_PATH", self.pg_lib_dir());
Ok(cmd)
}
pub fn initdb(&self) -> Result<()> {
if let Some(parent) = self.datadir.parent() {
info!("Pre-creating parent directory {:?}", parent);
// Tests may be run concurrently and there may be a race to create `test_output/`.
// std::fs::create_dir_all is guaranteed to have no races with another thread creating directories.
std::fs::create_dir_all(parent)?;
}
info!(
"Running initdb in {:?} with user \"postgres\"",
self.datadir
);
let output = self
.new_pg_command("initdb")?
.arg("-D")
.arg(self.datadir.as_os_str())
.args(&["-U", "postgres", "--no-instructions", "--no-sync"])
.output()?;
debug!("initdb output: {:?}", output);
ensure!(
output.status.success(),
"initdb failed, stdout and stderr follow:\n{}{}",
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr),
);
Ok(())
}
pub fn start_server(&self) -> Result<PostgresServer> {
info!("Starting Postgres server in {:?}", self.datadir);
let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols)
let unix_socket_dir_path = unix_socket_dir.path().to_owned();
let server_process = self
.new_pg_command("postgres")?
.args(&["-c", "listen_addresses="])
.arg("-k")
.arg(unix_socket_dir_path.as_os_str())
.arg("-D")
.arg(self.datadir.as_os_str())
.args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed
.args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output
.args(&["-c", "shared_preload_libraries=zenith"]) // can only be loaded at startup
// Disable background processes as much as possible
.args(&["-c", "wal_writer_delay=10s"])
.args(&["-c", "autovacuum=off"])
.stderr(Stdio::null())
.spawn()?;
let server = PostgresServer {
process: server_process,
_unix_socket_dir: unix_socket_dir,
client_config: {
let mut c = postgres::Config::new();
c.host_path(&unix_socket_dir_path);
c.user("postgres");
c.connect_timeout(Duration::from_millis(1000));
c
},
};
Ok(server)
}
pub fn pg_waldump(
&self,
first_segment_name: &str,
last_segment_name: &str,
) -> Result<std::process::Output> {
let first_segment_file = self.datadir.join(first_segment_name);
let last_segment_file = self.datadir.join(last_segment_name);
info!(
"Running pg_waldump for {} .. {}",
first_segment_file.display(),
last_segment_file.display()
);
let output = self
.new_pg_command("pg_waldump")?
.args(&[
&first_segment_file.as_os_str(),
&last_segment_file.as_os_str(),
])
.output()?;
debug!("waldump output: {:?}", output);
Ok(output)
}
}
impl PostgresServer {
pub fn connect_with_timeout(&self) -> Result<Client> {
let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap();
while Instant::now() < retry_until {
use std::result::Result::Ok;
if let Ok(client) = self.client_config.connect(postgres::NoTls) {
return Ok(client);
}
std::thread::sleep(Duration::from_millis(100));
}
bail!("Connection timed out");
}
pub fn kill(&mut self) {
self.process.kill().unwrap();
self.process.wait().unwrap();
}
}
impl Drop for PostgresServer {
fn drop(&mut self) {
use std::result::Result::Ok;
match self.process.try_wait() {
Ok(Some(_)) => return,
Ok(None) => {
warn!("Server was not terminated, will be killed");
}
Err(e) => {
error!("Unable to get status of the server: {}, will be killed", e);
}
}
let _ = self.process.kill();
}
}
pub trait PostgresClientExt: postgres::GenericClient {
fn pg_current_wal_insert_lsn(&mut self) -> Result<PgLsn> {
Ok(self
.query_one("SELECT pg_current_wal_insert_lsn()", &[])?
.get(0))
}
fn pg_current_wal_flush_lsn(&mut self) -> Result<PgLsn> {
Ok(self
.query_one("SELECT pg_current_wal_flush_lsn()", &[])?
.get(0))
}
}
impl<C: postgres::GenericClient> PostgresClientExt for C {}
fn generate_internal<C: postgres::GenericClient>(
client: &mut C,
f: impl Fn(&mut C, PgLsn) -> Result<Option<PgLsn>>,
) -> Result<PgLsn> {
client.execute("create extension if not exists zenith_test_utils", &[])?;
let wal_segment_size = client.query_one(
"select cast(setting as bigint) as setting, unit \
from pg_settings where name = 'wal_segment_size'",
&[],
)?;
ensure!(
wal_segment_size.get::<_, String>("unit") == "B",
"Unexpected wal_segment_size unit"
);
ensure!(
wal_segment_size.get::<_, i64>("setting") == 16 * 1024 * 1024,
"Unexpected wal_segment_size in bytes"
);
let initial_lsn = client.pg_current_wal_insert_lsn()?;
info!("LSN initial = {}", initial_lsn);
let last_lsn = match f(client, initial_lsn)? {
None => client.pg_current_wal_insert_lsn()?,
Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
Ordering::Less => bail!("Some records were inserted after the generated WAL"),
Ordering::Equal => last_lsn,
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
},
};
// Some records may be not flushed, e.g. non-transactional logical messages.
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
Ordering::Less => bail!("Some records were flushed after the generated WAL"),
Ordering::Equal => {}
Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
}
Ok(last_lsn)
}
pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
generate_internal(client, |client, _| {
client.execute("CREATE table t(x int)", &[])?;
Ok(None)
})
}
fn generate_single_logical_message(
client: &mut impl postgres::GenericClient,
transactional: bool,
) -> Result<PgLsn> {
generate_internal(client, |client, initial_lsn| {
ensure!(
initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
"Initial LSN is too far in the future"
);
let message_lsn: PgLsn = client
.query_one(
"select pg_logical_emit_message($1, 'big-16mb-msg', \
concat(repeat('abcd', 16 * 256 * 1024), 'end')) as message_lsn",
&[&transactional],
)?
.get("message_lsn");
ensure!(
message_lsn > PgLsn::from(0x0200_0000 + 4 * 8192),
"Logical message did not cross the segment boundary"
);
ensure!(
message_lsn < PgLsn::from(0x0400_0000),
"Logical message crossed two segments"
);
if transactional {
// Transactional logical messages are part of a transaction, so the one above is
// followed by a small COMMIT record.
let after_message_lsn = client.pg_current_wal_insert_lsn()?;
ensure!(
message_lsn < after_message_lsn,
"No record found after the emitted message"
);
Ok(Some(after_message_lsn))
} else {
Ok(Some(message_lsn))
}
})
}
pub fn generate_wal_record_crossing_segment_followed_by_small_one(
client: &mut impl postgres::GenericClient,
) -> Result<PgLsn> {
generate_single_logical_message(client, true)
}
pub fn generate_last_wal_record_crossing_segment<C: postgres::GenericClient>(
client: &mut C,
) -> Result<PgLsn> {
generate_single_logical_message(client, false)
}

View File

@@ -95,3 +95,11 @@ macro_rules! project_git_version {
);
};
}
/// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime.
#[macro_export]
macro_rules! const_assert {
($($args:tt)*) => {
const _: () = assert!($($args)*);
};
}

View File

@@ -226,9 +226,9 @@ impl fmt::Display for ZTenantTimelineId {
// by the console.
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)]
#[serde(transparent)]
pub struct ZNodeId(pub u64);
pub struct NodeId(pub u64);
impl fmt::Display for ZNodeId {
impl fmt::Display for NodeId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.0)
}

View File

@@ -22,14 +22,14 @@ use utils::{
lsn::Lsn,
postgres_backend::AuthType,
project_git_version,
zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
};
use pageserver::timelines::TimelineInfo;
// Default id of a safekeeper node, if not specified on the command line.
const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1);
const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1);
const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1);
const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
const DEFAULT_BRANCH_NAME: &str = "main";
project_git_version!(GIT_VERSION);
@@ -860,7 +860,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
Ok(())
}
fn get_safekeeper(env: &local_env::LocalEnv, id: ZNodeId) -> Result<SafekeeperNode> {
fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNode> {
if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) {
Ok(SafekeeperNode::from_env(env, node))
} else {
@@ -876,7 +876,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
// All the commands take an optional safekeeper name argument
let sk_id = if let Some(id_str) = sub_args.value_of("id") {
ZNodeId(id_str.parse().context("while parsing safekeeper id")?)
NodeId(id_str.parse().context("while parsing safekeeper id")?)
} else {
DEFAULT_SAFEKEEPER_ID
};

View File

@@ -10,8 +10,9 @@
//! This module is responsible for creation of such tarball
//! from data stored in object storage.
//!
use anyhow::{anyhow, ensure, Context, Result};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::{BufMut, BytesMut};
use fail::fail_point;
use std::fmt::Write as FmtWrite;
use std::io;
use std::io::Write;
@@ -30,11 +31,16 @@ use utils::lsn::Lsn;
/// This is short-living object only for the time of tarball creation,
/// created mostly to avoid passing a lot of parameters between various functions
/// used for constructing tarball.
pub struct Basebackup<'a> {
ar: Builder<&'a mut dyn Write>,
pub struct Basebackup<'a, W>
where
W: Write,
{
ar: Builder<AbortableWrite<W>>,
timeline: &'a Arc<DatadirTimelineImpl>,
pub lsn: Lsn,
prev_record_lsn: Lsn,
finished: bool,
}
// Create basebackup with non-rel data in it. Omit relational data.
@@ -44,12 +50,15 @@ pub struct Basebackup<'a> {
// * When working without safekeepers. In this situation it is important to match the lsn
// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
// to start the replication.
impl<'a> Basebackup<'a> {
impl<'a, W> Basebackup<'a, W>
where
W: Write,
{
pub fn new(
write: &'a mut dyn Write,
write: W,
timeline: &'a Arc<DatadirTimelineImpl>,
req_lsn: Option<Lsn>,
) -> Result<Basebackup<'a>> {
) -> Result<Basebackup<'a, W>> {
// Compute postgres doesn't have any previous WAL files, but the first
// record that it's going to write needs to include the LSN of the
// previous record (xl_prev). We include prev_record_lsn in the
@@ -90,14 +99,15 @@ impl<'a> Basebackup<'a> {
);
Ok(Basebackup {
ar: Builder::new(write),
ar: Builder::new(AbortableWrite::new(write)),
timeline,
lsn: backup_lsn,
prev_record_lsn: backup_prev,
finished: false,
})
}
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
pub fn send_tarball(mut self) -> anyhow::Result<()> {
// Create pgdata subdirs structure
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
let header = new_tar_header_dir(*dir)?;
@@ -135,9 +145,14 @@ impl<'a> Basebackup<'a> {
self.add_twophase_file(xid)?;
}
fail_point!("basebackup-before-control-file", |_| {
bail!("failpoint basebackup-before-control-file")
});
// Generate pg_control and bootstrap WAL segment.
self.add_pgcontrol_file()?;
self.ar.finish()?;
self.finished = true;
debug!("all tarred up!");
Ok(())
}
@@ -331,6 +346,19 @@ impl<'a> Basebackup<'a> {
}
}
impl<'a, W> Drop for Basebackup<'a, W>
where
W: Write,
{
/// If the basebackup was not finished, prevent the Archive::drop() from
/// writing the end-of-archive marker.
fn drop(&mut self) {
if !self.finished {
self.ar.get_mut().abort();
}
}
}
//
// Create new tarball entry header
//
@@ -366,3 +394,49 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result<Header> {
header.set_cksum();
Ok(header)
}
/// A wrapper that passes through all data to the underlying Write,
/// until abort() is called.
///
/// tar::Builder has an annoying habit of finishing the archive with
/// a valid tar end-of-archive marker (two 512-byte sectors of zeros),
/// even if an error occurs and we don't finish building the archive.
/// We'd rather abort writing the tarball immediately than construct
/// a seemingly valid but incomplete archive. This wrapper allows us
/// to swallow the end-of-archive marker that Builder::drop() emits,
/// without writing it to the underlying sink.
///
struct AbortableWrite<W> {
w: W,
aborted: bool,
}
impl<W> AbortableWrite<W> {
pub fn new(w: W) -> Self {
AbortableWrite { w, aborted: false }
}
pub fn abort(&mut self) {
self.aborted = true;
}
}
impl<W> Write for AbortableWrite<W>
where
W: Write,
{
fn write(&mut self, data: &[u8]) -> io::Result<usize> {
if self.aborted {
Ok(data.len())
} else {
self.w.write(data)
}
}
fn flush(&mut self) -> io::Result<()> {
if self.aborted {
Ok(())
} else {
self.w.flush()
}
}
}

View File

@@ -1,6 +1,6 @@
//! Main entry point for the Page Server executable.
use std::{env, path::Path, str::FromStr};
use std::{env, fs::File, path::Path, process, str::FromStr, thread::sleep, time::Duration};
use tracing::*;
use anyhow::{bail, Context, Result};
@@ -254,7 +254,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
// Otherwise, the coverage data will be damaged.
match daemonize.exit_action(|| exit_now(0)).start() {
Ok(_) => info!("Success, daemonized"),
Err(err) => error!(%err, "could not daemonize"),
Err(err) => bail!("{err}. could not daemonize. bailing."),
}
}
@@ -276,6 +276,22 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
let remote_index = tenant_mgr::init_tenant_mgr(conf)?;
// Create file and frequently check if it's still here
thread_mgr::spawn(
ThreadKind::HttpEndpointListener,
None,
None,
"http_endpoint_thread",
true,
move || {
File::create("delete-me.txt").expect("FFFF failed creating file");
loop {
File::open("delete-me.txt").expect("FFFF cannot find file");
sleep(Duration::from_millis(1));
}
},
)?;
// Spawn a new thread for the http endpoint
// bind before launching the separate thread, so that a bind error is reported and the process exits before startup
let auth_cloned = auth.clone();

View File

@@ -16,7 +16,7 @@ use toml_edit::{Document, Item};
use url::Url;
use utils::{
postgres_backend::AuthType,
zid::{ZNodeId, ZTenantId, ZTimelineId},
zid::{NodeId, ZTenantId, ZTimelineId},
};
use crate::layered_repository::TIMELINES_SEGMENT_NAME;
@@ -78,7 +78,7 @@ pub mod defaults {
pub struct PageServerConf {
// Identifier of this particular pageserver, so that e.g. safekeepers
// can safely distinguish different pageservers
pub id: ZNodeId,
pub id: NodeId,
/// Example (default): 127.0.0.1:64000
pub listen_pg_addr: String,
@@ -180,7 +180,7 @@ struct PageServerConfigBuilder {
auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
id: BuilderValue<ZNodeId>,
id: BuilderValue<NodeId>,
profiling: BuilderValue<ProfilingConfig>,
broker_etcd_prefix: BuilderValue<String>,
@@ -276,7 +276,7 @@ impl PageServerConfigBuilder {
self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix)
}
pub fn id(&mut self, node_id: ZNodeId) {
pub fn id(&mut self, node_id: NodeId) {
self.id = BuilderValue::Set(node_id)
}
@@ -399,7 +399,7 @@ impl PageServerConf {
"tenant_config" => {
t_conf = Self::parse_toml_tenant_conf(item)?;
}
"id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)),
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
"profiling" => builder.profiling(parse_toml_from_str(key, item)?),
"broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?),
"broker_endpoints" => builder.broker_endpoints(
@@ -550,7 +550,7 @@ impl PageServerConf {
#[cfg(test)]
pub fn dummy_conf(repo_dir: PathBuf) -> Self {
PageServerConf {
id: ZNodeId(0),
id: NodeId(0),
wait_lsn_timeout: Duration::from_secs(60),
wal_redo_timeout: Duration::from_secs(60),
page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
@@ -693,7 +693,7 @@ id = 10
assert_eq!(
parsed_config,
PageServerConf {
id: ZNodeId(10),
id: NodeId(10),
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
@@ -737,7 +737,7 @@ id = 10
assert_eq!(
parsed_config,
PageServerConf {
id: ZNodeId(10),
id: NodeId(10),
listen_pg_addr: "127.0.0.1:64000".to_string(),
listen_http_addr: "127.0.0.1:9898".to_string(),
wait_lsn_timeout: Duration::from_secs(111),

View File

@@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::{
lsn::Lsn,
zid::{ZNodeId, ZTenantId, ZTimelineId},
zid::{NodeId, ZTenantId, ZTimelineId},
};
#[serde_as]
@@ -42,7 +42,7 @@ pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId
#[derive(Serialize)]
pub struct StatusResponse {
pub id: ZNodeId,
pub id: NodeId,
}
impl TenantCreateRequest {

View File

@@ -18,7 +18,7 @@ use itertools::Itertools;
use lazy_static::lazy_static;
use tracing::*;
use std::cmp::{max, Ordering};
use std::cmp::{max, min, Ordering};
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::collections::{BTreeSet, HashSet};
@@ -2165,7 +2165,7 @@ impl LayeredTimeline {
let gc_info = self.gc_info.read().unwrap();
let retain_lsns = &gc_info.retain_lsns;
let cutoff = gc_info.cutoff;
let cutoff = min(gc_info.cutoff, disk_consistent_lsn);
let pitr = gc_info.pitr;
// Calculate pitr cutoff point.
@@ -2294,12 +2294,20 @@ impl LayeredTimeline {
// is 102, then it might not have been fully flushed to disk
// before crash.
//
// FIXME: This logic is wrong. See https://github.com/zenithdb/zenith/issues/707
if !layers.newer_image_layer_exists(
&l.get_key_range(),
l.get_lsn_range().end,
disk_consistent_lsn + 1,
)? {
// For example, imagine that the following layers exist:
//
// 1000 - image (A)
// 1000-2000 - delta (B)
// 2000 - image (C)
// 2000-3000 - delta (D)
// 3000 - image (E)
//
// If GC horizon is at 2500, we can remove layers A and B, but
// we cannot remove C, even though it's older than 2500, because
// the delta layer 2000-3000 depends on it.
if !layers
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))?
{
debug!(
"keeping {} because it is the latest layer",
l.filename().display()
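
To make the A..E example above concrete, here is a small hypothetical model of the new check, reduced to LSNs only (no key ranges) and not the real LayerMap API: a layer may be garbage-collected only if a full image layer exists at an LSN between the end of that layer's range and the GC cutoff.

use std::ops::Range;

/// Simplified stand-in for the image_layer_exists() check: is there an image
/// layer at some LSN within `lsn_range`? (The real check also matches key ranges.)
fn image_layer_exists(image_lsns: &[u64], lsn_range: &Range<u64>) -> bool {
    image_lsns.iter().any(|lsn| lsn_range.contains(lsn))
}

/// A layer whose LSN range ends at `layer_end_lsn` may be dropped only if a
/// newer image below the GC cutoff covers it.
fn can_gc(image_lsns: &[u64], layer_end_lsn: u64, gc_cutoff: u64) -> bool {
    layer_end_lsn < gc_cutoff && image_layer_exists(image_lsns, &(layer_end_lsn..gc_cutoff))
}

fn main() {
    // Image layers from the example above: A at 1000, C at 2000, E at 3000.
    let images = [1000u64, 2000, 3000];
    let cutoff = 2500;
    // An image layer covers a single LSN, so its LSN range ends just past it.
    assert!(can_gc(&images, 1001, cutoff));  // A: image C at 2000 supersedes it
    assert!(can_gc(&images, 2000, cutoff));  // B (delta 1000-2000): also covered by C
    assert!(!can_gc(&images, 2001, cutoff)); // C: delta D (2000-3000) still depends on it
    assert!(!can_gc(&images, 3000, cutoff)); // D: its range ends beyond the GC horizon
}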

View File

@@ -201,18 +201,14 @@ impl LayerMap {
NUM_ONDISK_LAYERS.dec();
}
/// Is there a newer image layer for given key-range?
/// Is there a newer image layer for given key- and LSN-range?
///
/// This is used for garbage collection, to determine if an old layer can
/// be deleted.
/// We ignore layers newer than disk_consistent_lsn because they will be removed at restart
/// We also only look at historic layers
//#[allow(dead_code)]
pub fn newer_image_layer_exists(
pub fn image_layer_exists(
&self,
key_range: &Range<Key>,
lsn: Lsn,
disk_consistent_lsn: Lsn,
lsn_range: &Range<Lsn>,
) -> Result<bool> {
let mut range_remain = key_range.clone();
@@ -225,8 +221,7 @@ impl LayerMap {
let img_lsn = l.get_lsn_range().start;
if !l.is_incremental()
&& l.get_key_range().contains(&range_remain.start)
&& img_lsn > lsn
&& img_lsn < disk_consistent_lsn
&& lsn_range.contains(&img_lsn)
{
made_progress = true;
let img_key_end = l.get_key_range().end;

View File

@@ -593,7 +593,8 @@ impl PageServerHandler {
/* Send a tarball of the latest layer on the timeline */
{
let mut writer = CopyDataSink { pgb };
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
span.record("lsn", &basebackup.lsn.to_string().as_str());
basebackup.send_tarball()?;
}

poetry.lock (generated)
View File

@@ -21,9 +21,6 @@ category = "main"
optional = false
python-versions = ">=3.6"
[package.dependencies]
typing-extensions = {version = ">=3.6.5", markers = "python_version < \"3.8\""}
[[package]]
name = "asyncpg"
version = "0.24.0"
@@ -32,9 +29,6 @@ category = "main"
optional = false
python-versions = ">=3.6.0"
[package.dependencies]
typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""}
[package.extras]
dev = ["Cython (>=0.29.24,<0.30.0)", "pytest (>=6.0)", "Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "pycodestyle (>=2.7.0,<2.8.0)", "flake8 (>=3.9.2,<3.10.0)", "uvloop (>=0.15.3)"]
docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)"]
@@ -125,7 +119,6 @@ python-versions = ">=3.6"
[package.dependencies]
botocore-stubs = "*"
typing-extensions = {version = "*", markers = "python_version < \"3.9\""}
[package.extras]
accessanalyzer = ["mypy-boto3-accessanalyzer (>=1.20.0)"]
@@ -454,9 +447,6 @@ category = "main"
optional = false
python-versions = ">=3.6"
[package.dependencies]
typing-extensions = {version = "*", markers = "python_version < \"3.9\""}
[[package]]
name = "cached-property"
version = "1.5.2"
@@ -524,7 +514,6 @@ python-versions = ">=3.6"
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
importlib-metadata = {version = "*", markers = "python_version < \"3.8\""}
[[package]]
name = "colorama"
@@ -605,7 +594,6 @@ optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7"
[package.dependencies]
importlib-metadata = {version = "*", markers = "python_version < \"3.8\""}
mccabe = ">=0.6.0,<0.7.0"
pycodestyle = ">=2.7.0,<2.8.0"
pyflakes = ">=2.3.0,<2.4.0"
@@ -664,23 +652,6 @@ category = "main"
optional = false
python-versions = ">=3.5"
[[package]]
name = "importlib-metadata"
version = "4.10.1"
description = "Read metadata from Python packages"
category = "main"
optional = false
python-versions = ">=3.7"
[package.dependencies]
typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""}
zipp = ">=0.5"
[package.extras]
docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"]
perf = ["ipython"]
testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy", "importlib-resources (>=1.3)"]
[[package]]
name = "iniconfig"
version = "1.1.1"
@@ -759,9 +730,6 @@ category = "main"
optional = false
python-versions = ">=2.7"
[package.dependencies]
importlib-metadata = {version = "*", markers = "python_version < \"3.8\""}
[package.extras]
docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"]
testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-black-multipy", "pytest-cov", "ecdsa", "feedparser", "numpy", "pandas", "pymongo", "scikit-learn", "sqlalchemy", "enum34", "jsonlib"]
@@ -785,7 +753,6 @@ python-versions = "*"
[package.dependencies]
attrs = ">=17.4.0"
importlib-metadata = {version = "*", markers = "python_version < \"3.8\""}
pyrsistent = ">=0.14.0"
six = ">=1.11.0"
@@ -840,7 +807,6 @@ flask = {version = "*", optional = true, markers = "extra == \"server\""}
flask-cors = {version = "*", optional = true, markers = "extra == \"server\""}
graphql-core = {version = "*", optional = true, markers = "extra == \"server\""}
idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""}
importlib-metadata = {version = "*", markers = "python_version < \"3.8\""}
Jinja2 = ">=2.10.1"
jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""}
MarkupSafe = "!=2.0.0a1"
@@ -890,7 +856,6 @@ python-versions = ">=3.5"
[package.dependencies]
mypy-extensions = ">=0.4.3,<0.5.0"
toml = "*"
typed-ast = {version = ">=1.4.0,<1.5.0", markers = "python_version < \"3.8\""}
typing-extensions = ">=3.7.4"
[package.extras]
@@ -947,9 +912,6 @@ category = "main"
optional = false
python-versions = ">=3.6"
[package.dependencies]
importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""}
[package.extras]
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]
@@ -1061,7 +1023,6 @@ python-versions = ">=3.6"
atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""}
attrs = ">=19.2.0"
colorama = {version = "*", markers = "sys_platform == \"win32\""}
importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""}
iniconfig = "*"
packaging = "*"
pluggy = ">=0.12,<2.0"
@@ -1279,14 +1240,6 @@ category = "main"
optional = false
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
[[package]]
name = "typed-ast"
version = "1.4.3"
description = "a fork of Python 2 and 3 ast modules with type comment support"
category = "dev"
optional = false
python-versions = "*"
[[package]]
name = "types-psycopg2"
version = "2.9.6"
@@ -1383,22 +1336,10 @@ category = "dev"
optional = false
python-versions = "*"
[[package]]
name = "zipp"
version = "3.7.0"
description = "Backport of pathlib-compatible object wrapper for zip files"
category = "main"
optional = false
python-versions = ">=3.7"
[package.extras]
docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"]
testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"]
[metadata]
lock-version = "1.1"
python-versions = "^3.7"
content-hash = "4ee85b435461dec70b406bf7170302fe54e9e247bdf628a9cb6b5fb9eb9afd82"
python-versions = "^3.9"
content-hash = "be9c00bb5081535805824242fea2a03b2f82fa9466856d618e24b3140c7da6a0"
[metadata.files]
aiopg = [
@@ -1594,10 +1535,6 @@ idna = [
{file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
{file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
]
importlib-metadata = [
{file = "importlib_metadata-4.10.1-py3-none-any.whl", hash = "sha256:899e2a40a8c4a1aec681feef45733de8a6c58f3f6a0dbed2eb6574b4387a77b6"},
{file = "importlib_metadata-4.10.1.tar.gz", hash = "sha256:951f0d8a5b7260e9db5e41d429285b5f451e928479f19d80818878527d36e95e"},
]
iniconfig = [
{file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
{file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
@@ -2001,38 +1938,6 @@ toml = [
{file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"},
{file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
]
typed-ast = [
{file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6"},
{file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075"},
{file = "typed_ast-1.4.3-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528"},
{file = "typed_ast-1.4.3-cp35-cp35m-win32.whl", hash = "sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428"},
{file = "typed_ast-1.4.3-cp35-cp35m-win_amd64.whl", hash = "sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3"},
{file = "typed_ast-1.4.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f"},
{file = "typed_ast-1.4.3-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341"},
{file = "typed_ast-1.4.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace"},
{file = "typed_ast-1.4.3-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f"},
{file = "typed_ast-1.4.3-cp36-cp36m-win32.whl", hash = "sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363"},
{file = "typed_ast-1.4.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7"},
{file = "typed_ast-1.4.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266"},
{file = "typed_ast-1.4.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e"},
{file = "typed_ast-1.4.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04"},
{file = "typed_ast-1.4.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899"},
{file = "typed_ast-1.4.3-cp37-cp37m-win32.whl", hash = "sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c"},
{file = "typed_ast-1.4.3-cp37-cp37m-win_amd64.whl", hash = "sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805"},
{file = "typed_ast-1.4.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a"},
{file = "typed_ast-1.4.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff"},
{file = "typed_ast-1.4.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41"},
{file = "typed_ast-1.4.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39"},
{file = "typed_ast-1.4.3-cp38-cp38-win32.whl", hash = "sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927"},
{file = "typed_ast-1.4.3-cp38-cp38-win_amd64.whl", hash = "sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40"},
{file = "typed_ast-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3"},
{file = "typed_ast-1.4.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4"},
{file = "typed_ast-1.4.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0"},
{file = "typed_ast-1.4.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3"},
{file = "typed_ast-1.4.3-cp39-cp39-win32.whl", hash = "sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808"},
{file = "typed_ast-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c"},
{file = "typed_ast-1.4.3.tar.gz", hash = "sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"},
]
types-psycopg2 = [
{file = "types-psycopg2-2.9.6.tar.gz", hash = "sha256:753b50b38da0e61bc8f89d149f2c4420c7e18535a87963d17b72343eb98f7c32"},
{file = "types_psycopg2-2.9.6-py3-none-any.whl", hash = "sha256:2cfd855e1562ebb5da595ee9401da93a308d69121ccd359cb8341f94ba4b6d1c"},
@@ -2123,7 +2028,3 @@ yapf = [
{file = "yapf-0.31.0-py2.py3-none-any.whl", hash = "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"},
{file = "yapf-0.31.0.tar.gz", hash = "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d"},
]
zipp = [
{file = "zipp-3.7.0-py3-none-any.whl", hash = "sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"},
{file = "zipp-3.7.0.tar.gz", hash = "sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d"},
]

View File

@@ -1,6 +1,6 @@
# WAL service
The zenith WAL service acts as a holding area and redistribution
The neon WAL service acts as a holding area and redistribution
center for recently generated WAL. The primary Postgres server streams
the WAL to the WAL safekeeper, and treats it like a (synchronous)
replica. A replication slot is used in the primary to prevent the
@@ -94,7 +94,7 @@ Q: What if the compute node evicts a page, needs it back, but the page is yet
A: If the compute node has evicted a page, changes to it have been WAL-logged
(that's why it is called Write Ahead logging; there are a few exceptions, such as
index builds). These WAL records will eventually
reach the Page Server. The Page Server notes that the compute note requests
reach the Page Server. The Page Server notes that the compute node requests
pages with a very recent LSN and will not respond to the compute node until a
corresponding WAL is received from WAL safekeepers.
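
A rough sketch of that wait, using hypothetical names (WalReceiverState, wait_for_lsn) rather than the real pageserver types: the GetPage handler blocks until the WAL received from the safekeepers has reached the requested LSN, or gives up after a timeout.

use std::sync::{Arc, Condvar, Mutex};
use std::thread;
use std::time::Duration;

// Hypothetical names; not the real pageserver API.
struct WalReceiverState {
    last_received_lsn: Mutex<u64>,
    wal_arrived: Condvar,
}

impl WalReceiverState {
    /// Block until WAL up to `request_lsn` has been received, or the timeout expires.
    fn wait_for_lsn(&self, request_lsn: u64, timeout: Duration) -> bool {
        let mut lsn = self.last_received_lsn.lock().unwrap();
        while *lsn < request_lsn {
            let (guard, res) = self.wal_arrived.wait_timeout(lsn, timeout).unwrap();
            lsn = guard;
            if res.timed_out() && *lsn < request_lsn {
                return false;
            }
        }
        true
    }

    /// Called by the WAL receiver whenever newly ingested WAL advances the LSN.
    fn advance(&self, new_lsn: u64) {
        let mut lsn = self.last_received_lsn.lock().unwrap();
        if new_lsn > *lsn {
            *lsn = new_lsn;
            self.wal_arrived.notify_all();
        }
    }
}

fn main() {
    let state = Arc::new(WalReceiverState {
        last_received_lsn: Mutex::new(0),
        wal_arrived: Condvar::new(),
    });
    let writer = Arc::clone(&state);
    thread::spawn(move || writer.advance(0x0169_9E38)); // WAL from safekeepers catches up
    // A GetPage request at a recent LSN waits here instead of answering too early.
    assert!(state.wait_for_lsn(0x0169_9E38, Duration::from_secs(5)));
}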

View File

@@ -151,7 +151,7 @@ It is assumed that in case of losing local data by some safekeepers, it should
* `RestartLSN`: position in WAL confirmed by all safekeepers.
* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
* `NodeID`: pair (term,UUID)
* `Pager`: Zenith component restoring pages from WAL stream
* `Pager`: Neon component restoring pages from WAL stream
* `Replica`: read-only computation node
* `VCL`: the largest LSN for which we can guarantee availability of all prior records.

View File

@@ -24,7 +24,7 @@ use safekeeper::{broker, callmemaybe};
use safekeeper::{http, s3_offload};
use utils::{
http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener,
zid::ZNodeId,
zid::NodeId,
};
const LOCK_FILE_NAME: &str = "safekeeper.lock";
@@ -167,7 +167,7 @@ fn main() -> anyhow::Result<()> {
let mut given_id = None;
if let Some(given_id_str) = arg_matches.value_of("id") {
given_id = Some(ZNodeId(
given_id = Some(NodeId(
given_id_str
.parse()
.context("failed to parse safekeeper id")?,
@@ -192,7 +192,7 @@ fn main() -> anyhow::Result<()> {
start_safekeeper(conf, given_id, arg_matches.is_present("init"))
}
fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<ZNodeId>, init: bool) -> Result<()> {
fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bool) -> Result<()> {
let log_file = logging::init("safekeeper.log", conf.daemonize)?;
info!("version: {GIT_VERSION}");
@@ -245,7 +245,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<ZNodeId>, init: b
// Otherwise, the coverage data will be damaged.
match daemonize.exit_action(|| exit_now(0)).start() {
Ok(_) => info!("Success, daemonized"),
Err(e) => error!("Error, {}", e),
Err(err) => bail!("Error: {err}. could not daemonize. bailing."),
}
}
@@ -345,14 +345,14 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<ZNodeId>, init: b
}
/// Determine safekeeper id and set it in config.
fn set_id(conf: &mut SafeKeeperConf, given_id: Option<ZNodeId>) -> Result<()> {
fn set_id(conf: &mut SafeKeeperConf, given_id: Option<NodeId>) -> Result<()> {
let id_file_path = conf.workdir.join(ID_FILE_NAME);
let my_id: ZNodeId;
let my_id: NodeId;
// If ID exists, read it in; otherwise set one passed
match fs::read(&id_file_path) {
Ok(id_serialized) => {
my_id = ZNodeId(
my_id = NodeId(
std::str::from_utf8(&id_serialized)
.context("failed to parse safekeeper id")?
.parse()

View File

@@ -12,7 +12,7 @@ use tokio::{runtime, time::sleep};
use tracing::*;
use crate::{timeline::GlobalTimelines, SafeKeeperConf};
use utils::zid::{ZNodeId, ZTenantTimelineId};
use utils::zid::{NodeId, ZTenantTimelineId};
const RETRY_INTERVAL_MSEC: u64 = 1000;
const PUSH_INTERVAL_MSEC: u64 = 1000;
@@ -36,7 +36,7 @@ pub fn thread_main(conf: SafeKeeperConf) {
fn timeline_safekeeper_path(
broker_etcd_prefix: String,
zttid: ZTenantTimelineId,
sk_id: ZNodeId,
sk_id: NodeId,
) -> String {
format!(
"{}/{sk_id}",

View File

@@ -1,9 +1,9 @@
use serde::{Deserialize, Serialize};
use utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
use utils::zid::{NodeId, ZTenantId, ZTimelineId};
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
pub tenant_id: ZTenantId,
pub timeline_id: ZTimelineId,
pub peer_ids: Vec<ZNodeId>,
pub peer_ids: Vec<NodeId>,
}

View File

@@ -20,14 +20,14 @@ use utils::{
RequestExt, RouterBuilder,
},
lsn::Lsn,
zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
};
use super::models::TimelineCreateRequest;
#[derive(Debug, Serialize)]
struct SafekeeperStatus {
id: ZNodeId,
id: NodeId,
}
/// Healthcheck handler.
@@ -178,7 +178,7 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
let safekeeper_info: SkTimelineInfo = json_request(&mut request).await?;
let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?;
tli.record_safekeeper_info(&safekeeper_info, ZNodeId(1))?;
tli.record_safekeeper_info(&safekeeper_info, NodeId(1))?;
json_response(StatusCode::OK, ())
}

View File

@@ -3,7 +3,7 @@ use std::path::PathBuf;
use std::time::Duration;
use url::Url;
use utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId};
use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId};
pub mod broker;
pub mod callmemaybe;
@@ -49,7 +49,7 @@ pub struct SafeKeeperConf {
pub listen_http_addr: String,
pub ttl: Option<Duration>,
pub recall_period: Duration,
pub my_id: ZNodeId,
pub my_id: NodeId,
pub broker_endpoints: Vec<Url>,
pub broker_etcd_prefix: String,
pub s3_offload_enabled: bool,
@@ -79,7 +79,7 @@ impl Default for SafeKeeperConf {
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
ttl: None,
recall_period: defaults::DEFAULT_RECALL_PERIOD,
my_id: ZNodeId(0),
my_id: NodeId(0),
broker_endpoints: Vec::new(),
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
s3_offload_enabled: true,

View File

@@ -26,7 +26,7 @@ use utils::{
bin_ser::LeSer,
lsn::Lsn,
pq_proto::{SystemId, ZenithFeedback},
zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
};
pub const SK_MAGIC: u32 = 0xcafeceefu32;
@@ -164,7 +164,7 @@ impl PeerInfo {
// vector-based node id -> peer state map with very limited functionality we
// need.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Peers(pub Vec<(ZNodeId, PeerInfo)>);
pub struct Peers(pub Vec<(NodeId, PeerInfo)>);
/// Persistent information stored on safekeeper node
/// On disk data is prefixed by magic and format version and followed by checksum.
@@ -224,7 +224,7 @@ pub struct SafekeeperMemState {
}
impl SafeKeeperState {
pub fn new(zttid: &ZTenantTimelineId, peers: Vec<ZNodeId>) -> SafeKeeperState {
pub fn new(zttid: &ZTenantTimelineId, peers: Vec<NodeId>) -> SafeKeeperState {
SafeKeeperState {
tenant_id: zttid.tenant_id,
timeline_id: zttid.timeline_id,
@@ -277,7 +277,7 @@ pub struct ProposerGreeting {
#[derive(Debug, Serialize)]
pub struct AcceptorGreeting {
term: u64,
node_id: ZNodeId,
node_id: NodeId,
}
/// Vote request sent from proposer to safekeepers
@@ -531,7 +531,7 @@ pub struct SafeKeeper<CTRL: control_file::Storage, WAL: wal_storage::Storage> {
pub wal_store: WAL,
node_id: ZNodeId, // safekeeper's node id
node_id: NodeId, // safekeeper's node id
}
impl<CTRL, WAL> SafeKeeper<CTRL, WAL>
@@ -544,7 +544,7 @@ where
ztli: ZTimelineId,
state: CTRL,
mut wal_store: WAL,
node_id: ZNodeId,
node_id: NodeId,
) -> Result<SafeKeeper<CTRL, WAL>> {
if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id {
bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id);
@@ -1013,7 +1013,7 @@ mod tests {
};
let wal_store = DummyWalStore { lsn: Lsn(0) };
let ztli = ZTimelineId::from([0u8; 16]);
let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap();
let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap();
// check voting for 1 is ok
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 });
@@ -1028,7 +1028,7 @@ mod tests {
let storage = InMemoryState {
persisted_state: state,
};
sk = SafeKeeper::new(ztli, storage, sk.wal_store, ZNodeId(0)).unwrap();
sk = SafeKeeper::new(ztli, storage, sk.wal_store, NodeId(0)).unwrap();
// and ensure voting second time for 1 is not ok
vote_resp = sk.process_msg(&vote_request);
@@ -1045,7 +1045,7 @@ mod tests {
};
let wal_store = DummyWalStore { lsn: Lsn(0) };
let ztli = ZTimelineId::from([0u8; 16]);
let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap();
let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap();
let mut ar_hdr = AppendRequestHeader {
term: 1,

View File

@@ -21,7 +21,7 @@ use tracing::*;
use utils::{
lsn::Lsn,
pq_proto::ZenithFeedback,
zid::{ZNodeId, ZTenantId, ZTenantTimelineId},
zid::{NodeId, ZTenantId, ZTenantTimelineId},
};
use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey};
@@ -99,7 +99,7 @@ impl SharedState {
fn create(
conf: &SafeKeeperConf,
zttid: &ZTenantTimelineId,
peer_ids: Vec<ZNodeId>,
peer_ids: Vec<NodeId>,
) -> Result<Self> {
let state = SafeKeeperState::new(zttid, peer_ids);
let control_store = control_file::FileStorage::create_new(zttid, conf, state)?;
@@ -448,7 +448,7 @@ impl Timeline {
}
/// Update timeline state with peer safekeeper data.
pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: ZNodeId) -> Result<()> {
pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: NodeId) -> Result<()> {
let mut shared_state = self.mutex.lock().unwrap();
shared_state.sk.record_safekeeper_info(sk_info)?;
self.notify_wal_senders(&mut shared_state);
@@ -551,7 +551,7 @@ impl GlobalTimelines {
mut state: MutexGuard<GlobalTimelinesState>,
conf: &SafeKeeperConf,
zttid: ZTenantTimelineId,
peer_ids: Vec<ZNodeId>,
peer_ids: Vec<NodeId>,
) -> Result<Arc<Timeline>> {
match state.timelines.get(&zttid) {
Some(_) => bail!("timeline {} already exists", zttid),
@@ -576,7 +576,7 @@ impl GlobalTimelines {
pub fn create(
conf: &SafeKeeperConf,
zttid: ZTenantTimelineId,
peer_ids: Vec<ZNodeId>,
peer_ids: Vec<NodeId>,
) -> Result<Arc<Timeline>> {
let state = TIMELINES_STATE.lock().unwrap();
GlobalTimelines::create_internal(state, conf, zttid, peer_ids)

View File

@@ -0,0 +1,20 @@
import pytest
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
#
# Test error handling when the 'basebackup' command fails in the middle
# of building the tar archive.
#
def test_basebackup_error(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch("test_basebackup_error", "empty")
# Introduce failpoint
env.pageserver.safe_psql("failpoints basebackup-before-control-file=return")
with pytest.raises(Exception, match="basebackup-before-control-file"):
pg = env.postgres.create_start('test_basebackup_error')

View File

@@ -0,0 +1,13 @@
import pytest
import os
import time
from fixtures.zenith_fixtures import ZenithEnvBuilder
@pytest.mark.parametrize('iteration', list(range(100)))
def test_timeout(zenith_env_builder: ZenithEnvBuilder, test_output_dir: str, iteration: int):
env = zenith_env_builder.init_start()
# Sleep long enough with no output so CI step times out
time.sleep(1000)

View File

@@ -393,7 +393,10 @@ class MockS3Server:
):
self.port = port
self.subprocess = subprocess.Popen([f'poetry run moto_server s3 -p{port}'], shell=True)
# XXX: do not use `shell=True` or add `exec ` to the command here otherwise.
# We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux
# if a process is started from the shell process.
self.subprocess = subprocess.Popen(['poetry', 'run', 'moto_server', 's3', f'-p{port}'])
error = None
try:
return_code = self.subprocess.poll()
@@ -403,7 +406,7 @@ class MockS3Server:
error = f"expected mock s3 server to start but it failed with exception: {e}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'"
if error is not None:
log.error(error)
self.subprocess.kill()
self.kill()
raise RuntimeError("failed to start s3 mock server")
def endpoint(self) -> str:
@@ -1890,7 +1893,11 @@ class Etcd:
f"--data-dir={self.datadir}",
f"--listen-client-urls={client_url}",
f"--advertise-client-urls={client_url}",
f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}"
f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}",
# Set --quota-backend-bytes to keep the etcd virtual memory
# size smaller. Our test etcd clusters are very small.
# See https://github.com/etcd-io/etcd/issues/7910
f"--quota-backend-bytes=100000000"
]
self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file)