Compare commits


9 Commits

Author SHA1 Message Date
Bojan Serafimov
0abfc72f1c Revert postgres prewarming 2023-07-31 14:15:26 -04:00
bojanserafimov
ddbe170454 Prewarm compute nodes (#4828) 2023-07-31 14:13:32 -04:00
Alexander Bayandin
39e458f049 test_compatibility: fix pg_tenant_only_port port collision (#4850)
## Problem

Compatibility tests fail from time to time due to a `pg_tenant_only_port`
port collision (added in https://github.com/neondatabase/neon/pull/4731).

## Summary of changes
- replace `pg_tenant_only_port` value in config with new port
- remove old logic that we no longer need
- unify config overrides
2023-07-31 20:49:46 +03:00
Vadim Kharitonov
e1424647a0 Update pg_embedding to 0.3.1 version (#4811) 2023-07-31 20:23:18 +03:00
Yinnan Yao
705ae2dce9 Fix error message for listen_pg_addr_tenant_only binding (#4787)
## Problem

Wrong use of `conf.listen_pg_addr` in `error!()`.

## Summary of changes

Use `listen_pg_addr_tenant_only` instead of `conf.listen_pg_addr`.

Signed-off-by: yaoyinnan <35447132+yaoyinnan@users.noreply.github.com>
2023-07-31 14:40:52 +01:00
Conrad Ludgate
eb78603121 proxy: div by zero (#4845)
## Problem

1. In the `CacheInvalid` state loop, we weren't checking `num_retries`. If it
managed to reach `32`, the `retry_after` procedure would compute `2^32`, which
would overflow to 0 and trigger a div by zero.
2. When fixing the above, I started working on a flow diagram for the
state machine logic and realised it was more complex than it had to be:
   a. We start in a `Cached` state.
   b. `Cached`: call `connect_once`. After the first `connect_once` error, we
always move to the `CacheInvalid` state; otherwise, we return the connection.
   c. `CacheInvalid`: we attempt to `wake_compute` and either switch to
`Cached` or retry this step (or we error).
   d. `Cached`: call `connect_once`. We either retry this step or we have a
connection (or we error). After `num_retries > 1` we never switch back to
`CacheInvalid`.

## Summary of changes

1. Insert a `num_retries` check in the `handle_try_wake` procedure. Also use
floats in the `retry_after` procedure to prevent the overflow entirely (see the
sketch after this list).
2. Refactor `connect_to_compute` to be more linear in design.
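
For illustration only, a minimal sketch of the failure mode and the float-based fix; the base duration value below is assumed, while the real `BASE_RETRY_WAIT_DURATION` is defined in the proxy code and only referenced in the diff further down:

```rust
use std::time::Duration;

// Assumed value for illustration; the proxy defines its own BASE_RETRY_WAIT_DURATION.
const BASE_RETRY_WAIT_DURATION: Duration = Duration::from_millis(100);

// Old integer version: at num_retries = 32, `2_u32.pow(32)` overflows
// (wrapping to 0 in release builds), so dividing the Duration panics with a div by zero.
fn retry_after_old(num_retries: u32) -> Duration {
    match num_retries {
        0 => Duration::ZERO,
        // 3/2 = 1.5 growth factor, computed with integer powers
        _ => BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries),
    }
}

// New float version: 1.5^n can never produce a zero divisor, so the overflow path is gone.
fn retry_after(num_retries: u32) -> Duration {
    BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
}

fn main() {
    // Both behave the same for small retry counts...
    println!("old: {:?}", retry_after_old(5)); // 100ms * 3^5 / 2^5 = 759.375ms
    println!("new: {:?}", retry_after(5));     // 100ms * 1.5^5   = 759.375ms
    // ...but only the float version is safe once num_retries reaches 32:
    println!("new at 32: {:?}", retry_after(32));
    // retry_after_old(32) would panic here: pow overflow in debug builds,
    // divide by zero in release builds.
}
```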
2023-07-31 09:30:24 -04:00
John Spray
f0ad603693 pageserver: add unit test for deleted_at in IndexPart (#4844)
## Problem

Existing `IndexPart` unit tests only exercised the version 1 format (i.e.
without `deleted_at` set).

## Summary of changes

Add a test that sets version to 2, and sets a value for deleted_at.

Closes https://github.com/neondatabase/neon/issues/4162
2023-07-31 12:51:18 +01:00
Arpad Müller
e5183f85dc Make DiskBtreeReader::dump async (#4838)
## Problem

`DiskBtreeReader::dump` calls `read_blk` internally, which we want to
make async in the future. As it currently relies on recursion, and async
doesn't play well with recursion, we want to find an alternative and
instead traverse the tree using a loop and a manual stack.

## Summary of changes

* Make `DiskBtreeReader::dump` and all the places calling it async
* Make `DiskBtreeReader::dump` non-recursive internally and use a stack
instead (see the sketch after this list). It now deparses the node in each
iteration, which isn't optimal, but on the other hand it's hard to store the
node because it references the buffer, and self-referential data is hard in
Rust. For a dumping function, speed isn't a priority, so we now deparse the
node multiple times (up to branching-factor many times).
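
As an illustration of the pattern only (not the actual pageserver code), here is a minimal sketch of the recursion-to-explicit-stack rewrite, using a simplified in-memory node type in place of `OnDiskNode`/`read_blk`:

```rust
// Simplified stand-in for an on-disk B-tree node; the real dump re-reads and
// deparses the node from a block buffer on every loop iteration.
struct Node {
    key: String,
    children: Vec<Node>,
}

// Recursive dump: awkward for async fns, since recursive async requires boxed futures.
fn dump_recurse(node: &Node, depth: usize) {
    println!("{:indent$}{}", "", node.key, indent = depth * 2);
    for child in &node.children {
        dump_recurse(child, depth + 1);
    }
}

// Iterative dump: an explicit stack of (node, depth) pairs replaces the call stack,
// so the loop body could later `.await` block reads without any recursion.
fn dump_iterative(root: &Node) {
    let mut stack = vec![(root, 0usize)];
    while let Some((node, depth)) = stack.pop() {
        println!("{:indent$}{}", "", node.key, indent = depth * 2);
        // Push children in reverse so they are printed in their original order.
        for child in node.children.iter().rev() {
            stack.push((child, depth + 1));
        }
    }
}

fn main() {
    let tree = Node {
        key: "root".into(),
        children: vec![
            Node { key: "a".into(), children: vec![] },
            Node { key: "b".into(), children: vec![] },
        ],
    };
    dump_recurse(&tree, 0);  // prints: root, a, b
    dump_iterative(&tree);   // same output, no recursion
}
```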

Part of https://github.com/neondatabase/neon/issues/4743

I have verified that the output is unchanged by comparing the output of this
command before and after this patch:
```
cargo test -p pageserver -- particular_data --nocapture 
```
2023-07-31 12:52:29 +02:00
Joonas Koivunen
89ee8f2028 fix: demote warnings, fix flakyness (#4837)
The `WARN ... found future (image|delta) layer` log lines are not actionable.
They don't need to be warnings; `info!` is enough.

This also fixes some known but untracked flakiness in
[`test_remote_timeline_client_calls_started_metric`][evidence].

[evidence]:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4829/5683495367/index.html#/testresult/34fe79e24729618b

Closes #3369.
Closes #4473.
2023-07-31 07:43:12 +00:00
18 changed files with 233 additions and 243 deletions

View File

@@ -551,10 +551,8 @@ FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
# There is no release tag yet
RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \
echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \

View File

@@ -193,6 +193,13 @@ fn main() -> Result<()> {
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
// TODO this can stall startups in the unlikely event that we bind
// this compute node while it's busy prewarming. It's not too
// bad because it's just 100ms and unlikely, but it's an
// avoidable problem.
// compute.prewarm_postgres()?;
let mut state = compute.state.lock().unwrap();
while state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap();

View File

@@ -532,6 +532,50 @@ impl ComputeNode {
Ok(())
}
/// Start and stop a postgres process to warm up the VM for startup.
pub fn prewarm_postgres(&self) -> Result<()> {
info!("prewarming");
// Create pgdata
let pgdata = &format!("{}.warmup", self.pgdata);
create_pgdata(pgdata)?;
// Run initdb to completion
info!("running initdb");
let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
Command::new(initdb_bin)
.args(["-D", pgdata])
.output()
.expect("cannot start initdb process");
// Write conf
use std::io::Write;
let conf_path = Path::new(pgdata).join("postgresql.conf");
let mut file = std::fs::File::create(conf_path)?;
writeln!(file, "shared_buffers=65536")?;
writeln!(file, "port=51055")?; // Nobody should be connecting
writeln!(file, "shared_preload_libraries = 'neon'")?;
// Start postgres
info!("starting postgres");
let mut pg = Command::new(&self.pgbin)
.args(["-D", pgdata])
.spawn()
.expect("cannot start postgres process");
// Stop it when it's ready
info!("waiting for postgres");
wait_for_postgres(&mut pg, Path::new(pgdata))?;
pg.kill()?;
info!("sent kill signal");
pg.wait()?;
info!("done prewarming");
// clean up
let _ok = fs::remove_dir_all(pgdata);
Ok(())
}
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
#[instrument(skip_all)]

View File

@@ -390,39 +390,42 @@ where
}
#[allow(dead_code)]
pub fn dump(&self) -> Result<()> {
self.dump_recurse(self.root_blk, &[], 0)
}
pub async fn dump(&self) -> Result<()> {
let mut stack = Vec::new();
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
let blk = self.reader.read_blk(self.start_blk + blknum)?;
let buf: &[u8] = blk.as_ref();
stack.push((self.root_blk, String::new(), 0, 0, 0));
let node = OnDiskNode::<L>::deparse(buf)?;
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
let blk = self.reader.read_blk(self.start_blk + blknum)?;
let buf: &[u8] = blk.as_ref();
let node = OnDiskNode::<L>::deparse(buf)?;
print!("{:indent$}", "", indent = depth * 2);
println!(
"blk #{}: path {}: prefix {}, suffix_len {}",
blknum,
hex::encode(path),
hex::encode(node.prefix),
node.suffix_len
);
if child_idx == 0 {
print!("{:indent$}", "", indent = depth * 2);
let path_prefix = stack
.iter()
.map(|(_blknum, path, ..)| path.as_str())
.collect::<String>();
println!(
"blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
hex::encode(node.prefix),
node.suffix_len
);
}
let mut idx = 0;
let mut key_off = 0;
while idx < node.num_children {
if child_idx + 1 < node.num_children {
let key_off = key_off + node.suffix_len as usize;
stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
}
let key = &node.keys[key_off..key_off + node.suffix_len as usize];
let val = node.value(idx as usize);
let val = node.value(child_idx as usize);
print!("{:indent$}", "", indent = depth * 2 + 2);
println!("{}: {}", hex::encode(key), hex::encode(val.0));
if node.level > 0 {
let child_path = [path, node.prefix].concat();
self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
}
idx += 1;
key_off += node.suffix_len as usize;
}
Ok(())
}
@@ -754,8 +757,8 @@ mod tests {
}
}
#[test]
fn basic() -> Result<()> {
#[tokio::test]
async fn basic() -> Result<()> {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
@@ -775,7 +778,7 @@ mod tests {
let reader = DiskBtreeReader::new(0, root_offset, disk);
reader.dump()?;
reader.dump().await?;
// Test the `get` function on all the keys.
for (key, val) in all_data.iter() {
@@ -835,8 +838,8 @@ mod tests {
Ok(())
}
#[test]
fn lots_of_keys() -> Result<()> {
#[tokio::test]
async fn lots_of_keys() -> Result<()> {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
@@ -856,7 +859,7 @@ mod tests {
let reader = DiskBtreeReader::new(0, root_offset, disk);
reader.dump()?;
reader.dump().await?;
use std::sync::Mutex;
@@ -994,8 +997,8 @@ mod tests {
///
/// This test contains a particular data set, see disk_btree_test_data.rs
///
#[test]
fn particular_data() -> Result<()> {
#[tokio::test]
async fn particular_data() -> Result<()> {
// Build a tree from it
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
@@ -1022,7 +1025,7 @@ mod tests {
})?;
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());
reader.dump()?;
reader.dump().await?;
Ok(())
}

View File

@@ -223,6 +223,45 @@ mod tests {
assert_eq!(part, expected);
}
#[test]
fn v2_indexpart_is_parsed_with_deleted_at() {
let example = r#"{
"version":2,
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"missing_layers":["This shouldn't fail deserialization"],
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"deleted_at": "2023-07-31T09:00:00.123"
}"#;
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 2,
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
#[test]
fn empty_layers_are_parsed() {
let empty_layers_json = r#"{

View File

@@ -256,7 +256,7 @@ impl Layer for DeltaLayer {
file,
);
tree_reader.dump()?;
tree_reader.dump().await?;
let mut cursor = file.block_cursor();

View File

@@ -175,7 +175,7 @@ impl Layer for ImageLayer {
let tree_reader =
DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
tree_reader.dump()?;
tree_reader.dump().await?;
tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
println!("key: {} offset {}", hex::encode(key), value);

View File

@@ -1600,7 +1600,7 @@ impl Timeline {
if let Some(imgfilename) = ImageFileName::parse_str(&fname) {
// create an ImageLayer struct for each image file.
if imgfilename.lsn > disk_consistent_lsn {
warn!(
info!(
"found future image layer {} on timeline {} disk_consistent_lsn is {}",
imgfilename, self.timeline_id, disk_consistent_lsn
);
@@ -1632,7 +1632,7 @@ impl Timeline {
// is 102, then it might not have been fully flushed to disk
// before crash.
if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
warn!(
info!(
"found future delta layer {} on timeline {} disk_consistent_lsn is {}",
deltafilename, self.timeline_id, disk_consistent_lsn
);
@@ -1774,7 +1774,7 @@ impl Timeline {
match remote_layer_name {
LayerFileName::Image(imgfilename) => {
if imgfilename.lsn > up_to_date_disk_consistent_lsn {
warn!(
info!(
"found future image layer {} on timeline {} remote_consistent_lsn is {}",
imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
);
@@ -1799,7 +1799,7 @@ impl Timeline {
// is 102, then it might not have been fully flushed to disk
// before crash.
if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
warn!(
info!(
"found future delta layer {} on timeline {} remote_consistent_lsn is {}",
deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
);

View File

@@ -88,50 +88,16 @@ static LWLockId lfc_lock;
static int lfc_max_size;
static int lfc_size_limit;
static int lfc_free_space_watermark;
static int lfc_free_memory_watermark;
static char* lfc_path;
static FileCacheControl* lfc_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
#endif
static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark or lfc_free_memory_watermak are reached */
static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */
void FileCacheMonitorMain(Datum main_arg);
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
static size_t
get_available_memory(void)
{
size_t total;
size_t sizeof_total = sizeof(total);
if (sysctlbyname("hw.memsize", &total, &sizeof_total, NULL, 0) < 0)
elog(ERROR, "Failed to get amount of RAM: %m");
return total;
}
#else
#include <sys/sysinfo.h>
static size_t
get_available_memory(void)
{
struct sysinfo si;
if (sysinfo(&si) < 0)
elog(ERROR, "Failed to get amount of RAM: %m");
return si.totalram*si.mem_unit;
}
#endif
static void
lfc_shmem_startup(void)
{
@@ -229,11 +195,10 @@ lfc_change_limit_hook(int newval, void *extra)
}
/*
* Local file system state monitor check available free space and memory.
* If available disk space is lower than lfc_free_space_watermark or
* available memory is lower than lfc_free_memory_watermark then we shrink size of local cache
* Local file system state monitor check available free space.
* If it is lower than lfc_free_space_watermark then we shrink size of local cache
* but throwing away least recently accessed chunks.
* First time the watermark is reached cache size is divided by two,
* First time low space watermark is reached cache size is divided by two,
* second time by four,... Finally we remove all chunks from local cache.
*
* Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler.
@@ -263,27 +228,23 @@ FileCacheMonitorMain(Datum main_arg)
{
if (lfc_size_limit != 0)
{
bool shrink_cache = false;
if (lfc_free_space_watermark != 0)
struct statvfs sfs;
if (statvfs(lfc_path, &sfs) < 0)
{
struct statvfs sfs;
if (statvfs(lfc_path, &sfs) < 0)
elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
else
shrink_cache |= sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB;
}
if (lfc_free_memory_watermark != 0)
shrink_cache |= get_available_memory() < lfc_free_memory_watermark*MB;
if (shrink_cache)
{
if (lfc_shrinking_factor < 31) {
lfc_shrinking_factor += 1;
}
lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
}
else
lfc_shrinking_factor = 0; /* reset to initial value */
{
if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
{
if (lfc_shrinking_factor < 31) {
lfc_shrinking_factor += 1;
}
lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
}
else
lfc_shrinking_factor = 0; /* reset to initial value */
}
}
pg_usleep(monitor_interval);
}
@@ -356,19 +317,6 @@ lfc_init(void)
NULL,
NULL);
DefineCustomIntVariable("neon.free_memory_watermark",
"Minimal free memory in system after reaching which local file cache will be truncated",
NULL,
&lfc_free_memory_watermark,
0, /* disabled by default, because iurt makes sense only when local file cache is located i tmpfs */
0,
INT_MAX,
PGC_SIGHUP,
GUC_UNIT_MB,
NULL,
NULL,
NULL);
DefineCustomStringVariable("neon.file_cache_path",
"Path to local file cache (can be raw device)",
NULL,

View File

@@ -5,7 +5,7 @@ use crate::{
auth::{self, AuthFlow, ClientCredentials},
compute,
console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
proxy::{try_wake, NUM_RETRIES_CONNECT},
proxy::handle_try_wake,
sasl, scram,
stream::PqStream,
};
@@ -51,14 +51,15 @@ pub(super) async fn authenticate(
}
};
info!("compute node's state has likely changed; requesting a wake-up");
let mut num_retries = 0;
let mut node = loop {
num_retries += 1;
match try_wake(api, extra, creds).await? {
let wake_res = api.wake_compute(extra, creds).await;
match handle_try_wake(wake_res, num_retries)? {
ControlFlow::Continue(_) => num_retries += 1,
ControlFlow::Break(n) => break n,
ControlFlow::Continue(_) if num_retries < NUM_RETRIES_CONNECT => continue,
ControlFlow::Continue(e) => return Err(e.into()),
}
info!(num_retries, "retrying wake compute");
};
if let Some(keys) = scram_keys {
use tokio_postgres::config::AuthKeys;

View File

@@ -347,11 +347,6 @@ async fn connect_to_compute_once(
.await
}
enum ConnectionState<E> {
Cached(console::CachedNodeInfo),
Invalid(compute::ConnCfg, E),
}
#[async_trait]
pub trait ConnectMechanism {
type Connection;
@@ -407,70 +402,67 @@ where
mechanism.update_connect_config(&mut node_info.config);
let mut num_retries = 0;
let mut state = ConnectionState::<M::ConnectError>::Cached(node_info);
// try once
let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
(invalidate_cache(node_info), e)
}
};
loop {
match state {
ConnectionState::Invalid(config, err) => {
info!("compute node's state has likely changed; requesting a wake-up");
let mut num_retries = 1;
let wake_res = match creds {
auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
// nothing to do?
auth::BackendType::Link(_) => return Err(err.into()),
// test backend
auth::BackendType::Test(x) => x.wake_compute(),
};
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
info!("compute node's state has likely changed; requesting a wake-up");
let node_info = loop {
let wake_res = match creds {
auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
// nothing to do?
auth::BackendType::Link(_) => return Err(err.into()),
// test backend
auth::BackendType::Test(x) => x.wake_compute(),
};
match handle_try_wake(wake_res) {
// there was an error communicating with the control plane
Err(e) => return Err(e.into()),
// failed to wake up but we can continue to retry
Ok(ControlFlow::Continue(_)) => {
state = ConnectionState::Invalid(config, err);
let wait_duration = retry_after(num_retries);
num_retries += 1;
info!(num_retries, "retrying wake compute");
time::sleep(wait_duration).await;
continue;
}
// successfully woke up a compute node and can break the wakeup loop
Ok(ControlFlow::Break(mut node_info)) => {
node_info.config.reuse_password(&config);
mechanism.update_connect_config(&mut node_info.config);
state = ConnectionState::Cached(node_info)
}
}
match handle_try_wake(wake_res, num_retries)? {
// failed to wake up but we can continue to retry
ControlFlow::Continue(_) => {}
// successfully woke up a compute node and can break the wakeup loop
ControlFlow::Break(mut node_info) => {
node_info.config.reuse_password(&config);
mechanism.update_connect_config(&mut node_info.config);
break node_info;
}
ConnectionState::Cached(node_info) => {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
if !e.should_retry(num_retries) {
return Err(e.into());
}
}
// after the first connect failure,
// we should invalidate the cache and wake up a new compute node
if num_retries == 0 {
state = ConnectionState::Invalid(invalidate_cache(node_info), e);
} else {
state = ConnectionState::Cached(node_info);
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
let wait_duration = retry_after(num_retries);
num_retries += 1;
time::sleep(wait_duration).await;
info!(num_retries, "retrying wake compute");
};
info!(num_retries, "retrying wake compute");
time::sleep(wait_duration).await;
}
// now that we have a new node, try connect to it repeatedly.
// this can error for a few reasons, for instance:
// * DNS connection settings haven't quite propagated yet
info!("wake_compute success. attempting to connect");
loop {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
if !e.should_retry(num_retries) {
return Err(e.into());
}
}
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
time::sleep(wait_duration).await;
info!(num_retries, "retrying connect_once");
}
}
@@ -478,12 +470,15 @@ where
/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
/// * Returns Ok(Break(node)) if the wakeup succeeded
/// * Returns Err(e) if there was an error
fn handle_try_wake(
pub fn handle_try_wake(
result: Result<console::CachedNodeInfo, WakeComputeError>,
num_retries: u32,
) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
match result {
Err(err) => match &err {
WakeComputeError::ApiError(api) if api.could_retry() => Ok(ControlFlow::Continue(err)),
WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
Ok(ControlFlow::Continue(err))
}
_ => Err(err),
},
// Ready to try again.
@@ -491,22 +486,10 @@ fn handle_try_wake(
}
}
/// Attempts to wake up the compute node.
pub async fn try_wake(
api: &impl console::Api,
extra: &console::ConsoleReqExtra<'_>,
creds: &auth::ClientCredentials<'_>,
) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
info!("compute node's state has likely changed; requesting a wake-up");
handle_try_wake(api.wake_compute(extra, creds).await)
}
pub trait ShouldRetry {
fn could_retry(&self) -> bool;
fn should_retry(&self, num_retries: u32) -> bool {
match self {
// retry all errors at least once
_ if num_retries == 0 => true,
_ if num_retries >= NUM_RETRIES_CONNECT => false,
err => err.could_retry(),
}
@@ -558,14 +541,9 @@ impl ShouldRetry for compute::ConnectionError {
}
}
pub fn retry_after(num_retries: u32) -> time::Duration {
match num_retries {
0 => time::Duration::ZERO,
_ => {
// 3/2 = 1.5 which seems to be an ok growth factor heuristic
BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)
}
}
fn retry_after(num_retries: u32) -> time::Duration {
// 1.5 seems to be an ok growth factor heuristic
BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
}
/// Finish client connection initialization: confirm auth success, send params, etc.

View File

@@ -302,7 +302,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
#[test]
fn connect_compute_total_wait() {
let mut total_wait = tokio::time::Duration::ZERO;
for num_retries in 0..10 {
for num_retries in 1..10 {
total_wait += retry_after(num_retries);
}
assert!(total_wait < tokio::time::Duration::from_secs(12));

View File

@@ -234,7 +234,10 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
listen_pg_addr_tenant_only
);
let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
error!(
"failed to bind to address {}: {}",
listen_pg_addr_tenant_only, e
);
e
})?;
Some(listener)

View File

@@ -257,28 +257,15 @@ def prepare_snapshot(
shutil.rmtree(repo_dir / "pgdatadirs")
os.mkdir(repo_dir / "endpoints")
# Remove wal-redo temp directory if it exists. Newer pageserver versions don't create
# them anymore, but old versions did.
for tenant in (repo_dir / "tenants").glob("*"):
wal_redo_dir = tenant / "wal-redo-datadir.___temp"
if wal_redo_dir.exists() and wal_redo_dir.is_dir():
shutil.rmtree(wal_redo_dir)
# Update paths and ports in config files
pageserver_toml = repo_dir / "pageserver.toml"
pageserver_config = toml.load(pageserver_toml)
pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage")
pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port(
pageserver_config["listen_http_addr"]
)
pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port(
pageserver_config["listen_pg_addr"]
)
for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"):
pageserver_config[param] = port_distributor.replace_with_new_port(pageserver_config[param])
# Older pageserver versions had just one `auth_type` setting. Now there
# are separate settings for pg and http ports. We don't use authentication
# in compatibility tests so just remove authentication related settings.
pageserver_config.pop("auth_type", None)
# We don't use authentication in compatibility tests
# so just remove authentication related settings.
pageserver_config.pop("pg_auth_type", None)
pageserver_config.pop("http_auth_type", None)
@@ -290,19 +277,16 @@ def prepare_snapshot(
snapshot_config_toml = repo_dir / "config"
snapshot_config = toml.load(snapshot_config_toml)
broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}"
snapshot_config["broker"] = {"listen_addr": broker_listen_addr}
snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port(
snapshot_config["pageserver"]["listen_http_addr"]
)
snapshot_config["pageserver"]["listen_pg_addr"] = port_distributor.replace_with_new_port(
snapshot_config["pageserver"]["listen_pg_addr"]
for param in ("listen_http_addr", "listen_pg_addr"):
snapshot_config["pageserver"][param] = port_distributor.replace_with_new_port(
snapshot_config["pageserver"][param]
)
snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port(
snapshot_config["broker"]["listen_addr"]
)
for sk in snapshot_config["safekeepers"]:
sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"])
sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"])
for param in ("http_port", "pg_port", "pg_tenant_only_port"):
sk[param] = port_distributor.replace_with_new_port(sk[param])
if pg_distrib_dir:
snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir)

View File

@@ -14,10 +14,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env = neon_env_builder.init_start()
# These warnings are expected, when the pageserver is restarted abruptly
env.pageserver.allowed_errors.append(".*found future image layer.*")
env.pageserver.allowed_errors.append(".*found future delta layer.*")
pageserver_http = env.pageserver.http_client()
# Use aggressive GC and checkpoint settings, so that we also exercise GC during the test

View File

@@ -72,10 +72,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
# These warnings are expected, when the pageserver is restarted abruptly
env.pageserver.allowed_errors.append(".*found future image layer.*")
env.pageserver.allowed_errors.append(".*found future delta layer.*")
# Use a tiny checkpoint distance, to create a lot of layers quickly.
# That allows us to stress the compaction and layer flushing logic more.
tenant, _ = env.neon_cli.create_tenant(

View File

@@ -15,10 +15,6 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
env.pageserver.is_testing_enabled_or_skip()
# These warnings are expected, when the pageserver is restarted abruptly
env.pageserver.allowed_errors.append(".*found future delta layer.*")
env.pageserver.allowed_errors.append(".*found future image layer.*")
# Create a branch for us
env.neon_cli.create_branch("test_pageserver_recovery", "main")

View File

@@ -348,9 +348,6 @@ def test_remote_storage_upload_queue_retries(
# XXX: should vary this test to selectively fail just layer uploads, index uploads, deletions
# but how do we validate the result after restore?
# these are always possible when we do an immediate stop. perhaps something with compacting has changed since.
env.pageserver.allowed_errors.append(r".*found future (delta|image) layer.*")
env.pageserver.stop(immediate=True)
env.endpoints.stop_all()