diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 57635f4920..71ca7329ee 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1101,8 +1101,8 @@ jobs: $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done done - docker buildx imagetools create -t neondatabase/neon-test-extensions:latest \ - neondatabase/neon-test-extensions:${{ needs.tag.outputs.build-tag }} + docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ + neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 336d1c3fb8..25d00d6dfd 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -919,6 +919,14 @@ impl Timeline { result.add_key(AUX_FILES_KEY); } + #[cfg(test)] + { + let guard = self.extra_test_dense_keyspace.load(); + for kr in &guard.ranges { + result.add_range(kr.clone()); + } + } + Ok(( result.to_keyspace(), /* AUX sparse key space */ diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d3fbbad598..63ce39d81f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5331,6 +5331,9 @@ mod tests { let cancel = CancellationToken::new(); let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let mut test_key_end = test_key; + test_key_end.field6 = NUM_KEYS as u32; + tline.add_extra_test_dense_keyspace(KeySpace::single(test_key..test_key_end)); let mut keyspace = KeySpaceAccum::new(); @@ -6290,8 +6293,8 @@ mod tests { let cancel = CancellationToken::new(); - let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); - base_key.field1 = AUX_KEY_PREFIX; + let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... let mut test_key = base_key; let mut lsn = Lsn(0x10); @@ -6396,6 +6399,7 @@ mod tests { Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN ) .await?; + tline.add_extra_test_dense_keyspace(KeySpace::single(base_key..(base_key_nonexist.next()))); let child = tenant .branch_timeline_test_with_layers( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f032de3675..48962e45e9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -426,6 +426,14 @@ pub struct Timeline { /// Indicate whether aux file v2 storage is enabled. pub(crate) last_aux_file_policy: AtomicAuxFilePolicy, + + /// Some test cases directly place keys into the timeline without actually modifying the directory + /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that + /// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense + /// keyspace return value of `collect_keyspace`. For sparse keyspaces, use AUX keys for testing, and + /// in the future, add `extra_test_sparse_keyspace` if necessary. + #[cfg(test)] + pub(crate) extra_test_dense_keyspace: ArcSwap, } pub struct WalReceiverInfo { @@ -2354,6 +2362,9 @@ impl Timeline { aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy), + + #[cfg(test)] + extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -5611,6 +5622,13 @@ impl Timeline { } Ok(layers) } + + #[cfg(test)] + pub(crate) fn add_extra_test_dense_keyspace(&self, ks: KeySpace) { + let mut keyspace = self.extra_test_dense_keyspace.load().as_ref().clone(); + keyspace.merge(&ks); + self.extra_test_dense_keyspace.store(Arc::new(keyspace)); + } } type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 6305f2ec92..8edaf65639 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -3112,12 +3112,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf request_lsn = UINT64_MAX; /* - * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU + * GetRedoStartLsn() returns LSN of the basebackup. We know that the SLRU * segment has not changed since the basebackup, because in order to * modify it, we would have had to download it already. And once * downloaded, we never evict SLRU segments from local disk. */ - not_modified_since = GetRedoStartLsn(); + not_modified_since = nm_adjust_lsn(GetRedoStartLsn()); SlruKind kind; diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 1a0012397c..772a39fe35 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,5 +1,4 @@ import json -import os from pathlib import Path from typing import Any, Dict, Tuple @@ -35,10 +34,6 @@ from performance.pageserver.util import ( @pytest.mark.timeout( 10000 ) # TODO: this value is just "a really high number"; have this per instance type -@pytest.mark.skipif( - os.getenv("CI", "false") == "true", - reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/6724", -) def test_pageserver_max_throughput_getpage_at_latest_lsn( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, @@ -91,6 +86,14 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( n_tenants, setup_wrapper, ) + + env.pageserver.allowed_errors.append( + # https://github.com/neondatabase/neon/issues/6925 + # https://github.com/neondatabase/neon/issues/6390 + # https://github.com/neondatabase/neon/issues/6724 + r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" + ) + run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py index 4af7dcdfc3..d6babe4393 100644 --- a/test_runner/regress/test_ondemand_slru_download.py +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -129,3 +129,33 @@ def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count cur_replica = conn_replica.cursor() cur_replica.execute("SELECT * FROM clogtest") assert cur_replica.fetchall() == [(1,), (3,)] + + +def test_ondemand_download_after_wal_switch(neon_env_builder: NeonEnvBuilder): + """ + Test on-demand SLRU download on standby, when starting right after + WAL segment switch. + + This is a repro for a bug in how the LSN at WAL page/segment + boundary was handled (https://github.com/neondatabase/neon/issues/8030) + """ + + tenant_conf = { + "lazy_slru_download": "true", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + endpoint = env.endpoints.create_start("main") + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Start standby at WAL segment boundary + cur.execute("SELECT pg_switch_wal()") + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + _endpoint_at_lsn = env.endpoints.create_start( + branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn + ) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 73a24c42d6..15f820bebd 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -304,7 +304,9 @@ files: - slot_name values: [restart_lsn] query: | - select slot_name, (restart_lsn - '0/0')::FLOAT8 from pg_replication_slots where slot_type = 'logical'; + select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn + from pg_replication_slots + where slot_type = 'logical'; - metric_name: retained_wal type: gauge