# It's possible to run any regular test with the local fs remote storage via # env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... import time from collections import defaultdict from typing import Any, DefaultDict, Dict, Tuple from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, flush_ep_to_pageserver, last_flush_lsn_upload, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, wait_for_upload, wait_for_upload_queue_empty, wait_until_tenant_active, ) from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import query_scalar, wait_until def get_num_downloaded_layers(client: PageserverHttpClient): """ This assumes that the pageserver only has a single tenant. """ value = client.get_metric_value( "pageserver_remote_operation_seconds_count", { "file_kind": "layer", "op_kind": "download", "status": "success", }, ) if value is None: return 0 return int(value) # # If you have a large relation, check that the pageserver downloads parts of it as # require by queries. # def test_ondemand_download_large_rel(neon_env_builder: NeonEnvBuilder): # thinking about using a shared environment? the test assumes that global # metrics are for single tenant. env = neon_env_builder.init_start( initial_tenant_conf={ # disable background GC "gc_period": "0s", "gc_horizon": f"{10 * 1024 ** 3}", # 10 GB # small checkpoint distance to create more delta layer files "checkpoint_distance": f"{10 * 1024 ** 2}", # 10 MB # allow compaction with the checkpoint "compaction_threshold": "3", "compaction_target_size": f"{10 * 1024 ** 2}", # 10 MB # but don't run compaction in background or on restart "compaction_period": "0s", } ) endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() tenant_id = env.initial_tenant timeline_id = env.initial_timeline # We want to make sure that the data is large enough that the keyspace is partitioned. num_rows = 1000000 with endpoint.cursor() as cur: # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") cur.execute( f"""CREATE TABLE tbl AS SELECT g as id, 'long string to consume some space' || g from generate_series(1,{num_rows}) g""" ) cur.execute("CREATE INDEX ON tbl (id)") cur.execute("VACUUM tbl") current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # stop endpoint before checkpoint to stop wal generation endpoint.stop() # stopping of safekeepers now will help us not to calculate logical size # after startup, so page requests should be the only one on-demand # downloading the layers for sk in env.safekeepers: sk.stop() # run checkpoint manually to be sure that data landed in remote storage client.timeline_checkpoint(tenant_id, timeline_id) # wait until pageserver successfully uploaded a checkpoint to remote storage wait_for_upload(client, tenant_id, timeline_id, current_lsn) log.info("uploads have finished") ##### Stop the first pageserver instance, erase all its data env.pageserver.stop() # remove all the layer files for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"): log.info(f"unlinking layer {layer}") layer.unlink() ##### Second start, restore the data and ensure it's the same env.pageserver.start() # start a readonly endpoint which we'll use to check the database. # readonly (with lsn=) is required so that we don't try to connect to # safekeepers, that have now been shut down. endpoint = env.endpoints.create_start("main", lsn=current_lsn) before_downloads = get_num_downloaded_layers(client) assert before_downloads != 0, "basebackup should on-demand non-zero layers" # Probe in the middle of the table. There's a high chance that the beginning # and end of the table was stored together in the same layer files with data # from other tables, and with the entry that stores the size of the # relation, so they are likely already downloaded. But the middle of the # table should not have been needed by anything yet. with endpoint.cursor() as cur: assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1 after_downloads = get_num_downloaded_layers(client) log.info(f"layers downloaded before {before_downloads} and after {after_downloads}") assert after_downloads > before_downloads # # If you have a relation with a long history of updates, the pageserver downloads the layer # files containing the history as needed by timetravel queries. # def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): # thinking about using a shared environment? the test assumes that global # metrics are for single tenant. env = neon_env_builder.init_start( initial_tenant_conf={ # Disable background GC & compaction # We don't want GC, that would break the assertion about num downloads. # We don't want background compaction, we force a compaction every time we do explicit checkpoint. "gc_period": "0s", "compaction_period": "0s", # small checkpoint distance to create more delta layer files "checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB "compaction_threshold": "1", "image_creation_threshold": "1", "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB } ) pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() tenant_id = env.initial_tenant timeline_id = env.initial_timeline #### # Produce layers #### lsns = [] table_len = 10000 with endpoint.cursor() as cur: cur.execute( f""" CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text); INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len}); """ ) current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) # wait until pageserver receives that data wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to be sure that data landed in remote storage client.timeline_checkpoint(tenant_id, timeline_id) lsns.append((0, current_lsn)) for checkpoint_number in range(1, 20): with endpoint.cursor() as cur: cur.execute(f"UPDATE testtab SET checkpoint_number = {checkpoint_number}") current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) lsns.append((checkpoint_number, current_lsn)) # wait until pageserver receives that data wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to be sure that data landed in remote storage client.timeline_checkpoint(tenant_id, timeline_id) # prevent new WAL from being produced, wait for layers to reach remote storage env.endpoints.stop_all() for sk in env.safekeepers: sk.stop() # NB: the wait_for_upload returns as soon as remote_consistent_lsn == current_lsn. # But the checkpoint also triggers a compaction # => image layer generation => # => doesn't advance LSN # => but we want the remote state to deterministic, so additionally, wait for upload queue to drain wait_for_upload(client, tenant_id, timeline_id, current_lsn) wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id) client.deletion_queue_flush(execute=True) env.pageserver.stop() env.pageserver.start() # We've shut down the SKs, then restarted the PSes to sever all walreceiver connections; # This means pageserver's remote_consistent_lsn is now frozen to whatever it was after the pageserver.stop() call. wait_until_tenant_active(client, tenant_id) ### # Produce layers complete; # Start the actual testing. ### def get_api_current_physical_size(): d = client.timeline_detail(tenant_id, timeline_id) return d["current_physical_size"] def get_resident_physical_size(): return client.get_timeline_metric( tenant_id, timeline_id, "pageserver_resident_physical_size" ) filled_current_physical = get_api_current_physical_size() log.info(filled_current_physical) filled_size = get_resident_physical_size() log.info(filled_size) assert filled_current_physical == filled_size, "we don't yet do layer eviction" # Stop the first pageserver instance, erase all its data env.pageserver.stop() # remove all the layer files for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"): log.info(f"unlinking layer {layer}") layer.unlink() ##### Second start, restore the data and ensure it's the same env.pageserver.start() wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) # The current_physical_size reports the sum of layers loaded in the layer # map, regardless of where the layer files are located. So even though we # just removed the local files, they still count towards # current_physical_size because they are loaded as `RemoteLayer`s. assert filled_current_physical == get_api_current_physical_size() # Run queries at different points in time num_layers_downloaded = [0] resident_size = [get_resident_physical_size()] for checkpoint_number, lsn in lsns: endpoint_old = env.endpoints.create_start( branch_name="main", endpoint_id=f"ep-old_lsn_{checkpoint_number}", lsn=lsn ) with endpoint_old.cursor() as cur: # assert query_scalar(cur, f"select count(*) from testtab where checkpoint_number={checkpoint_number}") == 100000 assert ( query_scalar( cur, f"select count(*) from testtab where checkpoint_number<>{checkpoint_number}", ) == 0 ) assert ( query_scalar( cur, f"select count(*) from testtab where checkpoint_number={checkpoint_number}", ) == table_len ) after_downloads = get_num_downloaded_layers(client) num_layers_downloaded.append(after_downloads) log.info(f"num_layers_downloaded[-1]={num_layers_downloaded[-1]}") # Check that on each query, we need to download at least one more layer file. However in # practice, thanks to compaction and the fact that some requests need to download # more history, some points-in-time are covered by earlier downloads already. But # in broad strokes, as we query more points-in-time, more layers need to be downloaded. # # Do a fuzzy check on that, by checking that after each point-in-time, we have downloaded # more files than we had three iterations ago. log.info(f"layers downloaded after checkpoint {checkpoint_number}: {after_downloads}") if len(num_layers_downloaded) > 4: assert after_downloads > num_layers_downloaded[len(num_layers_downloaded) - 4] # Likewise, assert that the resident_physical_size metric grows as layers are downloaded resident_size.append(get_resident_physical_size()) log.info(f"resident_size[-1]={resident_size[-1]}") if len(resident_size) > 4: assert resident_size[-1] > resident_size[len(resident_size) - 4] # current_physical_size reports the total size of all layer files, whether # they are present only in the remote storage, only locally, or both. # It should not change. assert filled_current_physical == get_api_current_physical_size() endpoint_old.stop() # # Ensure that the `download_remote_layers` API works # def test_download_remote_layers_api( neon_env_builder: NeonEnvBuilder, ): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) ##### First start, insert data and upload it to the remote storage env = neon_env_builder.init_start( initial_tenant_conf={ # Disable background GC & compaction # We don't want GC, that would break the assertion about num downloads. # We don't want background compaction, we force a compaction every time we do explicit checkpoint. "gc_period": "0s", "compaction_period": "0s", # small checkpoint distance to create more delta layer files "checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB "compaction_threshold": "999999", "image_creation_threshold": "999999", "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB } ) # This test triggers layer download failures on demand. It is possible to modify the failpoint # during a `Timeline::get_vectored` right between the vectored read and it's validation read. # This means that one of the reads can fail while the other one succeeds and vice versa. # TODO(vlad): Remove this block once the vectored read path validation goes away. env.pageserver.allowed_errors.extend( [ ".*initial_size_calculation.*Vectored get failed with downloading evicted layer file failed, but sequential get did not.*" ".*initial_size_calculation.*Sequential get failed with downloading evicted layer file failed, but vectored get did not.*" ] ) endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() tenant_id = env.initial_tenant timeline_id = env.initial_timeline table_len = 10000 with endpoint.cursor() as cur: cur.execute( f""" CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text); INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len}); """ ) last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) env.endpoints.stop_all() def get_api_current_physical_size(): d = client.timeline_detail(tenant_id, timeline_id) return d["current_physical_size"] def get_resident_physical_size(): return client.get_timeline_metric( tenant_id, timeline_id, "pageserver_resident_physical_size" ) # Shut down safekeepers before starting the pageserver. # If we don't, they might stream us more WAL. for sk in env.safekeepers: sk.stop() # it is sad we cannot do a flush inmem layer without compaction, but # working around with very high layer0 count and image layer creation # threshold client.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload_queue_empty(client, tenant_id, timeline_id) filled_current_physical = get_api_current_physical_size() log.info(f"filled_current_physical: {filled_current_physical}") filled_size = get_resident_physical_size() log.info(f"filled_size: {filled_size}") assert filled_current_physical == filled_size, "we don't yet do layer eviction" env.pageserver.stop() # remove all the layer files for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"): log.info(f"unlinking layer {layer.name}") layer.unlink() ##### Second start, restore the data and ensure it's the same env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"}) env.pageserver.allowed_errors.extend( [ ".*download failed: downloading evicted layer file failed.*", f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed", ] ) wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) ###### Phase 1: exercise download error code path this_time = get_api_current_physical_size() assert ( filled_current_physical == this_time ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote" post_unlink_size = get_resident_physical_size() log.info(f"post_unlink_size: {post_unlink_size}") assert ( post_unlink_size < filled_size ), "we just deleted layers and didn't cause anything to re-download them yet" # issue downloads that we know will fail info = client.timeline_download_remote_layers( tenant_id, timeline_id, max_concurrent_downloads=10, errors_ok=True, at_least_one_download=False, ) log.info(f"info={info}") assert info["state"] == "Completed" assert info["total_layer_count"] > 0 assert info["successful_download_count"] == 0 # can't assert == total_layer_count because timeline_detail also tries to # download layers for logical size, but this might not always hold. assert info["failed_download_count"] > 0 assert ( info["total_layer_count"] == info["successful_download_count"] + info["failed_download_count"] ) assert get_api_current_physical_size() == filled_current_physical assert ( get_resident_physical_size() == post_unlink_size ), "didn't download anything new due to failpoint" ##### Retry, this time without failpoints client.configure_failpoints(("remote-storage-download-pre-rename", "off")) info = client.timeline_download_remote_layers( tenant_id, timeline_id, # allow some concurrency to unveil potential concurrency bugs max_concurrent_downloads=10, errors_ok=False, ) log.info(f"info={info}") assert info["state"] == "Completed" assert info["total_layer_count"] > 0 assert info["successful_download_count"] > 0 assert info["failed_download_count"] == 0 assert ( info["total_layer_count"] == info["successful_download_count"] + info["failed_download_count"] ) refilled_size = get_resident_physical_size() log.info(refilled_size) assert filled_size == refilled_size, "we redownloaded all the layers" assert get_api_current_physical_size() == filled_current_physical for sk in env.safekeepers: sk.start() # ensure that all the data is back endpoint_old = env.endpoints.create_start(branch_name="main") with endpoint_old.cursor() as cur: assert query_scalar(cur, "select count(*) from testtab") == table_len def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder: NeonEnvBuilder): """ Create a few layers, then evict, then make sure compaction runs successfully. """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) conf = { # Disable background GC & compaction "gc_period": "0s", "compaction_period": "0s", # unused, because manual will be called after each table "checkpoint_distance": 100 * 1024**2, # this will be updated later on to allow manual compaction outside of checkpoints "compaction_threshold": 100, # repartitioning parameter, not required here "image_creation_threshold": 100, # repartitioning parameter, not required here "compaction_target_size": 128 * 1024**2, # pitr_interval and gc_horizon are not interesting because we dont run gc } env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf)) def downloaded_bytes_and_count(pageserver_http: PageserverHttpClient) -> Tuple[int, int]: m = pageserver_http.get_metrics() # these are global counters total_bytes = m.query_one("pageserver_remote_ondemand_downloaded_bytes_total").value assert ( total_bytes < 2**53 and total_bytes.is_integer() ), "bytes should still be safe integer-in-f64" count = m.query_one("pageserver_remote_ondemand_downloaded_layers_total").value assert count < 2**53 and count.is_integer(), "count should still be safe integer-in-f64" return (int(total_bytes), int(count)) pageserver_http = env.pageserver.http_client() tenant_id = env.initial_tenant timeline_id = env.initial_timeline with env.endpoints.create_start("main") as endpoint: # no particular reason to create the layers like this, but we are sure # not to hit the image_creation_threshold here. with endpoint.cursor() as cur: cur.execute("create table a as select id::bigint from generate_series(1, 204800) s(id)") wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) with endpoint.cursor() as cur: cur.execute("update a set id = -id") flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint" assert len(layers.historic_layers) == 1 + 2, "should have initdb layer and 2 deltas" layer_sizes = 0 for layer in layers.historic_layers: log.info(f"pre-compact: {layer}") assert layer.layer_file_size is not None, "we must know layer file sizes" layer_sizes += layer.layer_file_size pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) env.neon_cli.config_tenant(tenant_id, {"compaction_threshold": "3"}) pageserver_http.timeline_compact(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) for layer in layers.historic_layers: log.info(f"post compact: {layer}") assert len(layers.historic_layers) == 1, "should have compacted to single layer" post_compact = downloaded_bytes_and_count(pageserver_http) # use gte to allow pageserver to do other random stuff; this test could be run on a shared pageserver assert post_compact[0] >= layer_sizes assert post_compact[1] >= 3, "should had downloaded the three layers" def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: NeonEnvBuilder): """ Create layers, compact with high image_creation_threshold, then run final compaction with all layers evicted. Due to current implementation, this will make image creation on-demand download layers, but we cannot really directly test for it. """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) conf = { # Disable background GC & compaction "gc_period": "0s", "compaction_period": "0s", # repartitioning threshold is this / 10, but it doesn't really seem to matter "checkpoint_distance": 50 * 1024**2, "compaction_threshold": 3, # important: keep this high for the data ingestion "image_creation_threshold": 100, # repartitioning parameter, unused "compaction_target_size": 128 * 1024**2, # Always check if a new image layer can be created "image_layer_creation_check_threshold": 0, # pitr_interval and gc_horizon are not interesting because we dont run gc } env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf)) tenant_id = env.initial_tenant timeline_id = env.initial_timeline pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") # no particular reason to create the layers like this, but we are sure # not to hit the image_creation_threshold here. with endpoint.cursor() as cur: cur.execute("create table a (id bigserial primary key, some_value bigint not null)") cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)") wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) for i in range(0, 2): for j in range(0, 3): # create a minimal amount of "delta difficulty" for this table with endpoint.cursor() as cur: cur.execute("update a set some_value = -some_value + %s", (j,)) with endpoint.cursor() as cur: # vacuuming should aid to reuse keys, though it's not really important # with image_creation_threshold=1 which we will use on the last compaction cur.execute("vacuum") wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) if i == 1 and j == 2: # last iteration; stop before checkpoint to avoid leaving an inmemory layer endpoint.stop_and_destroy() pageserver_http.timeline_checkpoint(tenant_id, timeline_id) # images should not yet be created, because threshold is too high, # but these will be reshuffled to L1 layers pageserver_http.timeline_compact(tenant_id, timeline_id) for _ in range(0, 20): # loop in case flushing is still in progress layers = pageserver_http.layer_map_info(tenant_id, timeline_id) if not layers.in_memory_layers: break time.sleep(0.2) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint" kinds_before: DefaultDict[str, int] = defaultdict(int) for layer in layers.historic_layers: kinds_before[layer.kind] += 1 pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) assert dict(kinds_before) == {"Delta": 4} # now having evicted all layers, reconfigure to have lower image creation # threshold to expose image creation to downloading all of the needed # layers -- threshold of 2 would sound more reasonable, but keeping it as 1 # to be less flaky conf["image_creation_threshold"] = "1" env.neon_cli.config_tenant(tenant_id, {k: str(v) for k, v in conf.items()}) pageserver_http.timeline_compact(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) kinds_after: DefaultDict[str, int] = defaultdict(int) for layer in layers.historic_layers: kinds_after[layer.kind] += 1 assert dict(kinds_after) == {"Delta": 4, "Image": 1} def stringify(conf: Dict[str, Any]) -> Dict[str, str]: return dict(map(lambda x: (x[0], str(x[1])), conf.items()))