Use proper tokens and delimiters when listing S3

Kirill Bulatov
2023-03-17 18:19:05 +02:00
committed by Kirill Bulatov
parent b52389f228
commit 018c8b0e2b
2 changed files with 48 additions and 3 deletions
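
Two listing bugs are fixed here. First, the list_objects_v2 request never set a delimiter, so S3 returned a flat key listing instead of grouping keys into common prefixes. Second, the pagination loop read the response's continuation_token, which merely echoes the token sent with the request, instead of next_continuation_token, the cursor for the next page; since the first request carries no token, the echoed field was always None and listing silently stopped after a single page of at most 1000 keys. A minimal sketch of the corrected loop, not the actual pageserver code, assuming an aws-sdk-s3 version with public output fields (as used in this diff) and illustrative client/bucket/prefix arguments:

// A minimal sketch: list the common prefixes ("directories") under `prefix`,
// following S3 pagination to the end. `client`, `bucket`, and `prefix` are
// illustrative, not names from this commit.
async fn list_prefixes(
    client: &aws_sdk_s3::Client,
    bucket: &str,
    prefix: &str,
) -> anyhow::Result<Vec<String>> {
    let mut prefixes = Vec::new();
    let mut continuation_token = None;
    loop {
        let response = client
            .list_objects_v2()
            .bucket(bucket)
            .prefix(prefix)
            // Group keys at '/' so S3 reports common prefixes instead of a flat listing.
            .delimiter("/")
            .set_continuation_token(continuation_token)
            .send()
            .await?;
        prefixes.extend(
            response
                .common_prefixes
                .unwrap_or_default()
                .into_iter()
                .filter_map(|p| p.prefix),
        );
        // next_continuation_token is the cursor for the following page;
        // continuation_token would only echo what we sent and end the loop
        // after the first page.
        match response.next_continuation_token {
            Some(token) => continuation_token = Some(token),
            None => break,
        }
    }
    Ok(prefixes)
}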


@@ -291,6 +291,7 @@ impl RemoteStorage for S3Bucket {
                 .list_objects_v2()
                 .bucket(self.bucket_name.clone())
                 .set_prefix(self.prefix_in_bucket.clone())
+                .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
                 .set_continuation_token(continuation_token)
                 .send()
                 .await
@@ -306,7 +307,7 @@ impl RemoteStorage for S3Bucket {
                     .filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))),
             );
-            match fetch_response.continuation_token {
+            match fetch_response.next_continuation_token {
                 Some(new_token) => continuation_token = Some(new_token),
                 None => break,
             }
@@ -371,7 +372,7 @@ impl RemoteStorage for S3Bucket {
                     .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
             );
-            match fetch_response.continuation_token {
+            match fetch_response.next_continuation_token {
                 Some(new_token) => continuation_token = Some(new_token),
                 None => break,
             }
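
Both hunks above thread the continuation token by hand. As a design note, recent releases of aws-sdk-s3 can drive the same loop through into_paginator(), which makes the continuation_token/next_continuation_token mix-up impossible. A rough sketch, assuming a recent SDK version where the paginator API is available; the helper and its arguments are illustrative:

// A sketch under the assumption of a recent aws-sdk-s3: the paginator threads
// next_continuation_token between requests internally.
async fn list_all_keys(client: &aws_sdk_s3::Client, bucket: &str) -> anyhow::Result<Vec<String>> {
    let mut keys = Vec::new();
    let mut pages = client
        .list_objects_v2()
        .bucket(bucket)
        .into_paginator()
        .send();
    // Each item yielded by the stream is one full ListObjectsV2 response page.
    while let Some(page) = pages.next().await {
        keys.extend(page?.contents.unwrap_or_default().into_iter().filter_map(|o| o.key));
    }
    Ok(keys)
}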


@@ -6,7 +6,7 @@ import shutil
 import threading
 import time
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Set, Tuple

 import pytest
 from fixtures.log_helper import log
@@ -717,6 +717,50 @@ def test_empty_branch_remote_storage_upload_on_restart(
), f"New branch should have been reuploaded on pageserver restart to the remote storage path '{new_branch_on_remote_storage}'"
# Test creates >1000 timelines and upload them to the remote storage.
# AWS S3 does not return more than 1000 items and starts paginating, ensure that pageserver paginates correctly.
@pytest.mark.skip("Too slow to run, requires too much disk space to run")
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
def test_thousands_of_branches(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_compaction_downloads_on_demand_without_image_creation",
)
env = neon_env_builder.init_start()
client = env.pageserver.http_client()
expected_timelines: Set[TimelineId] = set([])
tenant_id = env.initial_tenant
pg = env.postgres.create_start("main", tenant_id=tenant_id)
max_timelines = 1500
for i in range(0, max_timelines):
new_timeline_id = TimelineId.generate()
log.info(f"Creating timeline {new_timeline_id}, {i + 1} out of {max_timelines}")
expected_timelines.add(new_timeline_id)
client.timeline_create(tenant_id, new_timeline_id=new_timeline_id)
client.timeline_checkpoint(tenant_id, new_timeline_id)
wait_for_last_flush_lsn(env, pg, tenant_id, new_timeline_id)
with pg.cursor() as cur:
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
wait_for_upload(client, tenant_id, new_timeline_id, current_lsn)
client.tenant_detach(tenant_id=tenant_id)
client.tenant_attach(tenant_id=tenant_id)
timelines_after_reattach = set(
[timeline["timeline_id"] for timeline in client.timeline_list(tenant_id=tenant_id)]
)
assert (
expected_timelines == timelines_after_reattach
), f"Timelines after reattach do not match the ones created initially. \
Missing timelines: {expected_timelines - timelines_after_reattach}, extra timelines: {timelines_after_reattach - expected_timelines}"
def wait_upload_queue_empty(
client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
):