mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-10 15:02:56 +00:00
fix: compaction: no index upload scheduled if no on-demand downloads
Commit
0cf7fd0fb8
Compaction with on-demand download (#3598)
introduced a subtle bug: if we don't have to do on-demand downloads,
we only take one ROUND in fn compact() and exit early.
Thereby, we miss scheduling the index part upload for any layers
created by fn compact_inner().
Before that commit, we didn't have this problem.
So, this patch fixes it.
Since no regression test caught this, I went ahead and extended the
timeline size tests to assert that, if remote storage is configured,
1. pageserver_remote_physical_size matches the other physical sizes
2. file sizes reported by the layer map info endpoint match the other
physical size metrics
Without the pageserver code fix, the regression test would
fail at the physical size assertion, complaining that
the resident physical size metric does not match the remote physical size metric:
50790400.0 != 18399232.0
I figured out what the problem is by comparing the remote storage
and local directories like so, and noticed that the image layer
in the local directory wasn't present on the remote side.
Its size was exactly the difference:
50790400.0 - 18399232.0 = 32391168.0
fixes https://github.com/neondatabase/neon/issues/3738
This commit is contained in:
committed by
Christian Schwarz
parent
d1a0a907ff
commit
66a5159511
@@ -662,8 +662,8 @@ impl Timeline {
|
||||
// update the index file on next flush iteration too. But it
|
||||
// could take a while until that happens.
|
||||
//
|
||||
// Additionally, only do this on the terminal round before sleeping.
|
||||
if last_round {
|
||||
// Additionally, only do this once before we return from this function.
|
||||
if last_round || res.is_ok() {
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
}
|
||||
|
||||
@@ -3542,3 +3542,23 @@ def wait_for_sk_commit_lsn_to_reach_remote_storage(
|
||||
ps_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload(ps_http, tenant_id, timeline_id, lsn)
|
||||
return lsn
|
||||
|
||||
|
||||
def wait_for_upload_queue_empty(
|
||||
pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId
|
||||
):
|
||||
ps_http = pageserver.http_client()
|
||||
while True:
|
||||
all_metrics = ps_http.get_metrics()
|
||||
tl = all_metrics.query_all(
|
||||
"pageserver_remote_timeline_client_calls_unfinished",
|
||||
{
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
},
|
||||
)
|
||||
assert len(tl) > 0
|
||||
log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}")
|
||||
if all(m.value == 0 for m in tl):
|
||||
return
|
||||
time.sleep(0.2)
|
||||
|
||||
@@ -268,18 +268,14 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
|
||||
[layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote]
|
||||
)
|
||||
|
||||
# TODO: the following would be nice to assert, but for some reason, the commented-out
|
||||
# assert below fails with 113401856.0 != 140427264
|
||||
# => https://github.com/neondatabase/neon/issues/3738
|
||||
#
|
||||
# log.info("ensure that remote_physical_size metric matches layer map")
|
||||
# remote_physical_size_metric = ps_http.get_timeline_metric(
|
||||
# tenant_id, timeline_id, "pageserver_remote_physical_size"
|
||||
# )
|
||||
# log.info("ensure that remote_physical_size metric corresponds to layer map dump")
|
||||
# assert remote_physical_size_metric == sum(
|
||||
# [layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote]
|
||||
# )
|
||||
log.info("ensure that remote_physical_size metric matches layer map")
|
||||
remote_physical_size_metric = ps_http.get_timeline_metric(
|
||||
tenant_id, timeline_id, "pageserver_remote_physical_size"
|
||||
)
|
||||
log.info("ensure that remote_physical_size metric corresponds to layer map dump")
|
||||
assert remote_physical_size_metric == sum(
|
||||
layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote
|
||||
)
|
||||
|
||||
log.info("before runnning GC, ensure that remote_physical size is zero")
|
||||
ensure_resident_and_remote_size_metrics()
|
||||
|
||||
@@ -5,6 +5,7 @@ import threading
|
||||
import time
|
||||
from contextlib import closing
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import psycopg2.errors
|
||||
import psycopg2.extras
|
||||
@@ -18,9 +19,11 @@ from fixtures.neon_fixtures import (
|
||||
PgBin,
|
||||
PortDistributor,
|
||||
Postgres,
|
||||
RemoteStorageKind,
|
||||
VanillaPostgres,
|
||||
assert_tenant_status,
|
||||
wait_for_last_flush_lsn,
|
||||
wait_for_upload_queue_empty,
|
||||
wait_until,
|
||||
)
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
@@ -301,8 +304,18 @@ def test_timeline_initial_logical_size_calculation_cancellation(
|
||||
# message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists"
|
||||
|
||||
|
||||
def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
|
||||
def test_timeline_physical_size_init(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
|
||||
):
|
||||
|
||||
if remote_storage_kind is not None:
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_timeline_physical_size_init"
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init")
|
||||
pg = env.postgres.create_start("test_timeline_physical_size_init")
|
||||
|
||||
@@ -330,12 +343,22 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
|
||||
)
|
||||
|
||||
assert_physical_size_invariants(
|
||||
get_physical_size_values(env, env.initial_tenant, new_timeline_id)
|
||||
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
|
||||
remote_storage_kind,
|
||||
)
|
||||
|
||||
|
||||
def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
|
||||
def test_timeline_physical_size_post_checkpoint(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
|
||||
):
|
||||
if remote_storage_kind is not None:
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_timeline_physical_size_init"
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_checkpoint")
|
||||
pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint")
|
||||
@@ -353,11 +376,21 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
|
||||
pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
|
||||
|
||||
assert_physical_size_invariants(
|
||||
get_physical_size_values(env, env.initial_tenant, new_timeline_id)
|
||||
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
|
||||
remote_storage_kind,
|
||||
)
|
||||
|
||||
|
||||
def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder):
|
||||
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
|
||||
def test_timeline_physical_size_post_compaction(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
|
||||
):
|
||||
|
||||
if remote_storage_kind is not None:
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_timeline_physical_size_init"
|
||||
)
|
||||
|
||||
# Disable background compaction as we don't want it to happen after `get_physical_size` request
|
||||
# and before checking the expected size on disk, which makes the assertion failed
|
||||
neon_env_builder.pageserver_config_override = (
|
||||
@@ -386,15 +419,33 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
|
||||
)
|
||||
|
||||
wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
|
||||
|
||||
# shutdown safekeepers to prevent new data from coming in
|
||||
for sk in env.safekeepers:
|
||||
sk.stop()
|
||||
|
||||
pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
|
||||
pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id)
|
||||
|
||||
if remote_storage_kind is not None:
|
||||
wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id)
|
||||
|
||||
assert_physical_size_invariants(
|
||||
get_physical_size_values(env, env.initial_tenant, new_timeline_id)
|
||||
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
|
||||
remote_storage_kind,
|
||||
)
|
||||
|
||||
|
||||
def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
|
||||
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
|
||||
def test_timeline_physical_size_post_gc(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
|
||||
):
|
||||
|
||||
if remote_storage_kind is not None:
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_timeline_physical_size_init"
|
||||
)
|
||||
|
||||
# Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
|
||||
# and before checking the expected size on disk, which makes the assertion failed
|
||||
neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}"
|
||||
@@ -430,8 +481,12 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
|
||||
pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
|
||||
pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)
|
||||
|
||||
if remote_storage_kind is not None:
|
||||
wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id)
|
||||
|
||||
assert_physical_size_invariants(
|
||||
get_physical_size_values(env, env.initial_tenant, new_timeline_id)
|
||||
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
|
||||
remote_storage_kind,
|
||||
)
|
||||
|
||||
|
||||
@@ -515,18 +570,29 @@ def test_timeline_size_metrics(
|
||||
assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024)
|
||||
|
||||
|
||||
def test_tenant_physical_size(neon_simple_env: NeonEnv):
|
||||
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
|
||||
def test_tenant_physical_size(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
|
||||
):
|
||||
random.seed(100)
|
||||
|
||||
env = neon_simple_env
|
||||
if remote_storage_kind is not None:
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_timeline_physical_size_init"
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
tenant, timeline = env.neon_cli.create_tenant()
|
||||
if remote_storage_kind is not None:
|
||||
wait_for_upload_queue_empty(env.pageserver, tenant, timeline)
|
||||
|
||||
def get_timeline_resident_physical_size(timeline: TimelineId):
|
||||
sizes = get_physical_size_values(env, tenant, timeline)
|
||||
assert_physical_size_invariants(sizes)
|
||||
sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind)
|
||||
assert_physical_size_invariants(sizes, remote_storage_kind)
|
||||
return sizes.prometheus_resident_physical
|
||||
|
||||
timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline)
|
||||
@@ -546,6 +612,9 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
|
||||
wait_for_last_flush_lsn(env, pg, tenant, timeline)
|
||||
pageserver_http.timeline_checkpoint(tenant, timeline)
|
||||
|
||||
if remote_storage_kind is not None:
|
||||
wait_for_upload_queue_empty(env.pageserver, tenant, timeline)
|
||||
|
||||
timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline)
|
||||
|
||||
pg.stop()
|
||||
@@ -563,21 +632,39 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
|
||||
|
||||
class TimelinePhysicalSizeValues:
|
||||
api_current_physical: int
|
||||
prometheus_resident_physical: int
|
||||
prometheus_resident_physical: float
|
||||
prometheus_remote_physical: Optional[float] = None
|
||||
python_timelinedir_layerfiles_physical: int
|
||||
layer_map_file_size_sum: int
|
||||
|
||||
|
||||
def get_physical_size_values(
|
||||
env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId
|
||||
env: NeonEnv,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
remote_storage_kind: Optional[RemoteStorageKind],
|
||||
) -> TimelinePhysicalSizeValues:
|
||||
res = TimelinePhysicalSizeValues()
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
res.prometheus_resident_physical = int(
|
||||
client.get_timeline_metric(tenant_id, timeline_id, "pageserver_resident_physical_size")
|
||||
res.layer_map_file_size_sum = sum(
|
||||
layer.layer_file_size or 0
|
||||
for layer in client.layer_map_info(tenant_id, timeline_id).historic_layers
|
||||
)
|
||||
|
||||
metrics = client.get_metrics()
|
||||
metrics_filter = {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}
|
||||
res.prometheus_resident_physical = metrics.query_one(
|
||||
"pageserver_resident_physical_size", metrics_filter
|
||||
).value
|
||||
if remote_storage_kind is not None:
|
||||
res.prometheus_remote_physical = metrics.query_one(
|
||||
"pageserver_remote_physical_size", metrics_filter
|
||||
).value
|
||||
else:
|
||||
res.prometheus_remote_physical = None
|
||||
|
||||
detail = client.timeline_detail(
|
||||
tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True
|
||||
)
|
||||
@@ -589,11 +676,20 @@ def get_physical_size_values(
|
||||
return res
|
||||
|
||||
|
||||
def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
|
||||
def assert_physical_size_invariants(
|
||||
sizes: TimelinePhysicalSizeValues, remote_storage_kind: Optional[RemoteStorageKind]
|
||||
):
|
||||
# resident phyiscal size is defined as
|
||||
assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
|
||||
assert sizes.python_timelinedir_layerfiles_physical == sizes.layer_map_file_size_sum
|
||||
|
||||
# we don't do layer eviction, so, all layers are resident
|
||||
assert sizes.api_current_physical == sizes.prometheus_resident_physical
|
||||
if remote_storage_kind is not None:
|
||||
assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical
|
||||
# XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
|
||||
else:
|
||||
assert sizes.prometheus_remote_physical is None
|
||||
|
||||
|
||||
# Timeline logical size initialization is an asynchronous background task that runs once,
|
||||
|
||||
Reference in New Issue
Block a user