Don't upload index file when GC doesn't remove anything.

I saw an excessive number of index file upload operations in
production, even when nothing on the timeline changes. It was because
our GC schedules index file upload if the GC cutoff LSN is advanced,
even if the GC had nothing else to do. The GC cutoff LSN marches
steadily forwards, even when there is no user activity on the
timeline, when the cutoff is determined by the time-based PITR
interval setting. To dial that down, only schedule index file upload
when GC is about to actually remove something.
This commit is contained in:
Heikki Linnakangas
2022-12-15 12:39:50 +02:00
committed by Heikki Linnakangas
parent 6dec85b19d
commit c262390214
2 changed files with 111 additions and 26 deletions

View File

@@ -2487,9 +2487,6 @@ impl Timeline {
);
write_guard.store_and_unlock(new_gc_cutoff).wait();
}
// Persist the new GC cutoff value in the metadata file, before
// we actually remove anything.
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
info!("GC starting");
@@ -2600,19 +2597,33 @@ impl Timeline {
layers_to_remove.push(Arc::clone(&l));
}
// Actually delete the layers from disk and remove them from the map.
// (couldn't do this in the loop above, because you cannot modify a collection
// while iterating it. BTreeMap::retain() would be another option)
let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
for doomed_layer in layers_to_remove {
let path = doomed_layer.local_path();
self.metrics
.current_physical_size_gauge
.sub(path.metadata()?.len());
layer_names_to_delete.push(doomed_layer.filename());
doomed_layer.delete()?;
layers.remove_historic(doomed_layer);
result.layers_removed += 1;
if !layers_to_remove.is_empty() {
// Persist the new GC cutoff value in the metadata file, before
// we actually remove anything.
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
// Actually delete the layers from disk and remove them from the map.
// (couldn't do this in the loop above, because you cannot modify a collection
// while iterating it. BTreeMap::retain() would be another option)
let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
for doomed_layer in layers_to_remove {
let path = doomed_layer.local_path();
self.metrics
.current_physical_size_gauge
.sub(path.metadata()?.len());
layer_names_to_delete.push(doomed_layer.filename());
doomed_layer.delete()?;
layers.remove_historic(doomed_layer);
result.layers_removed += 1;
}
if result.layers_removed != 0 {
fail_point!("after-timeline-gc-removed-layers");
}
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
}
}
info!(
@@ -2620,14 +2631,6 @@ impl Timeline {
result.layers_removed, new_gc_cutoff
);
if result.layers_removed != 0 {
fail_point!("after-timeline-gc-removed-layers");
}
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
}
result.elapsed = now.elapsed()?;
Ok(result)
}

View File

@@ -2,9 +2,17 @@ import asyncio
import concurrent.futures
import random
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres
from fixtures.types import TimelineId
from fixtures.metrics import parse_metrics
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
Postgres,
RemoteStorageKind,
wait_for_last_flush_lsn,
)
from fixtures.types import TenantId, TimelineId
from fixtures.utils import query_scalar
# Test configuration
@@ -43,6 +51,7 @@ async def gc(env: NeonEnv, timeline: TimelineId):
while updates_performed < updates_to_perform:
await loop.run_in_executor(pool, do_gc)
# At the same time, run UPDATEs and GC
async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId):
workers = []
@@ -88,3 +97,76 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
r = cur.fetchone()
assert r is not None
assert r == (num_rows, updates_to_perform)
#
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
    """
    Check that repeated GC runs with nothing to remove do not keep re-uploading
    the index file to remote storage.

    The pageserver schedules an index-file upload when GC deletes layers; a GC
    pass that removes nothing should not cause an upload, so the upload counter
    must stay (nearly) flat across many no-op GC iterations.
    """
    # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"

    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_gc_index_upload",
    )

    env = neon_env_builder.init_start()
    env.neon_cli.create_branch("test_gc_index_upload", "main")
    pg = env.postgres.create_start("test_gc_index_upload")

    pageserver_http = env.pageserver.http_client()

    pg_conn = pg.connect()
    cur = pg_conn.cursor()

    tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
    timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))

    # Create some layer files to give GC something it could, in principle, work on.
    cur.execute("CREATE TABLE foo (id int, counter int, t text)")
    cur.execute(
        """
        INSERT INTO foo
        SELECT g, 0, 'long string to consume some space' || g
        FROM generate_series(1, 100000) g
        """
    )

    # Helper function that gets the number of given kind of remote ops from the metrics
    def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
        ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver")
        total = 0.0
        for sample in ps_metrics.query_all(
            name="pageserver_remote_operation_seconds_count",
            filter={
                "tenant_id": str(tenant_id),
                "timeline_id": str(timeline_id),
                "file_kind": str(file_kind),
                "op_kind": str(op_kind),
            },
        ):
            # sample is (name, labels, value); the value is a cumulative count
            total += sample[2]
        return int(total)

    # Sanity check that the metric works
    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
    pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
    before = get_num_remote_ops("index", "upload")
    assert before > 0

    # Run many cycles of GC. Then check that the number of index files
    # uploads didn't grow much. In particular we don't want to re-upload the
    # index file on every GC iteration, when it has no work to do.
    #
    # On each iteration, we use a slightly smaller GC horizon, so that the GC
    # at least needs to check if it has work to do.
    for i in range(100):
        cur.execute("INSERT INTO foo VALUES (0, 0, 'foo')")
        pageserver_http.timeline_gc(tenant_id, timeline_id, 10000 - i * 32)
        num_index_uploads = get_num_remote_ops("index", "upload")
        log.info(f"{num_index_uploads} index uploads after GC iteration {i}")

    after = num_index_uploads
    log.info(f"{after-before} new index uploads during test")
    # A few uploads are acceptable (GC occasionally does real work as the
    # horizon shrinks), but nowhere near one per iteration.
    assert after - before < 5