Part of Epic https://github.com/neondatabase/neon/issues/7386

# Motivation

The materialized page cache adds complexity to the code base, which increases the maintenance burden and the risk of subtle, hard-to-reproduce bugs such as #8050.

Further, the best hit rate we currently achieve in production is ca. 1% of materialized page cache lookups for `task_kind=PageRequestHandler`. Other task kinds have hit rates below 0.2%.

Last, caching page images in Pageserver rewards under-sized caches in Computes, because reading from Pageserver's materialized page cache over the network is often sufficiently fast (low hundreds of microseconds). Such Computes should instead upscale their local caches to fit their working set, rather than repeatedly requesting the same page from Pageserver.

More discussion and context in internal thread https://neondb.slack.com/archives/C033RQ5SPDH/p1718714037708459.

# Changes

This PR removes the materialized page cache code & metrics.

The infrastructure for different key kinds in `PageCache` is left in place, even though the "Immutable" key kind is the only remaining one. This can be simplified further in a future commit.

Some tests started failing because their total runtime depended on high materialized page cache hit rates. This PR makes them fixed-runtime or raises their pytest timeouts:

* test_local_file_cache_unlink (reproduced in full below)
* test_physical_replication
* test_pg_regress

# Performance

I focused on ensuring that this PR will not cause a performance regression in prod.

* **getpage** requests: our production metrics have shown the materialized page cache to be irrelevant (low hit rate). Also, Pageserver is the wrong place to cache page images; that should happen in Compute.
* **ingest** (`task_kind=WalReceiverConnectionHandler`): prod metrics show a 0% hit rate, so removing the cache is not a regression here.
* **get_lsn_by_timestamp**: an important API for branch creation, used by the control plane. The CLOG pages this code reads are not materialized-page-cached because they are not 8k. No risk of introducing a regression here.

We will watch the various nightly benchmarks closely for more results before shipping to prod.
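For reference, the hit rates cited in the Motivation and Performance sections are simply hits divided by lookups over an observation window. Below is a minimal sketch of how such a rate could be computed from a Prometheus-style text metrics dump; the metric names are hypothetical placeholders for illustration, not the names Pageserver actually exports.

```python
import re


def counter_total(metrics_text: str, name: str) -> float:
    """Sum a counter across all of its label sets (e.g. per task_kind)."""
    # Matches `name 42` as well as `name{label="x"} 42`.
    pattern = re.compile(rf'^{re.escape(name)}(?:\{{[^}}]*\}})?\s+(\S+)$')
    total = 0.0
    for line in metrics_text.splitlines():
        match = pattern.match(line)
        if match:
            total += float(match.group(1))
    return total


def hit_rate(metrics_text: str, hits_metric: str, lookups_metric: str) -> float:
    lookups = counter_total(metrics_text, lookups_metric)
    if lookups == 0:
        return 0.0
    return counter_total(metrics_text, hits_metric) / lookups


# Example with made-up metric names and values: a 1% hit rate.
sample = (
    'cache_lookups_total{task_kind="PageRequestHandler"} 1000\n'
    'cache_hits_total{task_kind="PageRequestHandler"} 10\n'
)
assert hit_rate(sample, "cache_hits_total", "cache_lookups_total") == 0.01
```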
```python
import os
import queue
import random
import threading
import time
from typing import List

from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder
from fixtures.utils import query_scalar


def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

    cache_dir = os.path.join(env.repo_dir, "file_cache")
    os.mkdir(cache_dir)

    env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME)
    env.neon_cli.create_branch("test_local_file_cache_unlink", "empty")
    endpoint = env.endpoints.create_start(
        "test_local_file_cache_unlink",
        config_lines=[
            "shared_buffers='1MB'",
            f"neon.file_cache_path='{cache_dir}/file.cache'",
            "neon.max_file_cache_size='64MB'",
            "neon.file_cache_size_limit='10MB'",
        ],
    )

    cur = endpoint.connect().cursor()

    stop = threading.Event()
    n_rows = 100000
    n_threads = 20
    n_updates_per_connection = 1000

    cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)")
    cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g")

    # Start threads that will perform random UPDATEs. Each UPDATE
    # increments the counter on the row, so that we can check at the
    # end that the sum of all the counters matches the number of updates
    # performed (plus the initial 1 on each row).
    #
    # Furthermore, each thread reconnects after every 1000 updates.
    def run_updates(n_updates_performed_q: queue.Queue[int]):
        n_updates_performed = 0
        conn = endpoint.connect()
        cur = conn.cursor()
        while not stop.is_set():
            id = random.randint(1, n_rows)
            cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}")
            n_updates_performed += 1
            if n_updates_performed % n_updates_per_connection == 0:
                cur.close()
                conn.close()
                conn = endpoint.connect()
                cur = conn.cursor()
        n_updates_performed_q.put(n_updates_performed)

    n_updates_performed_q: queue.Queue[int] = queue.Queue()
    threads: List[threading.Thread] = []
    for _i in range(n_threads):
        thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True)
        thread.start()
        threads.append(thread)

    time.sleep(5)

    # Rename the cache directory away while the workload is running. This
    # is what we're actually testing: the configured cache path disappears
    # out from under the running endpoint, and the workload must keep
    # running correctly regardless.
    new_cache_dir = os.path.join(env.repo_dir, "file_cache_new")
    os.rename(cache_dir, new_cache_dir)

    time.sleep(10)

    stop.set()

    n_updates_performed = 0
    for thread in threads:
        thread.join()
        n_updates_performed += n_updates_performed_q.get()

    assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed
```
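Note the fixed-runtime structure mentioned under "Changes" above: the workload runs for a fixed wall-clock window (5 seconds before the rename, 10 seconds after) and is then halted via the `stop` event, so the test's duration no longer depends on cache hit rates or on completing a fixed number of iterations.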