from __future__ import annotations

import random
import time
from typing import TYPE_CHECKING

import psycopg2.errors
import pytest
from fixtures.log_helper import log
from fixtures.utils import USE_LFC

if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnvBuilder


@pytest.mark.timeout(600)
def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.append(".*simulated connection error.*")  # this is never hit
    # the real reason (Simulated Connection Error) is on the next line, and we cannot filter this out.
    env.pageserver.allowed_errors.append(
        ".*ERROR error in page_service connection task: Postgres query error"
    )

    # Enable failpoint before starting everything else up so that we exercise the retry
    # on fetching basebackup
    pageserver_http = env.pageserver.http_client()
    pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "20%return(15)"))

    env.create_branch("test_compute_pageserver_connection_stress")
    endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress")

    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()

    def execute_retry_on_timeout(query):
        while True:
            try:
                cur.execute(query)
                return
            except psycopg2.errors.QueryCanceled:
                log.info(f"Query '{query}' timed out - retrying")

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return the answer
    # from shared_buffers without hitting the page server, which defeats the point
    # of this test.
    execute_retry_on_timeout("CREATE TABLE foo (t text)")
    execute_retry_on_timeout(
        """
        INSERT INTO foo
        SELECT 'long string to consume some space' || g
        FROM generate_series(1, 100000) g
        """
    )

    # Verify that the table is larger than shared_buffers
    execute_retry_on_timeout(
        """
        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
        from pg_settings where name = 'shared_buffers'
        """
    )
    row = cur.fetchone()
    assert row is not None
    log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
    assert int(row[0]) < int(row[1])

    execute_retry_on_timeout("SELECT count(*) FROM foo")
    assert cur.fetchone() == (100000,)

    end_time = time.time() + 30
    times_executed = 0
    while time.time() < end_time:
        if random.random() < 0.5:
            execute_retry_on_timeout("INSERT INTO foo VALUES ('stas'), ('heikki')")
        else:
            execute_retry_on_timeout("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10")
            cur.fetchall()
        times_executed += 1
    log.info(f"Workload executed {times_executed} times")

    # do a graceful shutdown which would have caught the allowed_errors before
    # https://github.com/neondatabase/neon/pull/8632
    env.pageserver.stop()


def test_compute_pageserver_hung_connections(neon_env_builder: NeonEnvBuilder):
    """
    Test timeouts in waiting for response to pageserver request
    """
    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.append(".*slow GetPage.*")
    pageserver_http = env.pageserver.http_client()

    endpoint = env.endpoints.create_start(
        "main",
        tenant_id=env.initial_tenant,
        config_lines=["autovacuum = off"],
    )

    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return the answer
    # from shared_buffers without hitting the page server, which defeats the point
    # of this test.
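    # (Rough sizing note: 100,000 rows of ~40-byte strings come to a few MB; the
    # assertion below verifies the table really is larger than shared_buffers, so the
    # test fails loudly if the default endpoint configuration ever makes this untrue.)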
cur.execute("CREATE TABLE foo (t text)") cur.execute( """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g """ ) # Verify that the table is larger than shared_buffers cur.execute( """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) row = cur.fetchone() assert row is not None log.debug(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) # Print the backend PID so that it can be compared with the logs easily cur.execute("SELECT pg_backend_pid()") row = cur.fetchone() assert row is not None log.info(f"running test workload in backend PID {row[0]}") def run_workload(duration: float): end_time = time.time() + duration times_executed = 0 while time.time() < end_time: if random.random() < 0.5: cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") else: cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") cur.fetchall() times_executed += 1 log.info(f"Workload executed {times_executed} times") assert times_executed > 0 ## Test short connection hiccups ## ## This is to exercise the logging timeout. log.info("running workload with log timeout") cur.execute("SET neon.pageserver_response_log_timeout = '500ms'") pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%3*return(3000)")) run_workload(20) # check that the message was logged assert endpoint.log_contains("no response received from pageserver for .* s, still waiting") assert endpoint.log_contains("received response from pageserver after .* s") ## Test connections that are hung for longer ## ## This exercises the disconnect timeout. We'll disconnect and ## reconnect after 500 ms. log.info("running workload with disconnect timeout") cur.execute("SET neon.pageserver_response_log_timeout = '250ms'") cur.execute("SET neon.pageserver_response_disconnect_timeout = '500ms'") pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%3*return(3000)")) run_workload(15) assert endpoint.log_contains("no response from pageserver for .* s, disconnecting") # do a graceful shutdown which would had caught the allowed_errors before # https://github.com/neondatabase/neon/pull/8632 env.pageserver.stop() def test_compute_pageserver_statement_timeout(neon_env_builder: NeonEnvBuilder): """ Test statement_timeout while waiting for response to pageserver request """ env = neon_env_builder.init_start() env.pageserver.allowed_errors.append(".*slow GetPage.*") pageserver_http = env.pageserver.http_client() # Make sure the shared_buffers and LFC are tiny, to ensure the queries # hit the storage. Disable autovacuum to make the test more deterministic. config_lines = [ "shared_buffers='512kB'", "autovacuum = off", ] if USE_LFC: config_lines = ["neon.max_file_cache_size = 1MB", "neon.file_cache_size_limit = 1MB"] endpoint = env.endpoints.create_start( "main", tenant_id=env.initial_tenant, config_lines=config_lines, ) pg_conn = endpoint.connect() cur = pg_conn.cursor() # Disable parallel query. Parallel workers open their own pageserver connections, # which messes up the test logic. cur.execute("SET max_parallel_workers_per_gather=0") cur.execute("SET effective_io_concurrency=0") # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point # of this test. 
cur.execute("CREATE TABLE foo (t text)") cur.execute( """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g """ ) # Verify that the table is larger than shared_buffers cur.execute( """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) row = cur.fetchone() assert row is not None log.debug(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) ## Run a query until the compute->pageserver connection hits the failpoint and ## get stuck. This tests that the statement_timeout is obeyed while waiting on a ## GetPage request. log.info("running workload with statement_timeout") cur.execute("SET neon.pageserver_response_log_timeout = '2000ms'") cur.execute("SET neon.pageserver_response_disconnect_timeout = '30000ms'") cur.execute("SET statement_timeout='10s'") pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%return(60000)")) start_time = time.time() with pytest.raises(psycopg2.errors.QueryCanceled): cur.execute("SELECT count(*) FROM foo") cur.fetchall() log.info("Statement timeout reached") end_time = time.time() # Verify that the statement_timeout canceled the query before # neon.pageserver_response_disconnect_timeout expired assert end_time - start_time < 40 times_canceled = 1 # Should not have disconnected yet assert not endpoint.log_contains("no response from pageserver for .* s, disconnecting") # Clear the failpoint. This doesn't affect the connection that already hit it. It # will keep waiting. But subsequent connections will work normally. pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "off")) # If we keep retrying, we should eventually succeed. (This tests that the # neon.pageserver_response_disconnect_timeout is not reset on query # cancellation.) while times_canceled < 10: try: cur.execute("SELECT count(*) FROM foo") cur.fetchall() log.info("Statement succeeded") break except psycopg2.errors.QueryCanceled: log.info("Statement timed out, retrying") times_canceled += 1 assert times_canceled > 1 and times_canceled < 10 assert endpoint.log_contains("no response from pageserver for .* s, disconnecting") # do a graceful shutdown which would had caught the allowed_errors before # https://github.com/neondatabase/neon/pull/8632 env.pageserver.stop()