""" Run the random API tests on the cloud instance of Neon """ from __future__ import annotations import os import random import subprocess import time from datetime import UTC, datetime, timedelta from typing import TYPE_CHECKING, Any import pytest from fixtures.log_helper import log if TYPE_CHECKING: from pathlib import Path from fixtures.neon_api import NeonAPI from fixtures.neon_fixtures import PgBin from fixtures.pg_version import PgVersion class NeonEndpoint: """ Neon Endpoint Gets the output of the API call of an endpoint creation """ def __init__(self, project: NeonProject, endpoint: dict[str, Any]): self.project: NeonProject = project self.id: str = endpoint["id"] # The branch endpoint belongs to self.branch: NeonBranch = project.branches[endpoint["branch_id"]] self.type: str = endpoint["type"] # add itself to the list of endpoints of the branch self.branch.endpoints[self.id] = self self.project.endpoints[self.id] = self self.host: str = endpoint["host"] self.benchmark: subprocess.Popen[Any] | None = None # The connection environment is used when running benchmark self.connect_env: dict[str, str] | None = None if self.branch.connect_env: self.connect_env = self.branch.connect_env.copy() self.connect_env["PGHOST"] = self.host if self.type == "read_only": self.project.read_only_endpoints_total += 1 def delete(self): self.project.delete_endpoint(self.id) def start_benchmark(self, clients=10): return self.project.start_benchmark(self.id, clients=clients) def check_benchmark(self): self.project.check_benchmark(self.id) def terminate_benchmark(self): self.project.terminate_benchmark(self.id) class NeonBranch: """ Neon Branch Gets the output of the API call of the Neon Public API call of a branch creation as a first parameter is_reset defines if the branch is a reset one i.e. created as a result of the reset API Call """ def __init__(self, project, branch: dict[str, Any], is_reset=False): self.id: str = branch["branch"]["id"] self.desc = branch self.project: NeonProject = project self.neon_api: NeonAPI = project.neon_api self.project_id: str = branch["branch"]["project_id"] self.parent: NeonBranch | None = ( self.project.branches[branch["branch"]["parent_id"]] if "parent_id" in branch["branch"] else None ) if is_reset: self.project.reset_branches.add(self.id) elif self.parent: self.project.leaf_branches[self.id] = self if self.parent is not None and self.parent.id in self.project.leaf_branches: self.project.leaf_branches.pop(self.parent.id) self.project.branches[self.id] = self self.children: dict[str, NeonBranch] = {} if self.parent is not None: self.parent.children[self.id] = self self.endpoints: dict[str, NeonEndpoint] = {} self.connection_parameters: dict[str, str] | None = ( branch["connection_uris"][0]["connection_parameters"] if "connection_uris" in branch else None ) self.benchmark: subprocess.Popen[Any] | None = None self.updated_at: datetime = datetime.fromisoformat(branch["branch"]["updated_at"]) self.parent_timestamp: datetime = ( datetime.fromisoformat(branch["branch"]["parent_timestamp"]) if "parent_timestamp" in branch["branch"] else datetime.fromtimestamp(0, tz=UTC) ) self.connect_env: dict[str, str] | None = None if self.connection_parameters: self.connect_env = { "PGHOST": self.connection_parameters["host"], "PGUSER": self.connection_parameters["role"], "PGDATABASE": self.connection_parameters["database"], "PGPASSWORD": self.connection_parameters["password"], "PGSSLMODE": "require", } def __str__(self): """ Prints the branch's name with all the predecessors (r) means the branch is a reset one """ return f"{self.id}{'(r)' if self.id in self.project.reset_branches else ''}, parent: {self.parent}" def random_time(self) -> datetime: min_time = max( self.updated_at + timedelta(seconds=1), self.project.min_time, self.parent_timestamp + timedelta(seconds=1), ) max_time = datetime.now(UTC) - timedelta(seconds=1) log.info("min_time: %s, max_time: %s", min_time, max_time) return (min_time + (max_time - min_time) * random.random()).replace(microsecond=0) def create_child_branch(self, parent_timestamp: datetime | None = None) -> NeonBranch | None: return self.project.create_branch(self.id, parent_timestamp) def create_ro_endpoint(self) -> NeonEndpoint | None: if not self.project.check_limit_endpoints(): return None return NeonEndpoint( self.project, self.neon_api.create_endpoint(self.project_id, self.id, "read_only", {})["endpoint"], ) def delete(self) -> None: self.project.delete_branch(self.id) def start_benchmark(self, clients=10) -> subprocess.Popen[Any]: return self.project.start_benchmark(self.id, clients=clients) def check_benchmark(self) -> None: self.project.check_benchmark(self.id) def terminate_benchmark(self) -> None: self.project.terminate_benchmark(self.id) def reset_to_parent(self) -> None: for ep in self.project.endpoints.values(): if ep.type == "read_only": ep.terminate_benchmark() self.terminate_benchmark() res = self.neon_api.reset_to_parent(self.project_id, self.id) self.updated_at = datetime.fromisoformat(res["branch"]["updated_at"]) self.parent_timestamp = datetime.fromisoformat(res["branch"]["parent_timestamp"]) self.project.wait() self.start_benchmark() for ep in self.project.endpoints.values(): if ep.type == "read_only": ep.start_benchmark() def restore_random_time(self) -> None: """ Does PITR, i.e. calls the reset API call on the same branch to the random time in the past """ res = self.restore( self.id, source_timestamp=self.random_time().isoformat().replace("+00:00", "Z"), preserve_under_name=self.project.gen_restore_name(), ) if res is None: return self.updated_at = datetime.fromisoformat(res["branch"]["updated_at"]) self.parent_timestamp = datetime.fromisoformat(res["branch"]["parent_timestamp"]) parent_id: str = res["branch"]["parent_id"] # Creates an object for the parent branch # After the reset operation a new parent branch is created parent = NeonBranch( self.project, self.neon_api.get_branch_details(self.project_id, parent_id), True ) self.project.branches[parent_id] = parent self.parent = parent parent.children[self.id] = self self.project.wait() def restore( self, source_branch_id: str, source_lsn: str | None = None, source_timestamp: str | None = None, preserve_under_name: str | None = None, ) -> dict[str, Any] | None: if not self.project.check_limit_branches(): return None endpoints = [ep for ep in self.endpoints.values() if ep.type == "read_only"] # Terminate all the benchmarks running to prevent errors. Errors in benchmark during pgbench are expected for ep in endpoints: ep.terminate_benchmark() self.terminate_benchmark() res: dict[str, Any] = self.neon_api.restore_branch( self.project_id, self.id, source_branch_id, source_lsn, source_timestamp, preserve_under_name, ) self.project.wait() self.start_benchmark() for ep in endpoints: ep.start_benchmark() return res class NeonProject: """ The project object Calls the Public API to create a Neon Project """ def __init__(self, neon_api: NeonAPI, pg_bin: PgBin, pg_version: PgVersion): self.neon_api = neon_api self.pg_bin = pg_bin proj = self.neon_api.create_project( pg_version, f"Automatic random API test GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" ) self.id: str = proj["project"]["id"] self.name: str = proj["project"]["name"] self.connection_uri: str = proj["connection_uris"][0]["connection_uri"] self.connection_parameters: dict[str, str] = proj["connection_uris"][0][ "connection_parameters" ] self.pg_version: PgVersion = pg_version # Leaf branches are the branches, which do not have children self.leaf_branches: dict[str, NeonBranch] = {} self.branches: dict[str, NeonBranch] = {} self.reset_branches: set[str] = set() self.main_branch: NeonBranch = NeonBranch(self, proj) self.main_branch.connection_parameters = self.connection_parameters self.endpoints: dict[str, NeonEndpoint] = {} for endpoint in proj["endpoints"]: NeonEndpoint(self, endpoint) self.neon_api.wait_for_operation_to_finish(self.id) self.benchmarks: dict[str, subprocess.Popen[Any]] = {} self.restore_num: int = 0 self.restart_pgbench_on_console_errors: bool = False self.limits: dict[str, Any] = self.get_limits()["limits"] self.read_only_endpoints_total: int = 0 self.min_time: datetime = datetime.now(UTC) def get_limits(self) -> dict[str, Any]: return self.neon_api.get_project_limits(self.id) def delete(self) -> None: self.neon_api.delete_project(self.id) def check_limit_branches(self) -> bool: if self.limits["max_branches"] == -1 or len(self.branches) < self.limits["max_branches"]: return True log.info("branch limit exceeded (%s/%s)", len(self.branches), self.limits["max_branches"]) return False def check_limit_endpoints(self) -> bool: if ( self.limits["max_read_only_endpoints"] == -1 or self.read_only_endpoints_total < self.limits["max_read_only_endpoints"] ): return True log.info( "Maximum read only endpoint limit exceeded (%s/%s)", self.read_only_endpoints_total, self.limits["max_read_only_endpoints"], ) return False def create_branch( self, parent_id: str | None = None, parent_timestamp: datetime | None = None ) -> NeonBranch | None: self.wait() if not self.check_limit_branches(): return None if parent_timestamp: log.info("Timestamp: %s", parent_timestamp) parent_timestamp_str: str | None = None if parent_timestamp: parent_timestamp_str = parent_timestamp.isoformat().replace("+00:00", "Z") branch_def = self.neon_api.create_branch( self.id, parent_id=parent_id, parent_timestamp=parent_timestamp_str ) new_branch = NeonBranch(self, branch_def) self.wait() return new_branch def delete_branch(self, branch_id: str) -> None: parent = self.branches[branch_id].parent if not parent or branch_id == self.main_branch.id: raise RuntimeError("Cannot delete the main branch") if branch_id not in self.leaf_branches and branch_id not in self.reset_branches: raise RuntimeError(f"The branch {branch_id}, probably, has ancestors") if branch_id not in self.branches: raise RuntimeError(f"The branch with id {branch_id} is not found") endpoints_to_delete = [ ep for ep in self.branches[branch_id].endpoints.values() if ep.type == "read_only" ] for ep in endpoints_to_delete: ep.delete() if branch_id not in self.reset_branches: self.terminate_benchmark(branch_id) self.neon_api.delete_branch(self.id, branch_id) if len(parent.children) == 1 and parent.id != self.main_branch.id: self.leaf_branches[parent.id] = parent parent.children.pop(branch_id) if branch_id in self.leaf_branches: self.leaf_branches.pop(branch_id) else: self.reset_branches.remove(branch_id) self.branches.pop(branch_id) self.wait() if parent.id in self.reset_branches: parent.delete() def get_random_leaf_branch(self) -> NeonBranch | None: target: NeonBranch | None = None if self.leaf_branches: target = random.choice(list(self.leaf_branches.values())) else: log.info("No leaf branches found") return target def delete_endpoint(self, endpoint_id: str) -> None: self.terminate_benchmark(endpoint_id) self.neon_api.delete_endpoint(self.id, endpoint_id) self.endpoints[endpoint_id].branch.endpoints.pop(endpoint_id) self.endpoints.pop(endpoint_id) self.read_only_endpoints_total -= 1 self.wait() def start_benchmark(self, target: str, clients: int = 10) -> subprocess.Popen[Any]: if target in self.benchmarks: raise RuntimeError(f"Benchmark was already started for {target}") is_endpoint = target.startswith("ep") read_only = is_endpoint and self.endpoints[target].type == "read_only" cmd = ["pgbench", f"-c{clients}", "-T10800", "-Mprepared"] if read_only: cmd.extend(["-S", "-n"]) target_object = self.endpoints[target] if is_endpoint else self.branches[target] if target_object.connect_env is None: raise RuntimeError(f"The connection environment is not defined for {target}") log.info( "running pgbench on %s, cmd: %s, host: %s", target, cmd, target_object.connect_env["PGHOST"], ) pgbench = self.pg_bin.run_nonblocking( cmd, env=target_object.connect_env, stderr_pipe=subprocess.PIPE ) self.benchmarks[target] = pgbench target_object.benchmark = pgbench time.sleep(2) return pgbench def check_all_benchmarks(self) -> None: for target in tuple(self.benchmarks.keys()): self.check_benchmark(target) def check_benchmark(self, target) -> None: rc = self.benchmarks[target].poll() if rc is not None: _, err = self.benchmarks[target].communicate() log.error("STDERR: %s", err) # if the benchmark failed due to irresponsible Control plane, # just restart it if self.restart_pgbench_on_console_errors and ( "ERROR: Couldn't connect to compute node" in err or "ERROR: Console request failed" in err or "ERROR: Control plane request failed" in err ): log.info("Restarting benchmark for %s", target) self.benchmarks.pop(target) self.start_benchmark(target) return raise RuntimeError(f"The benchmark for {target} ended with code {rc}") def terminate_benchmark(self, target): log.info("Terminating the benchmark %s", target) target_endpoint = target.startswith("ep") self.check_benchmark(target) self.benchmarks[target].terminate() self.benchmarks.pop(target) if target_endpoint: self.endpoints[target].benchmark = None else: self.branches[target].benchmark = None def wait(self): """ Wait for all the operations to be finished """ return self.neon_api.wait_for_operation_to_finish(self.id) def gen_restore_name(self): self.restore_num += 1 return f"restore{self.restore_num}" @pytest.fixture() def setup_class( pg_version: PgVersion, pg_bin: PgBin, neon_api: NeonAPI, ): neon_api.retry_if_possible = True project = NeonProject(neon_api, pg_bin, pg_version) log.info("Created a project with id %s, name %s", project.id, project.name) yield pg_bin, project log.info("Retried 524 errors: %s", neon_api.retries524) log.info("Retried 4xx errors: %s", neon_api.retries4xx) if neon_api.retries524 > 0: print(f"::warning::Retried on 524 error {neon_api.retries524} times") if neon_api.retries4xx > 0: print(f"::warning::Retried on 4xx error {neon_api.retries4xx} times") log.info("Removing the project %s", project.id) project.delete() def do_action(project: NeonProject, action: str) -> bool: """ Runs the action """ log.info("Action: %s", action) if action == "new_branch" or action == "new_branch_random_time": use_random_time: bool = action == "new_branch_random_time" log.info("Trying to create a new branch %s", "random time" if use_random_time else "") parent = project.branches[ random.choice(list(set(project.branches.keys()) - project.reset_branches)) ] child = parent.create_child_branch(parent.random_time() if use_random_time else None) if child is None: return False log.info("Created branch %s", child) child.start_benchmark() elif action == "delete_branch": if (target := project.get_random_leaf_branch()) is None: return False log.info("Trying to delete branch %s", target) target.delete() elif action == "new_ro_endpoint": ep = random.choice( [br for br in project.branches.values() if br.id not in project.reset_branches] ).create_ro_endpoint() if ep is None: return False log.info("Created the RO endpoint with id %s branch: %s", ep.id, ep.branch.id) ep.start_benchmark() elif action == "delete_ro_endpoint": if project.read_only_endpoints_total == 0: log.info("no read_only endpoints present, skipping") return False ro_endpoints: list[NeonEndpoint] = [ endpoint for endpoint in project.endpoints.values() if endpoint.type == "read_only" ] target_ep: NeonEndpoint = random.choice(ro_endpoints) target_ep.delete() log.info("endpoint %s deleted", target_ep.id) elif action == "restore_random_time": if (target := project.get_random_leaf_branch()) is None: return False log.info("Restore %s", target) target.restore_random_time() elif action == "reset_to_parent": if (target := project.get_random_leaf_branch()) is None: return False log.info("Reset to parent %s", target) target.reset_to_parent() else: raise ValueError(f"The action {action} is unknown") return True @pytest.mark.timeout(7200) @pytest.mark.remote_cluster def test_api_random( setup_class, pg_distrib_dir: Path, test_output_dir: Path, ): """ Run the random API tests """ if seed_env := os.getenv("RANDOM_SEED"): seed = int(seed_env) else: seed = 0 if seed == 0: seed = int(time.time()) log.info("Using random seed: %s", seed) random.seed(seed) pg_bin, project = setup_class # Here we can assign weights ACTIONS = ( ("new_branch", 1.2), ("new_branch_random_time", 0.5), ("new_ro_endpoint", 1.4), ("delete_ro_endpoint", 0.8), ("delete_branch", 1.2), ("restore_random_time", 0.9), ("reset_to_parent", 0.3), ) if num_ops_env := os.getenv("NUM_OPERATIONS"): num_operations = int(num_ops_env) else: num_operations = 250 pg_bin.run(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=project.main_branch.connect_env) # To not go to the past where pgbench tables do not exist time.sleep(1) project.min_time = datetime.now(UTC) for _ in range(num_operations): log.info("Starting action #%s", _ + 1) while not do_action( project, random.choices([a[0] for a in ACTIONS], weights=[w[1] for w in ACTIONS])[0] ): log.info("Retrying...") project.check_all_benchmarks() assert True