Add a new HTTP management API to toggle pageserver logs

This commit is contained in:
Kirill Bulatov
2023-01-10 13:39:45 +02:00
parent fcb905f519
commit 65fe6b5f90
16 changed files with 1085 additions and 86 deletions

View File

@@ -595,13 +595,14 @@ class NeonEnvBuilder:
# fsync is disabled by default to make the tests go faster
safekeepers_enable_fsync: bool = False,
auth_enabled: bool = False,
rust_log_override: Optional[str] = None,
rust_log_env_var: Optional[str] = None,
default_branch_name: str = DEFAULT_BRANCH_NAME,
preserve_database_files: bool = False,
initial_tenant: Optional[TenantId] = None,
initial_timeline: Optional[TimelineId] = None,
):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
self.rust_log_env_var = rust_log_env_var
self.port_distributor = port_distributor
self.remote_storage = remote_storage
self.remote_storage_users = remote_storage_users
@@ -622,6 +623,7 @@ class NeonEnvBuilder:
self.pg_version = pg_version
self.preserve_database_files = preserve_database_files
self.initial_tenant = initial_tenant or TenantId.generate()
self.initial_timeline = initial_timeline or TimelineId.generate()
def init_configs(self) -> NeonEnv:
# Cannot create more than one environment from one builder
@@ -640,10 +642,10 @@ class NeonEnvBuilder:
# Prepare the default branch to start the postgres on later.
# Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API.
log.info(
f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
f"Services started, creating initial timeline {env.initial_tenant}/{env.initial_timeline}"
)
initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant)
log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")
env.neon_cli.create_tenant(tenant_id=env.initial_tenant, timeline_id=env.initial_timeline)
log.info("Initial timeline created successfully")
return env
@@ -888,7 +890,7 @@ class NeonEnv:
def __init__(self, config: NeonEnvBuilder):
self.repo_dir = config.repo_dir
self.rust_log_override = config.rust_log_override
self.rust_log_env_var = config.rust_log_env_var
self.port_distributor = config.port_distributor
self.s3_mock_server = config.mock_s3_server
self.neon_cli = NeonCli(env=self)
@@ -904,6 +906,7 @@ class NeonEnv:
# generate initial tenant ID here instead of letting 'neon init' generate it,
# so that we don't need to dig it out of the config file afterwards.
self.initial_tenant = config.initial_tenant
self.initial_timeline = config.initial_timeline
# Create a config file corresponding to the options
toml = textwrap.dedent(
@@ -1118,6 +1121,80 @@ class PageserverApiException(Exception):
pass
class PageserverLogScope(abc.ABC):
log_level: Optional[str]
tenant_id: Optional[TenantId]
timeline_id: Optional[TimelineId]
@abc.abstractmethod
def to_json(self) -> dict[str, Any]:
pass
def disable(self):
self.log_level = None
class CustomLogScope(PageserverLogScope):
directive: str
enabled: bool
def __init__(self, directive: str, enabled: bool):
self.directive = directive
self.enabled = enabled
def __str__(self) -> str:
return f"CustomLogScope{{directive: {self.directive}, enabled: {self.enabled}}}"
def to_json(self) -> dict[str, Any]:
return {
"directive": self.directive,
"enabled": self.enabled,
}
def disable(self):
self.enabled = False
class TenantLogScope(PageserverLogScope):
tenant_id: TenantId
def __init__(self, log_level: Optional[str], tenant_id: TenantId):
self.log_level = log_level
self.tenant_id = tenant_id
def __str__(self) -> str:
return f"TenantLogScope{{log_level: {self.log_level}, tenant_id: {self.tenant_id}}}"
def to_json(self) -> dict[str, Any]:
return {
"log_level": self.log_level if self.log_level else None,
"scope": {"kind": "Tenant", "tenant_id": str(self.tenant_id)},
}
class TimelineLogScope(PageserverLogScope):
tenant_id: TenantId
timeline_id: TimelineId
def __init__(self, log_level: Optional[str], tenant_id: TenantId, timeline_id: TimelineId):
self.tenant_id = tenant_id
self.log_level = log_level
self.timeline_id = timeline_id
def __str__(self) -> str:
return f"TimelineLogScope{{log_level: {self.log_level}, tenant_id: {self.tenant_id}, timeline_id: {self.timeline_id}}}"
def to_json(self) -> dict[str, Any]:
return {
"log_level": self.log_level if self.log_level else None,
"scope": {
"kind": "Timeline",
"tenant_id": str(self.tenant_id),
"timeline_id": str(self.timeline_id),
},
}
class PageserverHttpClient(requests.Session):
def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None):
super().__init__()
@@ -1161,6 +1238,27 @@ class PageserverHttpClient(requests.Session):
assert res_json is None
return res_json
def current_log_filter(self) -> str:
res = self.get(f"http://localhost:{self.port}/v1/log_filter")
self.verbose_error(res)
log_filter = res.json()
assert isinstance(log_filter, str)
return log_filter
def change_log_filter(self, scope: PageserverLogScope):
res = self.put(
f"http://localhost:{self.port}/v1/log_filter",
json=scope.to_json(),
)
self.verbose_error(res)
assert (
res.status_code == 201
), f"Expected 201 status code, but got {res.status_code} for scope {scope}"
def reset_log_filter(self):
res = self.post(f"http://localhost:{self.port}/v1/reset_log_filter")
self.verbose_error(res)
def tenant_list(self) -> List[Dict[Any, Any]]:
res = self.get(f"http://localhost:{self.port}/v1/tenant")
self.verbose_error(res)
@@ -1635,8 +1733,8 @@ class AbstractNeonCli(abc.ABC):
env_vars = os.environ.copy()
env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir)
env_vars["POSTGRES_DISTRIB_DIR"] = str(self.env.pg_distrib_dir)
if self.env.rust_log_override is not None:
env_vars["RUST_LOG"] = self.env.rust_log_override
if self.env.rust_log_env_var is not None:
env_vars["RUST_LOG"] = self.env.rust_log_env_var
for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items():
env_vars[extra_env_key] = extra_env_value
@@ -2158,6 +2256,17 @@ class NeonPageserver(PgProtocol):
return None
def log_lines(self) -> List[str]:
logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r")
lines = []
while True:
line = logfile.readline()
if line:
lines.append(line)
else:
return lines
def append_pageserver_param_overrides(
params_to_update: List[str],

View File

@@ -326,7 +326,7 @@ def check_neon_works(
# TODO: replace with NeonEnvBuilder / NeonEnv
config: Any = type("NeonEnvStub", (object,), {})
config.rust_log_override = None
config.rust_log_env_var = None
config.repo_dir = repo_dir
config.pg_version = pg_version
config.initial_tenant = snapshot_config["default_tenant_id"]

View File

@@ -1,12 +1,17 @@
import subprocess
from pathlib import Path
from typing import Optional
from typing import List, Optional
import pytest
from fixtures.neon_fixtures import (
DEFAULT_BRANCH_NAME,
CustomLogScope,
NeonEnv,
NeonEnvBuilder,
PageserverHttpClient,
PageserverLogScope,
TenantLogScope,
TimelineLogScope,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
@@ -185,3 +190,146 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde
with env.pageserver.http_client(auth_token=pageserver_token) as client:
check_client(client, env.initial_tenant)
def pageserver_predefined_scopes() -> List[PageserverLogScope]:
return [
TenantLogScope("debug", TenantId.generate()),
TimelineLogScope("debug", TenantId.generate(), TimelineId.generate()),
]
def pageserver_custom_scopes() -> List[PageserverLogScope]:
return [CustomLogScope("pageserver=debug", True), CustomLogScope("hyper=debug", True)]
@pytest.mark.parametrize(
"pageserver_log_scope", pageserver_predefined_scopes() + pageserver_custom_scopes()
)
def test_pageserver_logs_toggle_overrides_env_var(
neon_env_builder: NeonEnvBuilder, pageserver_log_scope: PageserverLogScope
):
# Disable global logging, including the pageserver one, using an env var
neon_env_builder.rust_log_env_var = "error"
env = neon_env_builder.init_start()
pageserver = env.pageserver
pageserver_http = pageserver.http_client()
initial_log_filter = pageserver_http.current_log_filter()
# Ensure that pageserver does not produce logs with such setting
pageserver_http.tenant_status(env.initial_tenant)
assert (
len(pageserver.log_lines()) == 0
), "No err log lines expected, others should be turned off"
# Make a dynamic override via HTTP api and check that it makes pageserver to produce some logs
pageserver_log_scope.tenant_id = env.initial_tenant
pageserver_log_scope.timeline_id = env.initial_timeline
pageserver_log_scope.log_level = "debug"
pageserver_http.change_log_filter(pageserver_log_scope)
log_filter_after_update = pageserver_http.current_log_filter()
assert (
initial_log_filter != log_filter_after_update
), "Turning a filter on should change the log filter"
# Query pageserver to produce some logs with the override
pageserver_http.tenant_status(env.initial_tenant)
pageserver_http.timeline_compact(env.initial_tenant, env.initial_timeline)
# Disable the override back
pageserver_log_scope.disable()
pageserver_http.change_log_filter(pageserver_log_scope)
log_filter_after_toggle = pageserver_http.current_log_filter()
assert (
initial_log_filter == log_filter_after_toggle
), "Turning a filter on and off should restore the initial state"
updated_log_lines = pageserver.log_lines()
log_lines_count_after_override = len(updated_log_lines)
assert (
log_lines_count_after_override > 0
), f"Pageserver should produce logs due to the new override, filter: '{pageserver_http.current_log_filter()}'"
pageserver_http.tenant_status(env.initial_tenant)
pageserver_http.timeline_compact(env.initial_tenant, env.initial_timeline)
assert (
len(pageserver.log_lines()) == log_lines_count_after_override
), "After override is disabled, no more logs should be produced again"
def test_pageserver_logs_restart(neon_env_builder: NeonEnvBuilder):
# Disable global logging, including the pageserver one, using an env var
neon_env_builder.rust_log_env_var = "error"
env = neon_env_builder.init_start()
pageserver = env.pageserver
pageserver_http = pageserver.http_client()
initial_log_filter = pageserver_http.current_log_filter()
current_log_filter = initial_log_filter
for pageserver_log_scope in pageserver_predefined_scopes():
pageserver_log_scope.tenant_id = env.initial_tenant
pageserver_log_scope.timeline_id = env.initial_timeline
pageserver_log_scope.log_level = "debug"
pageserver_http.change_log_filter(pageserver_log_scope)
new_filter = pageserver_http.current_log_filter()
assert new_filter != current_log_filter, "New filter should appear after the toggle"
current_log_filter = new_filter
# Make a dynamic override via HTTP api and check that it makes pageserver to produce some logs
pageserver_http.tenant_status(env.initial_tenant)
pageserver_http.timeline_compact(env.initial_tenant, env.initial_timeline)
env.pageserver.stop()
updated_log_lines = pageserver.log_lines()
log_lines_count_after_override = len(updated_log_lines)
assert (
log_lines_count_after_override > 0
), f"Pageserver should produce logs due to the new override, filter: '{current_log_filter}'"
env.pageserver.start()
pageserver_http.tenant_status(env.initial_tenant)
pageserver_http.timeline_compact(env.initial_tenant, env.initial_timeline)
assert (
initial_log_filter == pageserver_http.current_log_filter()
), "Pageserver log filter should get back to default after a restart"
assert (
len(pageserver.log_lines()) == log_lines_count_after_override
), "After pageserver restart, no more log overrides should produce debug logs"
def test_pageserver_logs_reset(neon_env_builder: NeonEnvBuilder):
# Disable global logging, including the pageserver one, using an env var
neon_env_builder.rust_log_env_var = "error"
env = neon_env_builder.init_start()
pageserver = env.pageserver
pageserver_http = pageserver.http_client()
initial_log_filter = pageserver_http.current_log_filter()
current_log_filter = initial_log_filter
for pageserver_log_scope in pageserver_custom_scopes():
pageserver_log_scope.tenant_id = env.initial_tenant
pageserver_log_scope.timeline_id = env.initial_timeline
pageserver_log_scope.log_level = "debug"
pageserver_http.change_log_filter(pageserver_log_scope)
new_filter = pageserver_http.current_log_filter()
assert new_filter != current_log_filter, "New filter should appear after the toggle"
current_log_filter = new_filter
pageserver_http.tenant_status(env.initial_tenant)
pageserver_http.timeline_compact(env.initial_tenant, env.initial_timeline)
pageserver_http.reset_log_filter()
updated_log_lines = pageserver.log_lines()
log_lines_count_after_override = len(updated_log_lines)
assert (
log_lines_count_after_override > 0
), f"Pageserver should produce logs due to the new override, filter: '{current_log_filter}'"
pageserver_http.tenant_status(env.initial_tenant)
pageserver_http.timeline_compact(env.initial_tenant, env.initial_timeline)
assert (
initial_log_filter == pageserver_http.current_log_filter()
), "Pageserver log filter should get back to default after a reset"
assert (
len(pageserver.log_lines()) == log_lines_count_after_override
), "After pageserver restart, no more log overrides should produce debug logs"