Merge with main

This commit is contained in:
Konstantin Knizhnik
2022-05-13 18:57:32 +03:00
53 changed files with 724 additions and 256 deletions

View File

@@ -21,7 +21,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder):
# Override defaults, 1M gc_horizon and 4M checkpoint_distance.
# Extend compaction_period and gc_period to disable background compaction and gc.
tenant = env.zenith_cli.create_tenant(
tenant, _ = env.zenith_cli.create_tenant(
conf={
'gc_period': '10 m',
'gc_horizon': '1048576',
@@ -35,7 +35,6 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder):
with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
pscur.execute("failpoints flush-frozen=sleep(10000)")
env.zenith_cli.create_timeline(f'main', tenant_id=tenant)
pg_branch0 = env.postgres.create_start('main', tenant_id=tenant)
branch0_cur = pg_branch0.connect().cursor()
branch0_cur.execute("SHOW zenith.zenith_timeline")

View File

@@ -19,6 +19,8 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
#
# See https://github.com/zenithdb/zenith/issues/1068
zenith_env_builder.num_safekeepers = 1
# Disable pitr, because here we want to test branch creation after GC
zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = zenith_env_builder.init_start()
# Branch at the point where only 100 rows were inserted

View File

@@ -1,7 +1,7 @@
import asyncio
import random
from fixtures.zenith_fixtures import ZenithEnv, Postgres
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres
from fixtures.log_helper import log
# Test configuration
@@ -50,9 +50,12 @@ async def update_and_gc(env: ZenithEnv, pg: Postgres, timeline: str):
#
# (repro for https://github.com/zenithdb/zenith/issues/1047)
#
def test_gc_aggressive(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch("test_gc_aggressive", "empty")
def test_gc_aggressive(zenith_env_builder: ZenithEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_gc_aggressive", "main")
pg = env.postgres.create_start('test_gc_aggressive')
log.info('postgres is running on test_gc_aggressive branch')

View File

@@ -1,5 +1,7 @@
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.log_helper import log
from fixtures.utils import print_gc_result
import psycopg2.extras
#
@@ -12,9 +14,11 @@ from fixtures.log_helper import log
# just a hint that the page hasn't been modified since that LSN, and the page
# server should return the latest page version regardless of the LSN.
#
def test_old_request_lsn(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch("test_old_request_lsn", "empty")
def test_old_request_lsn(zenith_env_builder: ZenithEnvBuilder):
# Disable pitr, because here we want GC to actually remove old page versions
zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_old_request_lsn", "main")
pg = env.postgres.create_start('test_old_request_lsn')
log.info('postgres is running on test_old_request_lsn branch')
@@ -26,7 +30,7 @@ def test_old_request_lsn(zenith_simple_env: ZenithEnv):
timeline = cur.fetchone()[0]
psconn = env.pageserver.connect()
pscur = psconn.cursor()
pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor)
# Create table, and insert some rows. Make it big enough that it doesn't fit in
# shared_buffers.
@@ -53,6 +57,9 @@ def test_old_request_lsn(zenith_simple_env: ZenithEnv):
# garbage collections so that the page server will remove old page versions.
for i in range(10):
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
for j in range(100):
cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;')

View File

@@ -0,0 +1,77 @@
import subprocess
from contextlib import closing
import psycopg2.extras
import pytest
from fixtures.log_helper import log
from fixtures.utils import print_gc_result
from fixtures.zenith_fixtures import ZenithEnvBuilder
#
# Check pitr_interval GC behavior.
# Insert some data, run GC and create a branch in the past.
#
def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder):
    """Check that data inside the PITR window survives an aggressive GC.

    Inserts 10000 rows on 'main' while remembering the LSN reached after the
    first 100, compacts and runs do_gc with a horizon of 0, then branches at
    the remembered LSN and verifies the row counts on both branches.
    """
    zenith_env_builder.num_safekeepers = 1
    # Set pitr interval such that we need to keep the data
    zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '1 day', gc_horizon = 0}"

    env = zenith_env_builder.init_start()
    pgmain = env.postgres.create_start('main')
    log.info("postgres is running on 'main' branch")

    main_cur = pgmain.connect().cursor()

    main_cur.execute("SHOW zenith.zenith_timeline")
    timeline = main_cur.fetchone()[0]

    # Create the table and fill it one row per statement.
    main_cur.execute('CREATE TABLE foo (t text)')
    insert_row_sql = ("\n"
                      "INSERT INTO foo\n"
                      "SELECT 'long string to consume some space';\n")
    for i in range(10000):
        main_cur.execute(insert_row_sql)

        if i != 99:
            continue

        # Remember an early LSN (right after the 100th row) so that a branch
        # can be created there once GC has run.
        main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()')
        lsn_a, xid_a = main_cur.fetchone()
        log.info(f'LSN after 100 rows: {lsn_a} xid {xid_a}')

    # Log the final position too; it is only used for debugging output.
    main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()')
    debug_lsn, debug_xid = main_cur.fetchone()
    log.info(f'LSN after 10000 rows: {debug_lsn} xid {debug_xid}')

    # Run compaction followed by GC on the pageserver.
    with closing(env.pageserver.connect()) as psconn:
        with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
            pscur.execute(f"compact {env.initial_tenant.hex} {timeline}")
            # Perform aggressive GC. Data should still be kept because of the
            # PITR setting.
            pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
            print_gc_result(pscur.fetchone())

    # Branch at the point where only 100 rows were inserted.
    # It must have been preserved by the PITR setting.
    env.zenith_cli.create_branch('test_pitr_gc_hundred', 'main', ancestor_start_lsn=lsn_a)
    pg_hundred = env.postgres.create_start('test_pitr_gc_hundred')

    # On the 'hundred' branch, we should see only 100 rows
    hundred_cur = pg_hundred.connect().cursor()
    hundred_cur.execute('SELECT count(*) FROM foo')
    assert hundred_cur.fetchone() == (100, )

    # All the rows are visible on the main branch
    main_cur.execute('SELECT count(*) FROM foo')
    assert main_cur.fetchone() == (10000, )

View File

@@ -16,7 +16,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}'''
env = zenith_env_builder.init_start()
"""Test per tenant configuration"""
tenant = env.zenith_cli.create_tenant(conf={
tenant, _ = env.zenith_cli.create_tenant(conf={
'checkpoint_distance': '20000',
'gc_period': '30sec',
})

View File

@@ -95,6 +95,10 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve
log.info('load thread stopped')
@pytest.mark.skip(
reason=
"needs to replace callmemaybe call with better idea how to migrate timelines between pageservers"
)
@pytest.mark.parametrize('with_load', ['with_load', 'without_load'])
def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
port_distributor: PortDistributor,
@@ -107,7 +111,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
# create folder for remote storage mock
remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage'
tenant = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209"))
tenant, _ = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209"))
log.info("tenant to relocate %s", tenant)
# attach does not download ancestor branches (should it?), just use root branch for now

View File

@@ -12,8 +12,8 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_safekeep
env = zenith_env_builder.init_start()
"""Tests tenants with and without wal acceptors"""
tenant_1 = env.zenith_cli.create_tenant()
tenant_2 = env.zenith_cli.create_tenant()
tenant_1, _ = env.zenith_cli.create_tenant()
tenant_2, _ = env.zenith_cli.create_tenant()
env.zenith_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}',
tenant_id=tenant_1)

View File

@@ -850,3 +850,116 @@ def test_wal_deleted_after_broadcast(zenith_env_builder: ZenithEnvBuilder):
# there shouldn't be more than 2 WAL segments (but dir may have archive_status files)
assert wal_size_after_checkpoint < 16 * 2.5
def test_delete_force(zenith_env_builder: ZenithEnvBuilder):
    """Exercise the safekeeper's forced timeline and tenant deletion endpoints.

    Runs with a single safekeeper. Four branches are created on the initial
    tenant and a second tenant is created alongside it. The test then deletes
    timelines one by one, deletes the whole initial tenant, and after every
    step checks both the HTTP response and which directories remain in the
    safekeeper's data dir. The second tenant must stay untouched and usable.
    """
    zenith_env_builder.num_safekeepers = 1
    env = zenith_env_builder.init_start()

    # Create two tenants: one will be deleted, other should be preserved.
    tenant_id = env.initial_tenant.hex
    timeline_id_1 = env.zenith_cli.create_branch('br1').hex  # Active, delete explicitly
    timeline_id_2 = env.zenith_cli.create_branch('br2').hex  # Inactive, delete explicitly
    timeline_id_3 = env.zenith_cli.create_branch('br3').hex  # Active, delete with the tenant
    timeline_id_4 = env.zenith_cli.create_branch('br4').hex  # Inactive, delete with the tenant

    tenant_id_other_uuid, timeline_id_other_uuid = env.zenith_cli.create_tenant()
    tenant_id_other = tenant_id_other_uuid.hex
    timeline_id_other = timeline_id_other_uuid.hex

    # Populate branches
    pg_1 = env.postgres.create_start('br1')
    pg_2 = env.postgres.create_start('br2')
    pg_3 = env.postgres.create_start('br3')
    pg_4 = env.postgres.create_start('br4')
    # Reuse the UUID returned by create_tenant() rather than rebuilding it
    # from its hex representation.
    pg_other = env.postgres.create_start('main', tenant_id=tenant_id_other_uuid)
    for pg in [pg_1, pg_2, pg_3, pg_4, pg_other]:
        with closing(pg.connect()) as conn:
            with conn.cursor() as cur:
                cur.execute('CREATE TABLE t(key int primary key)')

    sk = env.safekeepers[0]
    sk_data_dir = Path(sk.data_dir())
    sk_http = sk.http_client()
    assert (sk_data_dir / tenant_id / timeline_id_1).is_dir()
    assert (sk_data_dir / tenant_id / timeline_id_2).is_dir()
    assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
    assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

    # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state.
    pg_2.stop_and_destroy()
    pg_4.stop_and_destroy()
    sk.stop()
    sk.start()

    # Ensure connections to Safekeeper are established
    for pg in [pg_1, pg_3, pg_other]:
        with closing(pg.connect()) as conn:
            with conn.cursor() as cur:
                cur.execute('INSERT INTO t (key) VALUES (1)')

    # Remove initial tenant's br1 (active)
    assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == {
        "dir_existed": True,
        "was_active": True,
    }
    assert not (sk_data_dir / tenant_id / timeline_id_1).exists()
    assert (sk_data_dir / tenant_id / timeline_id_2).is_dir()
    assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
    assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

    # Ensure repeated deletion succeeds
    assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == {
        "dir_existed": False,
        "was_active": False,
    }
    assert not (sk_data_dir / tenant_id / timeline_id_1).exists()
    assert (sk_data_dir / tenant_id / timeline_id_2).is_dir()
    assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
    assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

    # Remove initial tenant's br2 (inactive)
    assert sk_http.timeline_delete_force(tenant_id, timeline_id_2) == {
        "dir_existed": True,
        "was_active": False,
    }
    assert not (sk_data_dir / tenant_id / timeline_id_1).exists()
    assert not (sk_data_dir / tenant_id / timeline_id_2).exists()
    assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
    assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

    # Remove non-existing branch, should succeed
    assert sk_http.timeline_delete_force(tenant_id, '00' * 16) == {
        "dir_existed": False,
        "was_active": False,
    }
    assert not (sk_data_dir / tenant_id / timeline_id_1).exists()
    assert not (sk_data_dir / tenant_id / timeline_id_2).exists()
    # is_dir() (not just exists()) to match the other assertion groups.
    assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
    assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

    # Remove initial tenant fully. br1 and br2 are already gone and br4's
    # compute was stopped above; the response is expected to list only br3's
    # timeline.
    # NOTE(review): presumably only timelines active on the safekeeper are
    # reported here - confirm against the safekeeper implementation.
    response = sk_http.tenant_delete_force(tenant_id)
    assert response == {
        timeline_id_3: {
            "dir_existed": True,
            "was_active": True,
        }
    }
    assert not (sk_data_dir / tenant_id).exists()
    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

    # Remove initial tenant again.
    response = sk_http.tenant_delete_force(tenant_id)
    assert response == {}
    assert not (sk_data_dir / tenant_id).exists()
    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

    # Ensure the other tenant still works
    sk_http.timeline_status(tenant_id_other, timeline_id_other)
    with closing(pg_other.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute('INSERT INTO t (key) VALUES (123)')

View File

@@ -1,7 +1,7 @@
import uuid
import requests
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
from fixtures.zenith_fixtures import DEFAULT_BRANCH_NAME, ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
from typing import cast
@@ -64,13 +64,13 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv):
helper_compare_tenant_list(pageserver_http_client, env)
# Create new tenant
tenant1 = env.zenith_cli.create_tenant()
tenant1, _ = env.zenith_cli.create_tenant()
# check tenant1 appeared
helper_compare_tenant_list(pageserver_http_client, env)
# Create new tenant
tenant2 = env.zenith_cli.create_tenant()
tenant2, _ = env.zenith_cli.create_tenant()
# check tenant2 appeared
helper_compare_tenant_list(pageserver_http_client, env)
@@ -83,6 +83,16 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv):
assert tenant2.hex in tenants
def test_cli_tenant_create(zenith_simple_env: ZenithEnv):
    """Creating a tenant through the CLI must also create its initial timeline."""
    cli = zenith_simple_env.zenith_cli
    new_tenant_id, _ = cli.create_tenant()
    tenant_timelines = cli.list_timelines(new_tenant_id)

    # an initial timeline should be created upon tenant creation
    assert len(tenant_timelines) == 1
    first_timeline = tenant_timelines[0]
    # The first element of a listed timeline entry is compared against the
    # default branch name.
    assert first_timeline[0] == DEFAULT_BRANCH_NAME
def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder):
# Start with single sk
zenith_env_builder.num_safekeepers = 1

View File

@@ -106,9 +106,9 @@ class ZenithCompare(PgCompare):
report=MetricReport.LOWER_IS_BETTER)
total_files = self.zenbenchmark.get_int_counter_value(
self.env.pageserver, "pageserver_num_persistent_files_created")
self.env.pageserver, "pageserver_created_persistent_files_total")
total_bytes = self.zenbenchmark.get_int_counter_value(
self.env.pageserver, "pageserver_persistent_bytes_written")
self.env.pageserver, "pageserver_written_persistent_bytes_total")
self.zenbenchmark.record("data_uploaded",
total_bytes / (1024 * 1024),
"MB",

View File

@@ -75,7 +75,8 @@ def lsn_from_hex(lsn_hex: str) -> int:
def print_gc_result(row):
log.info("GC duration {elapsed} ms".format_map(row))
log.info(
" total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}"
" total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_pitr {layers_needed_by_pitr}"
" needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}"
.format_map(row))

View File

@@ -831,20 +831,25 @@ class ZenithCli:
def create_tenant(self,
tenant_id: Optional[uuid.UUID] = None,
conf: Optional[Dict[str, str]] = None) -> uuid.UUID:
timeline_id: Optional[uuid.UUID] = None,
conf: Optional[Dict[str, str]] = None) -> Tuple[uuid.UUID, uuid.UUID]:
"""
Creates a new tenant, returns its id and its initial timeline's id.
"""
if tenant_id is None:
tenant_id = uuid.uuid4()
if timeline_id is None:
timeline_id = uuid.uuid4()
if conf is None:
res = self.raw_cli(['tenant', 'create', '--tenant-id', tenant_id.hex])
res = self.raw_cli([
'tenant', 'create', '--tenant-id', tenant_id.hex, '--timeline-id', timeline_id.hex
])
else:
res = self.raw_cli(
['tenant', 'create', '--tenant-id', tenant_id.hex] +
sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), []))
res = self.raw_cli([
'tenant', 'create', '--tenant-id', tenant_id.hex, '--timeline-id', timeline_id.hex
] + sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), []))
res.check_returncode()
return tenant_id
return tenant_id, timeline_id
def config_tenant(self, tenant_id: uuid.UUID, conf: Dict[str, str]):
"""
@@ -1795,6 +1800,21 @@ class SafekeeperHttpClient(requests.Session):
json=body)
res.raise_for_status()
def timeline_delete_force(self, tenant_id: str, timeline_id: str) -> Dict[Any, Any]:
    """Send a DELETE request for one timeline and return the decoded JSON reply.

    The request goes to /v1/tenant/{tenant_id}/timeline/{timeline_id} on
    localhost at self.port. A non-success HTTP status is raised via
    raise_for_status(); the reply body must decode to a dict.
    """
    url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
    response = self.delete(url)
    response.raise_for_status()
    payload = response.json()
    assert isinstance(payload, dict)
    return payload
def tenant_delete_force(self, tenant_id: str) -> Dict[Any, Any]:
    """Send a DELETE request for a whole tenant and return the decoded JSON reply.

    The request goes to /v1/tenant/{tenant_id} on localhost at self.port.
    A non-success HTTP status is raised via raise_for_status(); the reply
    body must decode to a dict.
    """
    url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}"
    response = self.delete(url)
    response.raise_for_status()
    payload = response.json()
    assert isinstance(payload, dict)
    return payload
def get_metrics(self) -> SafekeeperMetrics:
request_result = self.get(f"http://localhost:{self.port}/metrics")
request_result.raise_for_status()

View File

@@ -18,7 +18,6 @@ from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare
def test_bulk_insert(zenith_with_baseline: PgCompare):
env = zenith_with_baseline
# Get the timeline ID of our branch. We need it for the 'do_gc' command
with closing(env.pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute("create table huge (i int, j int);")

View File

@@ -30,7 +30,7 @@ def test_bulk_tenant_create(
for i in range(tenants_count):
start = timeit.default_timer()
tenant = env.zenith_cli.create_tenant()
tenant, _ = env.zenith_cli.create_tenant()
env.zenith_cli.create_timeline(
f'test_bulk_tenant_create_{tenants_count}_{i}_{use_safekeepers}', tenant_id=tenant)

View File

@@ -8,7 +8,6 @@ from fixtures.log_helper import log
import psycopg2.extras
import random
import time
from fixtures.utils import print_gc_result
# This is a clear-box test that demonstrates the worst case scenario for the