Compare commits

...

11 Commits

Author SHA1 Message Date
Christian Schwarz
a1063ead56 run just the flaky test in CI, ten times 2024-06-12 21:15:42 +02:00
Christian Schwarz
246c4c5ccf minimum amount of work 2024-06-12 21:15:28 +02:00
Christian Schwarz
e2d57fb61b Revert "make sure the getpage requests happens"
This reverts commit a62b2e3e76.
2024-06-12 21:11:43 +02:00
Christian Schwarz
a62b2e3e76 make sure the getpage requests happens 2024-06-12 21:11:33 +02:00
Christian Schwarz
4a190986b4 test_vm_bit_clear_on_heap_lock_blackbox: dump layer map while flaky code is running
refs https://github.com/neondatabase/neon/issues/6967
2024-06-12 21:09:35 +02:00
Christian Schwarz
17c51ad63a Merge remote-tracking branch 'origin/main' into bayandin/preserve_database_files-on-CI 2024-06-12 08:25:39 +02:00
Alexander Bayandin
29a2524788 A little cleanup 2024-06-10 11:30:19 +01:00
Alexander Bayandin
35ff8fc6c1 Explain preserve_database_files propogation a bit more
Co-authored-by: Christian Schwarz <christian@neon.tech>
2024-06-10 11:26:29 +01:00
Alexander Bayandin
184547634e CI(test_runner): do not preserve_database_files for test_sharding 2024-06-10 11:14:27 +01:00
Alexander Bayandin
070136e07e CI(test_runner): preserve_database_files for test_vm_bit_clear_on_heap_lock 2024-06-10 11:14:27 +01:00
Alexander Bayandin
7d9ea26530 CI(test_runner): Upload all test artifacts if preserve_database_files is enabled 2024-06-10 11:14:27 +01:00
13 changed files with 140 additions and 10 deletions

View File

@@ -447,6 +447,7 @@ jobs:
with:
build_type: ${{ matrix.build_type }}
test_selection: regress
extra_params: -k test_vm_bit_clear_on_heap_lock_blackbox
needs_postgres_source: true
run_with_real_s3: true
real_s3_bucket: neon-github-ci-tests
@@ -858,7 +859,7 @@ jobs:
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
tags: |
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
- name: Build neon extensions test image
if: matrix.version == 'v16'
uses: docker/build-push-action@v5

View File

@@ -4,6 +4,10 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
testing = [ "pageserver_api/testing" ]
[dependencies]
anyhow.workspace = true
async-trait.workspace = true

View File

@@ -383,6 +383,12 @@ impl PageServerNode {
.map(|x| x.parse::<AuxFilePolicy>())
.transpose()
.context("Failed to parse 'switch_aux_file_policy'")?,
#[cfg(feature = "testing")]
test_vm_bit_debug_logging: settings
.remove("test_vm_bit_debug_logging")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'test_vm_bit_debug_logging' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -506,6 +512,12 @@ impl PageServerNode {
.map(|x| x.parse::<AuxFilePolicy>())
.transpose()
.context("Failed to parse 'switch_aux_file_policy'")?,
#[cfg(feature = "testing")]
test_vm_bit_debug_logging: settings
.remove("test_vm_bit_debug_logging")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'test_vm_bit_debug_logging' as bool")?,
}
};

View File

@@ -4,6 +4,10 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
testing = []
[dependencies]
serde.workspace = true
serde_with.workspace = true

View File

@@ -322,6 +322,8 @@ pub struct TenantConfig {
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
pub switch_aux_file_policy: Option<AuxFilePolicy>,
#[cfg(feature = "testing")]
pub test_vm_bit_debug_logging: Option<bool>,
}
/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`

View File

@@ -8,7 +8,7 @@ license.workspace = true
default = []
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints"]
testing = ["fail/failpoints", "pageserver_api/testing"]
[dependencies]
anyhow.workspace = true

View File

@@ -3831,6 +3831,8 @@ pub(crate) mod harness {
tenant_conf.image_layer_creation_check_threshold,
),
switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
#[cfg(feature = "testing")]
test_vm_bit_debug_logging: Some(tenant_conf.test_vm_bit_debug_logging),
}
}
}

View File

@@ -377,6 +377,9 @@ pub struct TenantConf {
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
/// file is written.
pub switch_aux_file_policy: AuxFilePolicy,
#[cfg(feature = "testing")]
pub test_vm_bit_debug_logging: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -476,6 +479,11 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub switch_aux_file_policy: Option<AuxFilePolicy>,
#[cfg(feature = "testing")]
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub test_vm_bit_debug_logging: Option<bool>,
}
impl TenantConfOpt {
@@ -538,6 +546,10 @@ impl TenantConfOpt {
switch_aux_file_policy: self
.switch_aux_file_policy
.unwrap_or(global_conf.switch_aux_file_policy),
#[cfg(feature = "testing")]
test_vm_bit_debug_logging: self
.test_vm_bit_debug_logging
.unwrap_or(global_conf.test_vm_bit_debug_logging),
}
}
}
@@ -582,6 +594,8 @@ impl Default for TenantConf {
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
#[cfg(feature = "testing")]
test_vm_bit_debug_logging: false,
}
}
}
@@ -657,6 +671,8 @@ impl From<TenantConfOpt> for models::TenantConfig {
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
switch_aux_file_policy: value.switch_aux_file_policy,
#[cfg(feature = "testing")]
test_vm_bit_debug_logging: value.test_vm_bit_debug_logging,
}
}
}

View File

@@ -914,16 +914,59 @@ impl Timeline {
let mut reconstruct_state = ValuesReconstructState::new();
// Only add the cached image to the reconstruct state when it exists.
let cached_page_img_lsn = cached_page_img.as_ref().map(|(lsn, _)| *lsn);
if cached_page_img.is_some() {
let mut key_state = VectoredValueReconstructState::default();
key_state.img = cached_page_img;
reconstruct_state.keys.insert(key, Ok(key_state));
}
let debug_log = {
#[cfg(feature = "testing")]
{
self.get_test_vm_bit_debug_logging()
}
#[cfg(not(feature = "testing"))]
{
false
}
};
if debug_log {
tracing::info!(%key, %lsn, ?cached_page_img_lsn, "debug-logging page reconstruction");
}
if debug_log {
tracing::info!(
location = "before vectored get",
"debug-logging page reconstruction"
);
self.layers
.read()
.await
.layer_map()
.dump(false, ctx)
.await
.unwrap();
}
let vectored_res = self
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
.await;
if debug_log {
tracing::info!(
location = "before validation",
"debug-logging page reconstruction"
);
self.layers
.read()
.await
.layer_map()
.dump(false, ctx)
.await
.unwrap();
}
if self.conf.validate_vectored_get {
self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
.await;
@@ -2158,6 +2201,15 @@ impl Timeline {
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
}
#[cfg(feature = "testing")]
fn get_test_vm_bit_debug_logging(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.test_vm_bit_debug_logging
.unwrap_or(self.conf.default_tenant_conf.test_vm_bit_debug_logging)
}
fn get_image_layer_creation_check_threshold(&self) -> u8 {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2173,6 +2225,10 @@ impl Timeline {
pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
// NB: Most tenant conf options are read by background loops, so,
// changes will automatically be picked up.
#[cfg(feature = "testing")]
{
info!(?new_conf.test_vm_bit_debug_logging, "updating tenant conf");
}
// The threshold is embedded in the metric. So, we need to update it.
{

View File

@@ -1355,7 +1355,7 @@ def _shared_simple_env(
pg_distrib_dir=pg_distrib_dir,
pg_version=pg_version,
run_id=run_id,
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")),
test_name=request.node.name,
test_output_dir=test_output_dir,
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
@@ -1430,7 +1430,7 @@ def neon_env_builder(
pg_version=pg_version,
broker=default_broker,
run_id=run_id,
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")),
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
test_name=request.node.name,
test_output_dir=test_output_dir,
@@ -1439,6 +1439,11 @@ def neon_env_builder(
pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
) as builder:
yield builder
# Propogate `preserve_database_files` to make it possible to use in other fixtures,
# like `test_output_dir` fixture for attaching all database files to Allure report.
request.node.user_properties.append(
("preserve_database_files", builder.preserve_database_files)
)
@dataclass
@@ -4101,7 +4106,16 @@ def test_output_dir(
yield test_dir
allure_attach_from_dir(test_dir)
preserve_database_files = False
for k, v in request.node.user_properties:
# NB: the neon_env_builder fixture uses this fixture (test_output_dir).
# So, neon_env_builder's cleanup runs before here.
# The cleanup propagates NeonEnvBuilder.preserve_database_files into this user property.
if k == "preserve_database_files":
assert isinstance(v, bool)
preserve_database_files = v
allure_attach_from_dir(test_dir, preserve_database_files)
class FileAndThreadLock:

View File

@@ -240,9 +240,18 @@ ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
)
def allure_attach_from_dir(dir: Path):
def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False):
"""Attach all non-empty files from `dir` that matches `ATTACHMENT_NAME_REGEX` to Allure report"""
if preserve_database_files:
zst_file = dir.with_suffix(".tar.zst")
with zst_file.open("wb") as zst:
cctx = zstandard.ZstdCompressor()
with cctx.stream_writer(zst) as compressor:
with tarfile.open(fileobj=compressor, mode="w") as tar:
tar.add(dir, arcname="")
allure.attach.file(zst_file, "everything.tar.zst", "application/zstd", "tar.zst")
for attachment in Path(dir).glob("**/*"):
if ATTACHMENT_NAME_REGEX.fullmatch(attachment.name) and attachment.stat().st_size > 0:
name = str(attachment.relative_to(dir))

View File

@@ -50,8 +50,6 @@ def test_sharding_smoke(
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
neon_env_builder.preserve_database_files = True
env = neon_env_builder.init_start(
initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
)
@@ -366,8 +364,6 @@ def test_sharding_split_smoke(
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
neon_env_builder.preserve_database_files = True
non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024}
env = neon_env_builder.init_configs(True)

View File

@@ -1,6 +1,7 @@
import time
from contextlib import closing
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn
from fixtures.utils import query_scalar
@@ -121,6 +122,8 @@ def test_vm_bit_clear_on_heap_lock_whitebox(neon_env_builder: NeonEnvBuilder):
This is a repro for the bug fixed in commit 66fa176cc8.
"""
# Debugging https://github.com/neondatabase/neon/issues/6967
neon_env_builder.preserve_database_files = True
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start(
"main",
@@ -191,12 +194,16 @@ def test_vm_bit_clear_on_heap_lock_whitebox(neon_env_builder: NeonEnvBuilder):
assert vm_page_at_pageserver == vm_page_in_cache
@pytest.mark.repeat(10)
def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
"""
The previous test is enough to verify the bug that was fixed in
commit 66fa176cc8. But for good measure, we also reproduce the
original problem that the missing VM page update caused.
"""
# Debugging https://github.com/neondatabase/neon/issues/6967
neon_env_builder.preserve_database_files = True
tenant_conf = {
"checkpoint_distance": f"{128 * 1024}",
"compaction_target_size": f"{128 * 1024}",
@@ -287,6 +294,13 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
# already truncated away.
#
# ERROR: could not access status of transaction 1027
# Debugging https://github.com/neondatabase/neon/issues/6967
# the select() below fails occassionally at get_impl="vectored" validation
env.pageserver.http_client().patch_tenant_config_client_side(
tenant_id,
{"test_vm_bit_debug_logging": True},
)
cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 for update")
tup = cur.fetchall()
log.info(f"tuple = {tup}")