mirror of
https://github.com/neondatabase/neon.git
synced 2026-04-26 10:50:38 +00:00
Compare commits
11 Commits
release-co
...
problame/f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a1063ead56 | ||
|
|
246c4c5ccf | ||
|
|
e2d57fb61b | ||
|
|
a62b2e3e76 | ||
|
|
4a190986b4 | ||
|
|
17c51ad63a | ||
|
|
29a2524788 | ||
|
|
35ff8fc6c1 | ||
|
|
184547634e | ||
|
|
070136e07e | ||
|
|
7d9ea26530 |
3
.github/workflows/build_and_test.yml
vendored
3
.github/workflows/build_and_test.yml
vendored
@@ -447,6 +447,7 @@ jobs:
|
||||
with:
|
||||
build_type: ${{ matrix.build_type }}
|
||||
test_selection: regress
|
||||
extra_params: -k test_vm_bit_clear_on_heap_lock_blackbox
|
||||
needs_postgres_source: true
|
||||
run_with_real_s3: true
|
||||
real_s3_bucket: neon-github-ci-tests
|
||||
@@ -858,7 +859,7 @@ jobs:
|
||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
|
||||
tags: |
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
|
||||
- name: Build neon extensions test image
|
||||
if: matrix.version == 'v16'
|
||||
uses: docker/build-push-action@v5
|
||||
|
||||
@@ -4,6 +4,10 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
testing = [ "pageserver_api/testing" ]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
|
||||
@@ -383,6 +383,12 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
#[cfg(feature = "testing")]
|
||||
test_vm_bit_debug_logging: settings
|
||||
.remove("test_vm_bit_debug_logging")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'test_vm_bit_debug_logging' as bool")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -506,6 +512,12 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
#[cfg(feature = "testing")]
|
||||
test_vm_bit_debug_logging: settings
|
||||
.remove("test_vm_bit_debug_logging")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'test_vm_bit_debug_logging' as bool")?,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -4,6 +4,10 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
|
||||
@@ -322,6 +322,8 @@ pub struct TenantConfig {
|
||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
pub switch_aux_file_policy: Option<AuxFilePolicy>,
|
||||
#[cfg(feature = "testing")]
|
||||
pub test_vm_bit_debug_logging: Option<bool>,
|
||||
}
|
||||
|
||||
/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
|
||||
|
||||
@@ -8,7 +8,7 @@ license.workspace = true
|
||||
default = []
|
||||
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints"]
|
||||
testing = ["fail/failpoints", "pageserver_api/testing"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
|
||||
@@ -3831,6 +3831,8 @@ pub(crate) mod harness {
|
||||
tenant_conf.image_layer_creation_check_threshold,
|
||||
),
|
||||
switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
|
||||
#[cfg(feature = "testing")]
|
||||
test_vm_bit_debug_logging: Some(tenant_conf.test_vm_bit_debug_logging),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -377,6 +377,9 @@ pub struct TenantConf {
|
||||
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
|
||||
/// file is written.
|
||||
pub switch_aux_file_policy: AuxFilePolicy,
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub test_vm_bit_debug_logging: bool,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
@@ -476,6 +479,11 @@ pub struct TenantConfOpt {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub switch_aux_file_policy: Option<AuxFilePolicy>,
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub test_vm_bit_debug_logging: Option<bool>,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -538,6 +546,10 @@ impl TenantConfOpt {
|
||||
switch_aux_file_policy: self
|
||||
.switch_aux_file_policy
|
||||
.unwrap_or(global_conf.switch_aux_file_policy),
|
||||
#[cfg(feature = "testing")]
|
||||
test_vm_bit_debug_logging: self
|
||||
.test_vm_bit_debug_logging
|
||||
.unwrap_or(global_conf.test_vm_bit_debug_logging),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -582,6 +594,8 @@ impl Default for TenantConf {
|
||||
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
||||
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||
switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
|
||||
#[cfg(feature = "testing")]
|
||||
test_vm_bit_debug_logging: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -657,6 +671,8 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
|
||||
switch_aux_file_policy: value.switch_aux_file_policy,
|
||||
#[cfg(feature = "testing")]
|
||||
test_vm_bit_debug_logging: value.test_vm_bit_debug_logging,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -914,16 +914,59 @@ impl Timeline {
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
|
||||
// Only add the cached image to the reconstruct state when it exists.
|
||||
let cached_page_img_lsn = cached_page_img.as_ref().map(|(lsn, _)| *lsn);
|
||||
if cached_page_img.is_some() {
|
||||
let mut key_state = VectoredValueReconstructState::default();
|
||||
key_state.img = cached_page_img;
|
||||
reconstruct_state.keys.insert(key, Ok(key_state));
|
||||
}
|
||||
|
||||
let debug_log = {
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
self.get_test_vm_bit_debug_logging()
|
||||
}
|
||||
#[cfg(not(feature = "testing"))]
|
||||
{
|
||||
false
|
||||
}
|
||||
};
|
||||
|
||||
if debug_log {
|
||||
tracing::info!(%key, %lsn, ?cached_page_img_lsn, "debug-logging page reconstruction");
|
||||
}
|
||||
|
||||
if debug_log {
|
||||
tracing::info!(
|
||||
location = "before vectored get",
|
||||
"debug-logging page reconstruction"
|
||||
);
|
||||
self.layers
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.dump(false, ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
if debug_log {
|
||||
tracing::info!(
|
||||
location = "before validation",
|
||||
"debug-logging page reconstruction"
|
||||
);
|
||||
self.layers
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.dump(false, ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
if self.conf.validate_vectored_get {
|
||||
self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
|
||||
.await;
|
||||
@@ -2158,6 +2201,15 @@ impl Timeline {
|
||||
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn get_test_vm_bit_debug_logging(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.test_vm_bit_debug_logging
|
||||
.unwrap_or(self.conf.default_tenant_conf.test_vm_bit_debug_logging)
|
||||
}
|
||||
|
||||
fn get_image_layer_creation_check_threshold(&self) -> u8 {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
@@ -2173,6 +2225,10 @@ impl Timeline {
|
||||
pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
|
||||
// NB: Most tenant conf options are read by background loops, so,
|
||||
// changes will automatically be picked up.
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
info!(?new_conf.test_vm_bit_debug_logging, "updating tenant conf");
|
||||
}
|
||||
|
||||
// The threshold is embedded in the metric. So, we need to update it.
|
||||
{
|
||||
|
||||
@@ -1355,7 +1355,7 @@ def _shared_simple_env(
|
||||
pg_distrib_dir=pg_distrib_dir,
|
||||
pg_version=pg_version,
|
||||
run_id=run_id,
|
||||
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
|
||||
preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")),
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
|
||||
@@ -1430,7 +1430,7 @@ def neon_env_builder(
|
||||
pg_version=pg_version,
|
||||
broker=default_broker,
|
||||
run_id=run_id,
|
||||
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
|
||||
preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")),
|
||||
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
@@ -1439,6 +1439,11 @@ def neon_env_builder(
|
||||
pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
|
||||
) as builder:
|
||||
yield builder
|
||||
# Propogate `preserve_database_files` to make it possible to use in other fixtures,
|
||||
# like `test_output_dir` fixture for attaching all database files to Allure report.
|
||||
request.node.user_properties.append(
|
||||
("preserve_database_files", builder.preserve_database_files)
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -4101,7 +4106,16 @@ def test_output_dir(
|
||||
|
||||
yield test_dir
|
||||
|
||||
allure_attach_from_dir(test_dir)
|
||||
preserve_database_files = False
|
||||
for k, v in request.node.user_properties:
|
||||
# NB: the neon_env_builder fixture uses this fixture (test_output_dir).
|
||||
# So, neon_env_builder's cleanup runs before here.
|
||||
# The cleanup propagates NeonEnvBuilder.preserve_database_files into this user property.
|
||||
if k == "preserve_database_files":
|
||||
assert isinstance(v, bool)
|
||||
preserve_database_files = v
|
||||
|
||||
allure_attach_from_dir(test_dir, preserve_database_files)
|
||||
|
||||
|
||||
class FileAndThreadLock:
|
||||
|
||||
@@ -240,9 +240,18 @@ ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
|
||||
)
|
||||
|
||||
|
||||
def allure_attach_from_dir(dir: Path):
|
||||
def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False):
|
||||
"""Attach all non-empty files from `dir` that matches `ATTACHMENT_NAME_REGEX` to Allure report"""
|
||||
|
||||
if preserve_database_files:
|
||||
zst_file = dir.with_suffix(".tar.zst")
|
||||
with zst_file.open("wb") as zst:
|
||||
cctx = zstandard.ZstdCompressor()
|
||||
with cctx.stream_writer(zst) as compressor:
|
||||
with tarfile.open(fileobj=compressor, mode="w") as tar:
|
||||
tar.add(dir, arcname="")
|
||||
allure.attach.file(zst_file, "everything.tar.zst", "application/zstd", "tar.zst")
|
||||
|
||||
for attachment in Path(dir).glob("**/*"):
|
||||
if ATTACHMENT_NAME_REGEX.fullmatch(attachment.name) and attachment.stat().st_size > 0:
|
||||
name = str(attachment.relative_to(dir))
|
||||
|
||||
@@ -50,8 +50,6 @@ def test_sharding_smoke(
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
neon_env_builder.enable_scrub_on_exit()
|
||||
|
||||
neon_env_builder.preserve_database_files = True
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
|
||||
)
|
||||
@@ -366,8 +364,6 @@ def test_sharding_split_smoke(
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
neon_env_builder.enable_scrub_on_exit()
|
||||
|
||||
neon_env_builder.preserve_database_files = True
|
||||
|
||||
non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024}
|
||||
|
||||
env = neon_env_builder.init_configs(True)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import time
|
||||
from contextlib import closing
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn
|
||||
from fixtures.utils import query_scalar
|
||||
@@ -121,6 +122,8 @@ def test_vm_bit_clear_on_heap_lock_whitebox(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
This is a repro for the bug fixed in commit 66fa176cc8.
|
||||
"""
|
||||
# Debugging https://github.com/neondatabase/neon/issues/6967
|
||||
neon_env_builder.preserve_database_files = True
|
||||
env = neon_env_builder.init_start()
|
||||
endpoint = env.endpoints.create_start(
|
||||
"main",
|
||||
@@ -191,12 +194,16 @@ def test_vm_bit_clear_on_heap_lock_whitebox(neon_env_builder: NeonEnvBuilder):
|
||||
assert vm_page_at_pageserver == vm_page_in_cache
|
||||
|
||||
|
||||
@pytest.mark.repeat(10)
|
||||
def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
The previous test is enough to verify the bug that was fixed in
|
||||
commit 66fa176cc8. But for good measure, we also reproduce the
|
||||
original problem that the missing VM page update caused.
|
||||
"""
|
||||
# Debugging https://github.com/neondatabase/neon/issues/6967
|
||||
neon_env_builder.preserve_database_files = True
|
||||
|
||||
tenant_conf = {
|
||||
"checkpoint_distance": f"{128 * 1024}",
|
||||
"compaction_target_size": f"{128 * 1024}",
|
||||
@@ -287,6 +294,13 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
|
||||
# already truncated away.
|
||||
#
|
||||
# ERROR: could not access status of transaction 1027
|
||||
|
||||
# Debugging https://github.com/neondatabase/neon/issues/6967
|
||||
# the select() below fails occassionally at get_impl="vectored" validation
|
||||
env.pageserver.http_client().patch_tenant_config_client_side(
|
||||
tenant_id,
|
||||
{"test_vm_bit_debug_logging": True},
|
||||
)
|
||||
cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 for update")
|
||||
tup = cur.fetchall()
|
||||
log.info(f"tuple = {tup}")
|
||||
|
||||
Reference in New Issue
Block a user