mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-07 13:32:57 +00:00
This PR contains the first version of [FoundationDB-like](https://www.youtube.com/watch?v=4fFDFbi3toc) simulation testing for safekeeper and walproposer. ### desim This is the core "framework" for running deterministic simulations. It operates on threads, which makes it possible to test synchronous code (like walproposer). `libs/desim/src/executor.rs` contains an implementation of deterministic thread execution. This is achieved by blocking all threads and, at each step, allowing only a single thread to make progress. All of the executor's threads are blocked using the `yield_me(after_ms)` function. This function is called when a thread wants to sleep or wait for an external notification (like blocking on a channel until it has a ready message). `libs/desim/src/chan.rs` contains an implementation of a channel (a basic sync primitive). It has unlimited capacity, and any thread can push messages to it or read messages from it. `libs/desim/src/network.rs` has a very naive implementation of a network (only reliable TCP-like connections are supported for now) that can apply arbitrary delays to each packet and inject failures that break connections with some probability. `libs/desim/src/world.rs` ties everything together, providing a concept of virtual nodes that can have network connections between them. ### walproposer_sim Has everything needed to run walproposer and safekeepers in a simulation. `safekeeper.rs` reimplements all the necessary parts of `receive_wal.rs`, `send_wal.rs` and `timelines_global_map.rs`. `walproposer_api.rs` implements all walproposer callbacks on top of the simulation library. `simulation.rs` defines a schedule – a set of events like `restart <sk>` or `write_wal` that should happen at time `<ts>`. It also has code to spawn walproposer/safekeeper threads and provide config to them. ### tests `simple_test.rs` has tests that just start walproposer and 3 safekeepers together in a simulation, and check that they do not crash right away.
`misc_test.rs` has tests checking more advanced simulation cases, like crashing or restarting threads, testing memory deallocation, etc. `random_test.rs` is the main test; it checks thousands of random seeds (schedules) for correctness. It roughly corresponds to running a real python integration test in an environment with a very unstable network and CPU, but in a deterministic way (each seed results in the same execution log) and much, much faster. Closes #547 --------- Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
156 lines
4.6 KiB
Rust
156 lines
4.6 KiB
Rust
use std::sync::Arc;
|
|
|
|
use tracing::{info, warn};
|
|
use utils::lsn::Lsn;
|
|
|
|
use crate::walproposer_sim::{
|
|
log::{init_logger, init_tracing_logger},
|
|
simulation::{generate_network_opts, generate_schedule, Schedule, TestAction, TestConfig},
|
|
};
|
|
|
|
pub mod walproposer_sim;
|
|
|
|
// Verify that the simulation copes with a safekeeper being restarted (crashed)
// while a walproposer is connected and has just written WAL.
#[test]
fn crash_safekeeper() {
    let config = TestConfig::new(Some(init_logger()));
    let sim = config.start(1337);

    // The safekeepers start out empty, so the initial sync must report LSN 0.
    let start_lsn = sim.sync_safekeepers().unwrap();
    assert_eq!(start_lsn, Lsn(0));
    info!("Sucessfully synced empty safekeepers at 0/0");

    let mut walproposer = sim.launch_walproposer(start_lsn);

    // Run briefly, write some WAL, and restart safekeeper 0 right away,
    // i.e. without giving replication a chance to finish first.
    sim.poll_for_duration(30);
    walproposer.write_tx(3);
    sim.servers[0].restart();

    // Keep the simulation running so that the walproposer can reconnect.
    sim.poll_for_duration(2000);
}
|
|
|
|
// Verify that a walproposer can be stopped (crashed) mid-run and that the
// safekeepers can still be synced afterwards.
#[test]
fn test_simple_restart() {
    let config = TestConfig::new(Some(init_logger()));
    let sim = config.start(1337);

    // The safekeepers start out empty, so the initial sync must report LSN 0.
    let start_lsn = sim.sync_safekeepers().unwrap();
    assert_eq!(start_lsn, Lsn(0));
    info!("Sucessfully synced empty safekeepers at 0/0");

    let mut walproposer = sim.launch_walproposer(start_lsn);

    // Write some WAL and let the simulation run for a while afterwards.
    sim.poll_for_duration(30);
    walproposer.write_tx(3);
    sim.poll_for_duration(100);

    // Stop the walproposer and release its handle before syncing again.
    walproposer.stop();
    drop(walproposer);

    // A second sync must succeed even though the walproposer is gone.
    let resynced_lsn = sim.sync_safekeepers().unwrap();
    info!("Sucessfully synced safekeepers at {}", resynced_lsn);
}
|
|
|
|
// Run a small hand-written schedule that restarts the walproposer and every
// safekeeper several times, interleaved with WAL writes.
#[test]
fn test_simple_schedule() -> anyhow::Result<()> {
    let mut config = TestConfig::new(Some(init_logger()));
    config.network.keepalive_timeout = Some(100);
    let sim = config.start(1337);

    // Each entry is (time, action); several actions may share the same time.
    let plan: Schedule = vec![
        (0, TestAction::RestartWalProposer),
        (50, TestAction::WriteTx(5)),
        // Restart the safekeepers one after another, writing after each restart.
        (100, TestAction::RestartSafekeeper(0)),
        (100, TestAction::WriteTx(5)),
        (110, TestAction::RestartSafekeeper(1)),
        (110, TestAction::WriteTx(5)),
        (120, TestAction::RestartSafekeeper(2)),
        (120, TestAction::WriteTx(5)),
        (201, TestAction::RestartWalProposer),
        // Restart all three safekeepers at the same time.
        (251, TestAction::RestartSafekeeper(0)),
        (251, TestAction::RestartSafekeeper(1)),
        (251, TestAction::RestartSafekeeper(2)),
        (251, TestAction::WriteTx(5)),
        (255, TestAction::WriteTx(5)),
        (1000, TestAction::WriteTx(5)),
    ];

    sim.run_schedule(&plan)?;
    info!("Test finished, stopping all threads");
    sim.world.deallocate();

    Ok(())
}
|
|
|
|
// Test that simulation can process 10^4 transactions.
//
// The schedule issues 100 `WriteTx(100)` actions. Afterwards the last
// `commit_lsn;<lsn>` event recorded by the world is compared against the
// hard-coded initdb LSN to check that enough WAL has been committed.
#[test]
fn test_many_tx() -> anyhow::Result<()> {
    let clock = init_logger();
    let config = TestConfig::new(Some(clock));
    let test = config.start(1337);

    // 100 batches of 100 transactions each, scheduled at times 0, 10, 20, ...
    let schedule: Schedule = (0..100)
        .map(|i| (i * 10, TestAction::WriteTx(100)))
        .collect();

    test.run_schedule(&schedule)?;
    info!("Test finished, stopping all threads");
    test.world.stop_all();

    let events = test.world.take_events();
    info!("Events: {:?}", events);

    // Commit events have the form "commit_lsn;<u64>"; only the last one matters.
    let last_commit_lsn = events
        .iter()
        .filter_map(|event| {
            let rest = event.data.strip_prefix("commit_lsn;")?;
            let lsn: u64 = rest
                .split(';')
                .next()?
                .parse()
                .expect("commit_lsn event must carry a numeric LSN");
            Some(lsn)
        })
        .last()
        .expect("simulation must produce at least one commit_lsn event");

    let initdb_lsn: u64 = 21623024;
    // Use checked subtraction: in builds without overflow checks a plain `-`
    // would wrap around and make the assertion below pass vacuously.
    let diff = last_commit_lsn
        .checked_sub(initdb_lsn)
        .expect("last commit LSN must not be below the initdb LSN");
    info!("Last commit lsn: {}, diff: {}", last_commit_lsn, diff);
    // Each tx is at least 8 bytes, and 100 txs are written in each of the 100 batches.
    assert!(diff > 100 * 100 * 8);
    Ok(())
}
|
|
|
|
// Checks that we don't have nasty circular dependencies, preventing Arc from deallocating.
// This test doesn't really assert anything, you need to run it manually and inspect the
// logged strong counts to check if there is any issue.
#[test]
fn test_res_dealloc() -> anyhow::Result<()> {
    let clock = init_tracing_logger(true);
    let mut config = TestConfig::new(Some(clock));

    // The same seed drives the network options, the simulation and the schedule.
    let seed = 123456;
    config.network = generate_network_opts(seed);
    let test = config.start(seed);
    warn!("Running test with seed {}", seed);

    let schedule = generate_schedule(seed);
    info!("schedule: {:?}", schedule);
    // Propagate failures with `?`, like the other schedule-based tests in this file.
    test.run_schedule(&schedule)?;
    test.world.stop_all();

    // Keep our own handle to the world so that its reference count can still
    // be inspected after the test harness itself has been dropped.
    let world = test.world.clone();
    drop(test);
    info!("world strong count: {}", Arc::strong_count(&world));
    world.deallocate();
    // Logged a second time so the counts before and after `deallocate` can be compared.
    info!("world strong count: {}", Arc::strong_count(&world));

    Ok(())
}
|