Compare commits

...

50 Commits

Author SHA1 Message Date
Arthur Petukhovsky
6a00ad3aab Validate logs 2023-09-18 17:12:40 +00:00
Arthur Petukhovsky
61e6b24cb2 Fix fd leak 2023-09-18 09:54:19 +00:00
Arthur Petukhovsky
44c7d96ed0 Cleanup resources better 2023-09-16 23:10:53 +00:00
Arthur Petukhovsky
10ad3ae4eb Fix WAL page header 2023-09-16 20:50:10 +00:00
Arthur Petukhovsky
eb2886b401 Add test for 1000 WAL messages 2023-09-16 14:58:26 +00:00
Arthur Petukhovsky
0dc262a84a Fix bug in walproposer voting 2023-08-29 14:11:04 +00:00
Arthur Petukhovsky
d801ba7248 Print network config 2023-08-29 14:10:57 +00:00
Arthur Petukhovsky
1effb586ba Make network random unstable 2023-08-29 13:55:36 +00:00
Arthur Petukhovsky
2fd351fd63 Hide debug logs 2023-08-29 10:05:18 +00:00
Arthur Petukhovsky
13e94bf687 Fix truncateLsn bug 2023-08-29 09:03:22 +00:00
Arthur Petukhovsky
41b9750e81 Run many schedules 2023-08-24 23:42:11 +00:00
Arthur Petukhovsky
f8729f046d Fix excessive logs 2023-08-24 17:25:44 +00:00
Arthur Petukhovsky
420d3bc18f Add simulation schedule 2023-08-24 15:24:38 +00:00
Arthur Petukhovsky
33f7877d1b Show simulation time in logs 2023-08-23 10:10:11 +00:00
Arthur Petukhovsky
7de94c959a Support walproposer recovery 2023-08-22 23:15:46 +00:00
Arthur Petukhovsky
731ed3bb64 Support virtual disk in tests 2023-08-17 13:09:55 +00:00
Arthur Petukhovsky
413ce2cfe8 Crash safekeepers 2023-08-17 10:36:23 +00:00
Arthur Petukhovsky
7f36028fab Generate WAL in tests 2023-08-03 16:58:41 +00:00
Arthur Petukhovsky
cb6a8d3fe3 Fix some warnings 2023-07-28 21:37:16 +00:00
Arthur Petukhovsky
095747afc0 Fix walproposer main loop 2023-07-28 21:18:08 +00:00
Arthur Petukhovsky
89bd7ab8a3 Fix read/write in walproposer 2023-07-28 15:14:24 +00:00
Arthur Petukhovsky
5034a8cca0 WIP 2023-07-26 22:51:19 +02:00
Arthur Petukhovsky
55e40d090e Run sync several times 2023-07-25 11:16:47 +00:00
Arthur Petukhovsky
d87e822169 Return LSN from sync safekeepers 2023-07-24 21:15:35 +00:00
Arthur Petukhovsky
296a0cbac2 Add -DSIMLIB 2023-07-21 15:40:47 +00:00
Arthur Petukhovsky
aed14f52d5 Test sync safekeepers 2023-06-03 19:11:28 +00:00
Arthur Petukhovsky
909d7fadb8 Implement simlib sk server 2023-06-02 14:49:55 +00:00
Arthur Petukhovsky
3840d6b18b Clean up C API 2023-06-01 09:38:07 +00:00
Arthur Petukhovsky
65f92232e6 Compile walproposer 2023-05-31 21:06:47 +00:00
Arthur Petukhovsky
0d4f987fc8 Implement full simlib C API 2023-05-31 20:25:25 +00:00
Arthur Petukhovsky
aa0763d49d Run simulator on C code 2023-05-31 16:55:16 +00:00
Arthur Petukhovsky
7b5123edda Fix elog 2023-05-31 15:06:26 +00:00
Arthur Petukhovsky
b6a80bc269 Link postgres to rust statically 2023-05-31 13:19:41 +00:00
Arthur Petukhovsky
ac82b34c64 Create more involved example 2023-05-30 16:43:33 +00:00
Arthur Petukhovsky
a77fc2c5ff Test Rust -> C -> Rust codepath 2023-05-30 16:38:32 +00:00
Arthur Petukhovsky
9ccbec0e14 Spend some time 2023-05-26 14:45:25 +03:00
Arthur Petukhovsky
b55005d2c4 Build simple C func example 2023-05-26 14:44:48 +03:00
Arthur Petukhovsky
6436432a77 Showcase network failures 2023-05-25 12:53:20 +03:00
Arthur Petukhovsky
1b8918e665 Add accept, close and delays to the network 2023-05-25 12:26:57 +03:00
Arthur Petukhovsky
87c9edac7c Add basic support for network delays 2023-05-24 20:28:53 +03:00
Arthur Petukhovsky
5e0550a620 Add os.sleep and os.random 2023-05-24 15:51:30 +03:00
Arthur Petukhovsky
06f493f525 Extract simlib 2023-05-24 13:06:42 +03:00
Arthur Petukhovsky
f6b540ebfe Add initial support for virtual time 2023-05-22 15:00:56 +03:00
Arthur Petukhovsky
83f87af02b Remove sync debug 2023-03-10 00:10:09 +02:00
Arthur Petukhovsky
79823c38cd It looks deterministic now 2023-03-10 00:03:35 +02:00
Arthur Petukhovsky
072fb3d7e9 WIP 2023-03-09 14:59:03 +02:00
Arthur Petukhovsky
f2fb9f6be9 WIP 2023-03-09 14:51:29 +02:00
Arthur Petukhovsky
dd4c8fb568 WIP 2023-03-09 00:51:14 +02:00
Arthur Petukhovsky
9116c01614 WIP 2023-03-08 18:45:13 +02:00
Arthur Petukhovsky
17cd96e022 WIP 2023-03-03 20:33:55 +00:00
48 changed files with 5623 additions and 76 deletions

View File

@@ -14,3 +14,6 @@ opt-level = 1
[alias]
build_testing = ["build", "--features", "testing"]
[build]
rustflags = ["-C", "default-linker-libraries"]

2
.gitignore vendored
View File

@@ -18,3 +18,5 @@ test_output/
*.o
*.so
*.Po
tmp

81
Cargo.lock generated
View File

@@ -679,6 +679,25 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cbindgen"
version = "0.24.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b922faaf31122819ec80c4047cc684c6979a087366c069611e33649bf98e18d"
dependencies = [
"clap 3.2.23",
"heck",
"indexmap",
"log",
"proc-macro2",
"quote",
"serde",
"serde_json",
"syn",
"tempfile",
"toml",
]
[[package]]
name = "cc"
version = "1.0.79"
@@ -757,9 +776,12 @@ version = "3.2.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5"
dependencies = [
"atty",
"bitflags",
"clap_lex 0.2.4",
"indexmap",
"strsim",
"termcolor",
"textwrap",
]
@@ -1014,6 +1036,20 @@ version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52"
[[package]]
name = "crossbeam"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c"
dependencies = [
"cfg-if",
"crossbeam-channel",
"crossbeam-deque",
"crossbeam-epoch",
"crossbeam-queue",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.6"
@@ -1048,6 +1084,16 @@ dependencies = [
"scopeguard",
]
[[package]]
name = "crossbeam-queue"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add"
dependencies = [
"cfg-if",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.14"
@@ -3311,6 +3357,7 @@ dependencies = [
"clap 4.1.4",
"const_format",
"crc32c",
"crossbeam",
"fs2",
"git-version",
"hex",
@@ -3324,9 +3371,11 @@ dependencies = [
"postgres-protocol",
"postgres_ffi",
"pq_proto",
"rand",
"regex",
"remote_storage",
"safekeeper_api",
"scopeguard",
"serde",
"serde_json",
"serde_with",
@@ -4588,6 +4637,38 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "walproposer"
version = "0.1.0"
dependencies = [
"anyhow",
"atty",
"bindgen",
"byteorder",
"bytes",
"cbindgen",
"crc32c",
"env_logger",
"hex",
"hyper",
"libc",
"log",
"memoffset 0.8.0",
"once_cell",
"postgres",
"postgres_ffi",
"rand",
"regex",
"safekeeper",
"scopeguard",
"serde",
"thiserror",
"tracing",
"tracing-subscriber",
"utils",
"workspace_hack",
]
[[package]]
name = "want"
version = "0.3.0"

View File

@@ -138,10 +138,12 @@ postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
safekeeper = { path = "./safekeeper/" }
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }
walproposer = { version = "0.1", path = "./libs/walproposer/" }
## Common library dependency
workspace_hack = { version = "0.1", path = "./workspace_hack/" }

View File

@@ -39,6 +39,8 @@ endif
# been no changes to the files. Changing the mtime triggers an
# unnecessary rebuild of 'postgres_ffi'.
PG_CONFIGURE_OPTS += INSTALL='$(ROOT_PROJECT_DIR)/scripts/ninstall.sh -C'
PG_CONFIGURE_OPTS += CC=clang
# Fix: the autoconf variable for the C++ compiler is CXX; "CCX" was a typo
# that configure silently ignored, leaving the default C++ compiler in use.
PG_CONFIGURE_OPTS += CXX=clang++
# Choose whether we should be silent or verbose
CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
@@ -134,6 +136,12 @@ neon-pg-ext-%: postgres-%
-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
# Build and install the neon extension objects used by the walproposer
# simulator (always against the v15 build tree).
# Fix: the bare ".PHONY:" declared no target; name it explicitly, matching
# the ".PHONY: neon-pg-ext-clean-%" convention below.
.PHONY: neon-pg-ext-walproposer
neon-pg-ext-walproposer:
	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
		-C $(POSTGRES_INSTALL_DIR)/build/neon-v15 \
		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
.PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \

4
libs/walproposer/.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
*.a
*.o
*.tmp
pgdata

View File

@@ -0,0 +1,39 @@
[package]
name = "walproposer"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
atty.workspace = true
rand.workspace = true
regex.workspace = true
bytes.workspace = true
byteorder.workspace = true
anyhow.workspace = true
crc32c.workspace = true
hex.workspace = true
once_cell.workspace = true
log.workspace = true
libc.workspace = true
memoffset.workspace = true
thiserror.workspace = true
tracing.workspace = true
tracing-subscriber = { workspace = true, features = ["json"] }
serde.workspace = true
scopeguard.workspace = true
utils.workspace = true
safekeeper.workspace = true
postgres_ffi.workspace = true
hyper.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
env_logger.workspace = true
postgres.workspace = true
[build-dependencies]
anyhow.workspace = true
bindgen.workspace = true
cbindgen = "0.24.0"

View File

@@ -0,0 +1,16 @@
# walproposer Rust module
## Rust -> C
We compile walproposer as a static library and generate Rust bindings for it using `bindgen`.
Entrypoint header file is `bindgen_deps.h`.
## C -> Rust
We use `cbindgen` to generate C bindings for the Rust code. They are stored in `rust_bindings.h`.
## How to run the tests
```
export RUSTFLAGS="-C default-linker-libraries"
cargo test
```

View File

@@ -0,0 +1,30 @@
/*
 * This header file is the input to bindgen. It includes all the
 * PostgreSQL headers that we need to auto-generate Rust structs
 * from. If you need to expose a new struct to Rust code, add the
 * header here, and whitelist the struct in the build.rs file.
 */
#include "c.h"
#include "walproposer.h"
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
// Calc a sum of two numbers. Used to test Rust->C function calls.
int TestFunc(int a, int b);
// Run a client for simple simlib test.
void RunClientC(uint32_t serverId);
// NOTE(review): presumably the walproposer main-loop entry driven from the
// Rust simulator -- confirm against the C implementation.
void WalProposerRust();
// NOTE(review): presumably releases resources from a walproposer run --
// confirm against the C implementation.
void WalProposerCleanup();
// When true the C side emits verbose logging (toggled via Rust enable_debug()).
extern bool debug_enabled;
// Initialize global variables before calling any Postgres C code.
void MyContextInit();
// Insert a test logical-message WAL record; returns its end LSN.
XLogRecPtr MyInsertRecord();

137
libs/walproposer/build.rs Normal file
View File

@@ -0,0 +1,137 @@
use std::{env, path::PathBuf, process::Command};
use anyhow::{anyhow, Context};
use bindgen::CargoCallbacks;
extern crate bindgen;
/// Build script for the `walproposer` crate.
///
/// 1. Generates `rust_bindings.h` (C prototypes for this crate's
///    `extern "C"` functions) via cbindgen.
/// 2. Runs `./build.sh` to build the C static libraries and emits the
///    linker directives needed to pull them in.
/// 3. Generates Rust bindings for the C side (entrypoint `bindgen_deps.h`)
///    via bindgen, written to `$OUT_DIR/bindings.rs`.
fn main() -> anyhow::Result<()> {
    let crate_dir = env::var("CARGO_MANIFEST_DIR").context("CARGO_MANIFEST_DIR is not set")?;

    // C -> Rust callback prototypes.
    cbindgen::Builder::new()
        .with_crate(&crate_dir)
        .with_language(cbindgen::Language::C)
        .generate()
        .expect("Unable to generate bindings")
        .write_to_file("rust_bindings.h");

    // Tell cargo to invalidate the built crate whenever a wrapper changes.
    // BUGFIX: `cargo:rerun-if-changed` takes ONE path per directive; the old
    // comma-separated list was treated as a single nonexistent path, so
    // changes to these files never triggered a rebuild.
    for dep in [
        "bindgen_deps.h",
        "test.c",
        "../../pgxn/neon/walproposer.c",
        "build.sh",
    ] {
        println!("cargo:rerun-if-changed={dep}");
    }

    // The static libraries reference each other, so link them as one group
    // and let the linker iterate until all symbols resolve.
    println!("cargo:rustc-link-arg=-Wl,--start-group");
    println!("cargo:rustc-link-arg=-lsim");
    println!("cargo:rustc-link-arg=-lpgport_srv");
    println!("cargo:rustc-link-arg=-lpostgres");
    println!("cargo:rustc-link-arg=-lpgcommon_srv");
    println!("cargo:rustc-link-arg=-lssl");
    println!("cargo:rustc-link-arg=-lcrypto");
    println!("cargo:rustc-link-arg=-lz");
    println!("cargo:rustc-link-arg=-lpthread");
    println!("cargo:rustc-link-arg=-lrt");
    println!("cargo:rustc-link-arg=-ldl");
    println!("cargo:rustc-link-arg=-lm");
    println!("cargo:rustc-link-arg=-lwalproposer");
    println!("cargo:rustc-link-arg=-Wl,--end-group");
    // The .a files are produced by build.sh next to this build script;
    // FIX: search the crate directory instead of a hard-coded machine path
    // (/home/admin/simulator/...), which broke every other checkout.
    println!("cargo:rustc-link-search={crate_dir}");
    // disable fPIE
    println!("cargo:rustc-link-arg=-no-pie");

    // Build the C static libraries, forwarding the script's output so build
    // failures are debuggable from cargo's captured stdout/stderr.
    let output = Command::new("./build.sh")
        .output()
        .context("could not spawn `./build.sh`")?;
    println!("stdout: {}", String::from_utf8_lossy(&output.stdout));
    println!("stderr: {}", String::from_utf8_lossy(&output.stderr));
    if !output.status.success() {
        // Panic so cargo surfaces the C build failure.
        panic!("could not compile object file");
    }

    // Finding the location of C headers for the Postgres server:
    // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into
    //   `<project_root>/pg_install`
    // - if there's a `bin/pg_config` file use it for getting include server,
    //   otherwise use `<project_root>/pg_install/{PG_MAJORVERSION}/include/postgresql/server`
    let pg_install_dir: PathBuf = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
        postgres_install_dir.into()
    } else {
        PathBuf::from("pg_install")
    };
    let pg_version = "v15";
    let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
    if pg_install_dir_versioned.is_relative() {
        // Build scripts run with the crate dir as cwd; the default install
        // dir lives at the repository root, two levels up.
        let cwd = env::current_dir().context("Failed to get current_dir")?;
        pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned);
    }
    // BUGFIX: the version component was joined a second time here, yielding
    // `.../v15/v15/bin/pg_config`, so pg_config was never found and the code
    // always fell back to the guessed include path.
    let pg_config_bin = pg_install_dir_versioned.join("bin").join("pg_config");
    let inc_server_path: String = if pg_config_bin.exists() {
        let output = Command::new(pg_config_bin)
            .arg("--includedir-server")
            .output()
            .context("failed to execute `pg_config --includedir-server`")?;
        if !output.status.success() {
            panic!("`pg_config --includedir-server` failed")
        }
        String::from_utf8(output.stdout)
            .context("pg_config output is not UTF-8")?
            .trim_end()
            .into()
    } else {
        let server_path = pg_install_dir_versioned
            .join("include")
            .join("postgresql")
            .join("server")
            .into_os_string();
        server_path
            .into_string()
            .map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
    };
    // neon extension headers, resolved relative to this crate rather than a
    // hard-coded machine path.
    let inc_pgxn_path = format!("{crate_dir}/../../pgxn/neon");

    // Rust -> C: generate bindings for the allowlisted functions/variables
    // reachable from bindgen_deps.h.
    let bindings = bindgen::Builder::default()
        .header("bindgen_deps.h")
        // Invalidate the build when any transitively included header changes.
        .parse_callbacks(Box::new(CargoCallbacks))
        .allowlist_function("TestFunc")
        .allowlist_function("RunClientC")
        .allowlist_function("WalProposerRust")
        .allowlist_function("MyContextInit")
        .allowlist_function("WalProposerCleanup")
        .allowlist_function("MyInsertRecord")
        .allowlist_var("wal_acceptors_list")
        .allowlist_var("wal_acceptor_reconnect_timeout")
        .allowlist_var("wal_acceptor_connection_timeout")
        .allowlist_var("am_wal_proposer")
        .allowlist_var("neon_timeline_walproposer")
        .allowlist_var("neon_tenant_walproposer")
        .allowlist_var("syncSafekeepers")
        .allowlist_var("sim_redo_start_lsn")
        .allowlist_var("debug_enabled")
        .clang_arg(format!("-I{inc_server_path}"))
        .clang_arg(format!("-I{inc_pgxn_path}"))
        .clang_arg("-DSIMLIB")
        .generate()
        .expect("Unable to generate bindings");

    // Write the bindings to the $OUT_DIR/bindings.rs file.
    let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs");
    bindings
        .write_to_file(out_path)
        .expect("Couldn't write bindings!");
    Ok(())
}

21
libs/walproposer/build.sh Executable file
View File

@@ -0,0 +1,21 @@
#!/bin/bash
# Build the C static libraries the walproposer crate links against:
#   libsim.a         - simulator glue (test.c + libpqwalproposer.c)
#   libwalproposer.a - walproposer objects from the neon extension build
# plus the Postgres server libraries copied next to this script.
set -euo pipefail
# FIX: work from this script's own directory instead of a hard-coded
# /home/admin/simulator path, so the build works in any checkout location.
cd "$(dirname "$0")"
# TODO: rewrite to Makefile
make -C ../.. neon-pg-ext-walproposer
make -C ../../pg_install/build/v15/src/backend postgres-lib -s
cp ../../pg_install/build/v15/src/backend/libpostgres.a .
cp ../../pg_install/build/v15/src/common/libpgcommon_srv.a .
cp ../../pg_install/build/v15/src/port/libpgport_srv.a .
clang -g -c libpqwalproposer.c test.c -ferror-limit=1 -I ../../pg_install/v15/include/postgresql/server -I ../../pgxn/neon
rm -rf libsim.a
ar rcs libsim.a test.o libpqwalproposer.o
rm -rf libwalproposer.a
PGXN_DIR=../../pg_install/build/neon-v15/
ar rcs libwalproposer.a $PGXN_DIR/walproposer.o $PGXN_DIR/walproposer_utils.o $PGXN_DIR/neon.o

View File

@@ -0,0 +1,542 @@
#include "postgres.h"
#include "neon.h"
#include "walproposer.h"
#include "rust_bindings.h"
#include "replication/message.h"
#include "access/xlog_internal.h"
// defined in walproposer.h
// LSN at which simulated WAL generation starts (set before use; exposed to
// Rust via bindgen's allowlist).
uint64 sim_redo_start_lsn;
// Highest LSN generated so far and available for broadcast.
XLogRecPtr sim_latest_available_lsn;
/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
/* In the simulator there is no real libpq; a "connection" is just the
 * simlib TCP id returned by sim_open_tcp(). */
struct WalProposerConn
{
int64_t tcp;
};
/* Helper function */
/* Stub: always reports failure; the simulated transport has no
 * blocking/non-blocking distinction. NOTE(review): callers appear to
 * tolerate the false return -- confirm against walproposer.c. */
static bool
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
{
// walprop_log(LOG, "not implemented");
return false;
}
/* Exported function definitions */
/* Stub: no libpq, so there is never an error message; always NULL. */
char *
walprop_error_message(WalProposerConn *conn)
{
// walprop_log(LOG, "not implemented");
return NULL;
}
/* Stub: simulated connections are always reported as established. */
WalProposerConnStatusType
walprop_status(WalProposerConn *conn)
{
// walprop_log(LOG, "not implemented: walprop_status");
return WP_CONNECTION_OK;
}
/*
 * Open a simulated connection to a safekeeper.
 *
 * The simulator encodes the destination node id in the conninfo string as
 * "host=node port=<nodeId>"; any other format trips the Assert.
 */
WalProposerConn *
walprop_connect_start(char *conninfo)
{
WalProposerConn *conn;
walprop_log(LOG, "walprop_connect_start: %s", conninfo);
const char *connstr_prefix = "host=node port=";
Assert(strncmp(conninfo, connstr_prefix, strlen(connstr_prefix)) == 0);
int nodeId = atoi(conninfo + strlen(connstr_prefix));
conn = palloc(sizeof(WalProposerConn));
conn->tcp = sim_open_tcp(nodeId);
return conn;
}
/* Stub: polling always reports success (sim_open_tcp returned already). */
WalProposerConnectPollStatusType
walprop_connect_poll(WalProposerConn *conn)
{
// walprop_log(LOG, "not implemented: walprop_connect_poll");
return WP_CONN_POLLING_OK;
}
/* Stub: queries are not actually sent in the simulation; report success. */
bool
walprop_send_query(WalProposerConn *conn, char *query)
{
// walprop_log(LOG, "not implemented: walprop_send_query");
return true;
}
/* Stub: pretend the safekeeper immediately entered COPYBOTH mode. */
WalProposerExecStatusType
walprop_get_query_result(WalProposerConn *conn)
{
// walprop_log(LOG, "not implemented: walprop_get_query_result");
return WP_EXEC_SUCCESS_COPYBOTH;
}
/* Expose the simlib TCP id as the "socket" used for event waiting. */
pgsocket
walprop_socket(WalProposerConn *conn)
{
return (pgsocket) conn->tcp;
}
/* Stub: sends go through sim_tcp_send immediately; nothing to flush. */
int
walprop_flush(WalProposerConn *conn)
{
// walprop_log(LOG, "not implemented");
return 0;
}
/* Stub: the simulated connection is not explicitly closed or freed. */
void
walprop_finish(WalProposerConn *conn)
{
// walprop_log(LOG, "walprop_finish not implemented");
}
/*
 * Receive a message from the safekeeper.
 *
 * On success, the data is placed in *buf. It is valid until the next call
 * to this function.
 */
PGAsyncReadResult
walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
{
uintptr_t len;
char *msg;
Event event;
/* Peek first so we only consume the event when it is a Bytes message
 * destined for this connection; anything else is left queued. */
event = sim_epoll_peek(0);
if (event.tcp != conn->tcp || event.tag != Message || event.any_message != Bytes)
return PG_ASYNC_READ_TRY_AGAIN;
event = sim_epoll_rcv(0);
// walprop_log(LOG, "walprop_async_read, T: %d, tcp: %d, tag: %d", (int) event.tag, (int) event.tcp, (int) event.any_message);
/* The consumed event must match what we just peeked. */
Assert(event.tcp == conn->tcp);
Assert(event.tag == Message);
Assert(event.any_message == Bytes);
/* Points into simlib's thread-local message buffer -- hence the
 * "valid until the next call" contract above. */
msg = (char*) sim_msg_get_bytes(&len);
*buf = msg;
*amount = len;
// walprop_log(LOG, "walprop_async_read: %d", (int) len);
return PG_ASYNC_READ_SUCCESS;
}
/* Send a buffer to the safekeeper; simulated sends never block or fail. */
PGAsyncWriteResult
walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
{
// walprop_log(LOG, "walprop_async_write");
sim_msg_set_bytes(buf, size);
sim_tcp_send(conn->tcp);
return PG_ASYNC_WRITE_SUCCESS;
}
/*
 * This function is very similar to walprop_async_write. For more
 * information, refer to the comments there.
 */
bool
walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
{
// walprop_log(LOG, "walprop_blocking_write");
sim_msg_set_bytes(buf, size);
sim_tcp_send(conn->tcp);
return true;
}
/*
 * Main broadcast loop: stream every byte of WAL that becomes available
 * (tracked in sim_latest_available_lsn) to the safekeepers, starting at
 * startptr. Never returns.
 */
void
sim_start_replication(XLogRecPtr startptr)
{
walprop_log(LOG, "sim_start_replication: %X/%X", LSN_FORMAT_ARGS(startptr));
sim_latest_available_lsn = startptr;
for (;;)
{
/* NOTE(review): sim_latest_available_lsn is presumably advanced by
 * WAL-producing code while WalProposerPoll() runs -- confirm. */
XLogRecPtr endptr = sim_latest_available_lsn;
Assert(startptr <= endptr);
if (endptr > startptr)
{
WalProposerBroadcast(startptr, endptr);
startptr = endptr;
}
WalProposerPoll();
}
}
/* Usable (non-header) bytes per WAL page. */
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
/* Usable bytes per segment; the first page carries a long header. */
static int UsableBytesInSegment =
(DEFAULT_XLOG_SEG_SIZE / XLOG_BLCKSZ * UsableBytesInPage) -
(SizeOfXLogLongPHD - SizeOfXLogShortPHD);
/*
 * Converts a "usable byte position" to XLogRecPtr. A usable byte position
 * is the position starting from the beginning of WAL, excluding all WAL
 * page headers.
 *
 * Mirrors the function of the same name in PostgreSQL's xlog.c.
 */
static XLogRecPtr
XLogBytePosToRecPtr(uint64 bytepos)
{
uint64 fullsegs;
uint64 fullpages;
uint64 bytesleft;
uint32 seg_offset;
XLogRecPtr result;
fullsegs = bytepos / UsableBytesInSegment;
bytesleft = bytepos % UsableBytesInSegment;
if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
{
/* fits on first page of segment */
seg_offset = bytesleft + SizeOfXLogLongPHD;
}
else
{
/* account for the first page on segment with long header */
seg_offset = XLOG_BLCKSZ;
bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
fullpages = bytesleft / UsableBytesInPage;
bytesleft = bytesleft % UsableBytesInPage;
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
}
XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
return result;
}
/*
 * Convert an XLogRecPtr to a "usable byte position".
 *
 * Inverse of XLogBytePosToRecPtr; MyFinishInsert asserts the round trip.
 */
static uint64
XLogRecPtrToBytePos(XLogRecPtr ptr)
{
uint64 fullsegs;
uint32 fullpages;
uint32 offset;
uint64 result;
XLByteToSeg(ptr, fullsegs, wal_segment_size);
fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
offset = ptr % XLOG_BLCKSZ;
if (fullpages == 0)
{
/* Still on the segment's first page, behind the long header. */
result = fullsegs * UsableBytesInSegment;
if (offset > 0)
{
Assert(offset >= SizeOfXLogLongPHD);
result += offset - SizeOfXLogLongPHD;
}
}
else
{
result = fullsegs * UsableBytesInSegment +
(XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
(fullpages - 1) * UsableBytesInPage; /* full pages */
if (offset > 0)
{
Assert(offset >= SizeOfXLogShortPHD);
result += offset - SizeOfXLogShortPHD;
}
}
return result;
}
/* Max number of data chunks that can be registered for a single record. */
#define max_rdatas 16
void InitMyInsert();
static void MyBeginInsert();
static void MyRegisterData(char *data, int len);
static XLogRecPtr MyFinishInsert(RmgrId rmid, uint8 info, uint8 flags);
static void MyCopyXLogRecordToWAL(int write_len, XLogRecData *rdata, XLogRecPtr StartPos, XLogRecPtr EndPos);
/*
 * An array of XLogRecData structs, to hold registered data.
 */
static XLogRecData rdatas[max_rdatas];
static int num_rdatas; /* entries currently used */
static uint32 mainrdata_len; /* total # of bytes in chain */
/* Chain head that carries the record header built in hdr_scratch. */
static XLogRecData hdr_rdt;
static char hdr_scratch[16000];
/* NOTE(review): despite the names, these hold XLogRecPtr values -- see the
 * assignments in MyFinishInsert -- not "usable byte positions". */
static XLogRecPtr CurrBytePos;
static XLogRecPtr PrevBytePos;
/* Reset the mini WAL-insert machinery to start at sim_redo_start_lsn. */
void InitMyInsert()
{
CurrBytePos = sim_redo_start_lsn;
PrevBytePos = InvalidXLogRecPtr;
sim_latest_available_lsn = sim_redo_start_lsn;
}
/* Begin assembling a new record: clear the registered-data chain. */
static void MyBeginInsert()
{
num_rdatas = 0;
mainrdata_len = 0;
}
/* Append one chunk of record data to the chain; errors out past max_rdatas. */
static void MyRegisterData(char *data, int len)
{
XLogRecData *rdata;
if (num_rdatas >= max_rdatas)
walprop_log(ERROR, "too much WAL data");
rdata = &rdatas[num_rdatas++];
rdata->data = data;
rdata->len = len;
rdata->next = NULL;
/* Link onto the previous entry to keep the chain contiguous. */
if (num_rdatas > 1) {
rdatas[num_rdatas - 2].next = rdata;
}
mainrdata_len += len;
}
/*
 * Finalize the record assembled via MyBeginInsert/MyRegisterData: build the
 * XLogRecord header, CRC the record, reserve WAL space, write it out via
 * MyCopyXLogRecordToWAL, and return the record's end LSN.
 *
 * Simplified from PostgreSQL's XLogRecordAssemble +
 * ReserveXLogInsertLocation: single-threaded, no backup blocks, main data
 * only. NOTE(review): the `flags` parameter is currently unused.
 */
static XLogRecPtr
MyFinishInsert(RmgrId rmid, uint8 info, uint8 flags)
{
XLogRecData *rdt;
uint32 total_len = 0;
int block_id;
pg_crc32c rdata_crc;
XLogRecord *rechdr;
char *scratch = hdr_scratch;
int size;
XLogRecPtr StartPos;
XLogRecPtr EndPos;
uint64 startbytepos;
uint64 endbytepos;
/*
 * Note: this function can be called multiple times for the same record.
 * All the modifications we do to the rdata chains below must handle that.
 */
/* The record begins with the fixed-size header */
rechdr = (XLogRecord *) scratch;
scratch += SizeOfXLogRecord;
hdr_rdt.data = hdr_scratch;
if (num_rdatas > 0)
{
hdr_rdt.next = &rdatas[0];
}
else
{
hdr_rdt.next = NULL;
}
/* followed by main data, if any */
if (mainrdata_len > 0)
{
if (mainrdata_len > 255)
{
/* Long form: 1 tag byte + 4-byte length. */
*(scratch++) = (char) XLR_BLOCK_ID_DATA_LONG;
memcpy(scratch, &mainrdata_len, sizeof(uint32));
scratch += sizeof(uint32);
}
else
{
/* Short form: 1 tag byte + 1-byte length. */
*(scratch++) = (char) XLR_BLOCK_ID_DATA_SHORT;
*(scratch++) = (uint8) mainrdata_len;
}
total_len += mainrdata_len;
}
hdr_rdt.len = (scratch - hdr_scratch);
total_len += hdr_rdt.len;
/*
 * Calculate CRC of the data
 *
 * Note that the record header isn't added into the CRC initially since we
 * don't know the prev-link yet. Thus, the CRC will represent the CRC of
 * the whole record in the order: rdata, then backup blocks, then record
 * header.
 */
INIT_CRC32C(rdata_crc);
COMP_CRC32C(rdata_crc, hdr_scratch + SizeOfXLogRecord, hdr_rdt.len - SizeOfXLogRecord);
for (size_t i = 0; i < num_rdatas; i++)
{
rdt = &rdatas[i];
COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
}
/*
 * Fill in the fields in the record header. Prev-link is filled in later,
 * once we know where in the WAL the record will be inserted. The CRC does
 * not include the record header yet.
 */
rechdr->xl_xid = 0;
rechdr->xl_tot_len = total_len;
rechdr->xl_info = info;
rechdr->xl_rmid = rmid;
rechdr->xl_prev = InvalidXLogRecPtr;
rechdr->xl_crc = rdata_crc;
size = MAXALIGN(rechdr->xl_tot_len);
/* All (non xlog-switch) records should contain data. */
Assert(size > SizeOfXLogRecord);
startbytepos = XLogRecPtrToBytePos(CurrBytePos);
endbytepos = startbytepos + size;
// Get the position.
StartPos = XLogBytePosToRecPtr(startbytepos);
EndPos = XLogBytePosToRecPtr(startbytepos + size);
/* NOTE(review): despite the name, PrevBytePos holds the previous record's
 * StartPos (an XLogRecPtr), which is exactly what xl_prev requires. */
rechdr->xl_prev = PrevBytePos;
Assert(XLogRecPtrToBytePos(StartPos) == startbytepos);
Assert(XLogRecPtrToBytePos(EndPos) == endbytepos);
// Update global pointers (RecPtr values; see note at the declarations).
CurrBytePos = EndPos;
PrevBytePos = StartPos;
/*
 * Now that xl_prev has been filled in, calculate CRC of the record
 * header.
 */
rdata_crc = rechdr->xl_crc;
COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
FIN_CRC32C(rdata_crc);
rechdr->xl_crc = rdata_crc;
// Now write it to disk.
MyCopyXLogRecordToWAL(rechdr->xl_tot_len, &hdr_rdt, StartPos, EndPos);
return EndPos;
}
/* Free bytes left on the page containing endptr (0 when at a page start). */
#define INSERT_FREESPACE(endptr) \
(((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
/*
 * Write the record's rdata chain to WAL starting at StartPos, emitting a
 * fresh (short or long) page header whenever a page boundary is crossed.
 * Simplified from PostgreSQL's CopyXLogRecordToWAL.
 */
static void
MyCopyXLogRecordToWAL(int write_len, XLogRecData *rdata, XLogRecPtr StartPos, XLogRecPtr EndPos)
{
XLogRecPtr CurrPos;
int written;
int freespace;
// Write hdr_rdt and `num_rdatas` other datas.
CurrPos = StartPos;
freespace = INSERT_FREESPACE(CurrPos);
written = 0;
Assert(freespace >= sizeof(uint32));
while (rdata != NULL)
{
char *rdata_data = rdata->data;
int rdata_len = rdata->len;
/* Fill the current page, then start a new one, until this chunk fits. */
while (rdata_len >= freespace)
{
char header_buf[SizeOfXLogLongPHD];
XLogPageHeader NewPage = (XLogPageHeader) header_buf;
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
XLogWalPropWrite(rdata_data, freespace, CurrPos);
rdata_data += freespace;
rdata_len -= freespace;
written += freespace;
CurrPos += freespace;
// Init new page
MemSet(header_buf, 0, SizeOfXLogLongPHD);
/*
 * Fill the new page's header
 */
NewPage->xlp_magic = XLOG_PAGE_MAGIC;
/* NewPage->xlp_info = 0; */ /* done by memset */
NewPage->xlp_tli = 1;
NewPage->xlp_pageaddr = CurrPos;
/* NewPage->xlp_rem_len = 0; */ /* done by memset */
NewPage->xlp_info |= XLP_BKP_REMOVABLE;
/*
 * If first page of an XLOG segment file, make it a long header.
 */
if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
{
XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
NewLongPage->xlp_sysid = 0;
NewLongPage->xlp_seg_size = wal_segment_size;
NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
NewPage->xlp_info |= XLP_LONG_HEADER;
}
/* Bytes of this record continuing onto the new page, if any. */
NewPage->xlp_rem_len = write_len - written;
if (NewPage->xlp_rem_len > 0) {
NewPage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
}
/* skip over the page header */
if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
{
XLogWalPropWrite(header_buf, SizeOfXLogLongPHD, CurrPos);
CurrPos += SizeOfXLogLongPHD;
}
else
{
XLogWalPropWrite(header_buf, SizeOfXLogShortPHD, CurrPos);
CurrPos += SizeOfXLogShortPHD;
}
freespace = INSERT_FREESPACE(CurrPos);
}
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
XLogWalPropWrite(rdata_data, rdata_len, CurrPos);
CurrPos += rdata_len;
written += rdata_len;
freespace -= rdata_len;
rdata = rdata->next;
}
Assert(written == write_len);
/* Records are MAXALIGNed in WAL; pad out to the reserved end position. */
CurrPos = MAXALIGN64(CurrPos);
Assert(CurrPos == EndPos);
}
/*
 * Insert a small test logical-message WAL record ("prefix"/"message") and
 * return the LSN just past it. Exposed to Rust via bindgen.
 */
XLogRecPtr MyInsertRecord()
{
const char *prefix = "prefix";
const char *message = "message";
/* FIX: derive the payload length from the string instead of a hard-coded
 * 7 that would silently desync if the message text ever changes. */
size_t size = strlen(message);
bool transactional = false;
xl_logical_message xlrec;
xlrec.dbId = 0;
xlrec.transactional = transactional;
/* trailing zero is critical; see logicalmsg_desc */
xlrec.prefix_size = strlen(prefix) + 1;
xlrec.message_size = size;
MyBeginInsert();
MyRegisterData((char *) &xlrec, SizeOfLogicalMessage);
MyRegisterData(unconstify(char *, prefix), xlrec.prefix_size);
MyRegisterData(unconstify(char *, message), size);
return MyFinishInsert(RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLOG_INCLUDE_ORIGIN);
}

View File

@@ -0,0 +1,106 @@
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
/**
* List of all possible AnyMessage.
*/
enum AnyMessageTag {
None,
InternalConnect,
Just32,
ReplCell,
Bytes,
LSN,
};
typedef uint8_t AnyMessageTag;
/**
* List of all possible NodeEvent.
*/
enum EventTag {
Timeout,
Accept,
Closed,
Message,
Internal,
};
typedef uint8_t EventTag;
/**
* Event returned by epoll_recv.
*/
typedef struct Event {
EventTag tag;
int64_t tcp;
AnyMessageTag any_message;
} Event;
void rust_function(uint32_t a);
/**
* C API for the node os.
*/
void sim_sleep(uint64_t ms);
uint64_t sim_random(uint64_t max);
uint32_t sim_id(void);
int64_t sim_open_tcp(uint32_t dst);
int64_t sim_open_tcp_nopoll(uint32_t dst);
/**
* Send MESSAGE_BUF content to the given tcp.
*/
void sim_tcp_send(int64_t tcp);
/**
* Receive a message from the given tcp. Can be used only with tcp opened with
* `sim_open_tcp_nopoll`.
*/
struct Event sim_tcp_recv(int64_t tcp);
struct Event sim_epoll_rcv(int64_t timeout);
struct Event sim_epoll_peek(int64_t timeout);
int64_t sim_now(void);
void sim_exit(int32_t code, const uint8_t *msg);
void sim_set_result(int32_t code, const uint8_t *msg);
void sim_log_event(const int8_t *msg);
/**
* Get tag of the current message.
*/
AnyMessageTag sim_msg_tag(void);
/**
* Read AnyMessage::Just32 message.
*/
void sim_msg_get_just_u32(uint32_t *val);
/**
* Read AnyMessage::LSN message.
*/
void sim_msg_get_lsn(uint64_t *val);
/**
* Write AnyMessage::ReplCell message.
*/
void sim_msg_set_repl_cell(uint32_t value, uint32_t client_id, uint32_t seqno);
/**
* Write AnyMessage::Bytes message.
*/
void sim_msg_set_bytes(const char *bytes, uintptr_t len);
/**
* Read AnyMessage::Bytes message.
*/
const char *sim_msg_get_bytes(uintptr_t *len);

View File

@@ -0,0 +1,36 @@
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
use safekeeper::simlib::node_os::NodeOs;
use tracing::info;
pub mod bindings {
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
}
/// Smoke-test callable: verifies C code can invoke Rust and that tracing
/// output is wired up (logs the greeting and the argument).
#[no_mangle]
pub extern "C" fn rust_function(a: u32) {
    info!("Hello from Rust!");
    info!("a: {}", a);
}
pub mod sim;
pub mod sim_proto;
#[cfg(test)]
mod test;
#[cfg(test)]
pub mod simtest;
/// Build the per-node context hook for the simulator: when invoked with a
/// node's `NodeOs`, it attaches that OS handle to the current thread and
/// then initializes the Postgres C globals before any other C code runs.
pub fn c_context() -> Option<Box<dyn Fn(NodeOs) + Send + Sync>> {
    let attach = |os: NodeOs| {
        sim::c_attach_node_os(os);
        // SAFETY: FFI into the C initializer; the NodeOs for this thread
        // was attached on the line above, as the C side expects.
        unsafe {
            bindings::MyContextInit();
        }
    };
    Some(Box::new(attach))
}
/// Turn on verbose logging on the C side by setting its `debug_enabled`
/// global (exposed through the bindgen bindings).
pub fn enable_debug() {
    // Writes a C global; single mutable flag, no synchronization in C.
    unsafe { bindings::debug_enabled = true; }
}

240
libs/walproposer/src/sim.rs Normal file
View File

@@ -0,0 +1,240 @@
use log::debug;
use safekeeper::simlib::{network::TCP, node_os::NodeOs, world::NodeEvent};
use std::{
cell::RefCell,
collections::HashMap,
ffi::{CStr, CString},
};
use tracing::trace;
use crate::sim_proto::{anymessage_tag, AnyMessageTag, Event, EventTag, MESSAGE_BUF};
thread_local! {
    // NodeOs of the simulated node currently running on this thread;
    // installed by c_attach_node_os and reached from the sim_* C wrappers.
    static CURRENT_NODE_OS: RefCell<Option<NodeOs>> = RefCell::new(None);
    // Maps the i64 ids handed out to C code back to live TCP handles.
    static TCP_CACHE: RefCell<HashMap<i64, TCP>> = RefCell::new(HashMap::new());
}
/// Get the current node os.
///
/// Panics if `c_attach_node_os` has not been called on this thread yet.
fn os() -> NodeOs {
    CURRENT_NODE_OS.with(|cell| cell.borrow().clone().expect("no node os set"))
}
/// Register `tcp` in the thread-local cache and return its numeric id,
/// which C code can later hand back to `tcp_load`.
fn tcp_save(tcp: TCP) -> i64 {
    let id = tcp.id();
    TCP_CACHE.with(|cell| {
        cell.borrow_mut().insert(id, tcp);
    });
    id
}
/// Look a TCP handle up in the thread-local cache; panics on unknown ids.
fn tcp_load(id: i64) -> TCP {
    TCP_CACHE.with(|cell| match cell.borrow().get(&id) {
        Some(tcp) => tcp.clone(),
        None => panic!("unknown TCP id"),
    })
}
/// Should be called before calling any of the C functions.
/// Resets both thread-locals so a reused thread starts from a clean slate.
pub(crate) fn c_attach_node_os(os: NodeOs) {
    CURRENT_NODE_OS.with(|cell| {
        cell.replace(Some(os));
    });
    TCP_CACHE.with(|cell| {
        cell.replace(HashMap::new());
    });
}
/// C API for the node os.
///
/// Each function below operates on the thread-local `NodeOs` attached via
/// `c_attach_node_os`, and panics if no node os is set for this thread.
#[no_mangle]
pub extern "C" fn sim_sleep(ms: u64) {
    os().sleep(ms);
}
/// Draw a number from the deterministic simulation RNG
/// (presumably in `0..max` — confirm in simlib).
#[no_mangle]
pub extern "C" fn sim_random(max: u64) -> u64 {
    os().random(max)
}
/// Id of the node running on this thread.
#[no_mangle]
pub extern "C" fn sim_id() -> u32 {
    os().id().into()
}
/// Open a TCP connection to node `dst`; returns a handle for sim_tcp_send.
#[no_mangle]
pub extern "C" fn sim_open_tcp(dst: u32) -> i64 {
    tcp_save(os().open_tcp(dst.into()))
}
/// Like `sim_open_tcp`, but delivery bypasses epoll: use `sim_tcp_recv`
/// on the returned handle instead.
#[no_mangle]
pub extern "C" fn sim_open_tcp_nopoll(dst: u32) -> i64 {
    tcp_save(os().open_tcp_nopoll(dst.into()))
}
#[no_mangle]
/// Send MESSAGE_BUF content to the given tcp.
pub extern "C" fn sim_tcp_send(tcp: i64) {
    let msg = MESSAGE_BUF.with(|cell| cell.borrow().clone());
    tcp_load(tcp).send(msg);
}
#[no_mangle]
/// Receive a message from the given tcp. Can be used only with tcp opened with
/// `sim_open_tcp_nopoll`.
pub extern "C" fn sim_tcp_recv(tcp: i64) -> Event {
    match tcp_load(tcp).recv() {
        NodeEvent::Closed(_) => Event {
            tag: EventTag::Closed,
            tcp: 0,
            any_message: AnyMessageTag::None,
        },
        NodeEvent::Message((message, _)) => {
            let any_message = anymessage_tag(&message);
            // Stash the payload so C code can read it via the sim_msg_*
            // accessors.
            MESSAGE_BUF.with(|cell| {
                cell.replace(message);
            });
            Event {
                tag: EventTag::Message,
                tcp: 0,
                any_message,
            }
        }
        // Accept/Internal/WakeTimeout are never delivered on a nopoll TCP.
        NodeEvent::Accept(_) | NodeEvent::Internal(_) | NodeEvent::WakeTimeout(_) => {
            unreachable!()
        }
    }
}
#[no_mangle]
/// Wait up to `timeout` for the next node event and translate it into the
/// C-visible `Event` struct. Message payloads are stashed in MESSAGE_BUF
/// for the sim_msg_* accessors.
pub extern "C" fn sim_epoll_rcv(timeout: i64) -> Event {
    let event = match os().epoll_recv(timeout) {
        Some(event) => event,
        None => {
            return Event {
                tag: EventTag::Timeout,
                tcp: 0,
                any_message: AnyMessageTag::None,
            }
        }
    };
    match event {
        NodeEvent::Accept(tcp) => Event {
            tag: EventTag::Accept,
            tcp: tcp_save(tcp),
            any_message: AnyMessageTag::None,
        },
        NodeEvent::Closed(tcp) => Event {
            tag: EventTag::Closed,
            tcp: tcp_save(tcp),
            any_message: AnyMessageTag::None,
        },
        NodeEvent::Message((message, tcp)) => {
            let any_message = anymessage_tag(&message);
            // Stash for sim_msg_* before handing the tag back to C.
            MESSAGE_BUF.with(|cell| {
                cell.replace(message);
            });
            Event {
                tag: EventTag::Message,
                tcp: tcp_save(tcp),
                any_message,
            }
        }
        NodeEvent::Internal(message) => {
            let any_message = anymessage_tag(&message);
            MESSAGE_BUF.with(|cell| {
                cell.replace(message);
            });
            Event {
                tag: EventTag::Internal,
                tcp: 0,
                any_message,
            }
        }
        // Timers are not delivered through epoll in the simulation.
        NodeEvent::WakeTimeout(_) => unreachable!(),
    }
}
#[no_mangle]
/// Like `sim_epoll_rcv`, but does not copy a message payload into
/// MESSAGE_BUF — only the event tag/handle are reported.
pub extern "C" fn sim_epoll_peek(timeout: i64) -> Event {
    let event = match os().epoll_peek(timeout) {
        Some(event) => event,
        None => {
            return Event {
                tag: EventTag::Timeout,
                tcp: 0,
                any_message: AnyMessageTag::None,
            }
        }
    };
    match event {
        NodeEvent::Accept(tcp) => Event {
            tag: EventTag::Accept,
            tcp: tcp_save(tcp),
            any_message: AnyMessageTag::None,
        },
        NodeEvent::Closed(tcp) => Event {
            tag: EventTag::Closed,
            tcp: tcp_save(tcp),
            any_message: AnyMessageTag::None,
        },
        NodeEvent::Message((message, tcp)) => Event {
            tag: EventTag::Message,
            tcp: tcp_save(tcp),
            any_message: anymessage_tag(&message),
        },
        NodeEvent::Internal(message) => Event {
            tag: EventTag::Internal,
            tcp: 0,
            any_message: anymessage_tag(&message),
        },
        // Timers are not delivered through epoll in the simulation.
        NodeEvent::WakeTimeout(_) => unreachable!(),
    }
}
#[no_mangle]
/// Current simulation time (same unit as `sim_sleep`, presumably ms).
// NOTE(review): `as i64` would wrap if now() ever exceeded i64::MAX —
// harmless for simulated time, but confirm.
pub extern "C" fn sim_now() -> i64 {
    os().now() as i64
}
#[no_mangle]
/// Terminate the current simulated node, recording `(code, msg)` as its
/// result. Does not return: unwinds the node thread via `panic!`.
pub extern "C" fn sim_exit(code: i32, msg: *const u8) {
    trace!("sim_exit({}, {:?})", code, msg);
    sim_set_result(code, msg);
    // I tried to make use of pthread_exit, but it doesn't work.
    // https://github.com/rust-lang/unsafe-code-guidelines/issues/211
    // unsafe { libc::pthread_exit(std::ptr::null_mut()) };
    // https://doc.rust-lang.org/nomicon/unwinding.html
    // Everyone on the internet saying this is UB, but it works for me,
    // so I'm going to use it for now.
    // NOTE(review): unwinding across the C frames that called this extern
    // "C" fn is the UB being discussed above — confirm the C side is built
    // to tolerate it, or consider an explicit status-flag return path.
    panic!("sim_exit() called from C code")
}
#[no_mangle]
/// Store the node's simulation result (exit code + message string).
///
/// `msg` must be a valid NUL-terminated C string; it is copied, so the
/// caller keeps ownership of the buffer.
pub extern "C" fn sim_set_result(code: i32, msg: *const u8) {
    // SAFETY: the C caller guarantees `msg` points to a NUL-terminated
    // string. `.cast()` targets `*const c_char` directly instead of the
    // previous `as *const i8`, which assumed `c_char == i8` and breaks the
    // build on targets where `c_char` is `u8` (e.g. aarch64-linux).
    let msg = unsafe { CStr::from_ptr(msg.cast()) };
    let msg = msg.to_string_lossy().into_owned();
    debug!("sim_set_result({}, {:?})", code, msg);
    os().set_result(code, msg);
}
#[no_mangle]
/// Record a log event string for the simulation validator.
///
/// `msg` must be a valid NUL-terminated C string; it is copied.
pub extern "C" fn sim_log_event(msg: *const i8) {
    // SAFETY: the C caller guarantees `msg` is a NUL-terminated string.
    // `.cast()` keeps this portable to targets where `c_char` is `u8`
    // (the previous direct `from_ptr(msg)` only compiled when c_char == i8).
    let msg = unsafe { CStr::from_ptr(msg.cast()) };
    let msg = msg.to_string_lossy().into_owned();
    debug!("sim_log_event({:?})", msg);
    os().log_event(msg);
}

View File

@@ -0,0 +1,114 @@
use safekeeper::simlib::proto::{AnyMessage, ReplCell};
use std::{cell::RefCell, ffi::c_char};
/// Map an `AnyMessage` to its C-visible tag. The match is exhaustive on
/// purpose: adding an `AnyMessage` variant must force an update here.
pub(crate) fn anymessage_tag(msg: &AnyMessage) -> AnyMessageTag {
    match msg {
        AnyMessage::None => AnyMessageTag::None,
        AnyMessage::InternalConnect => AnyMessageTag::InternalConnect,
        AnyMessage::Just32(_) => AnyMessageTag::Just32,
        AnyMessage::ReplCell(_) => AnyMessageTag::ReplCell,
        AnyMessage::Bytes(_) => AnyMessageTag::Bytes,
        AnyMessage::LSN(_) => AnyMessageTag::LSN,
    }
}
thread_local! {
    /// One-message staging buffer shared between Rust and C: received
    /// payloads are stored here and read/written via the sim_msg_* functions.
    pub static MESSAGE_BUF: RefCell<AnyMessage> = RefCell::new(AnyMessage::None);
}
#[no_mangle]
/// Get tag of the current message.
pub extern "C" fn sim_msg_tag() -> AnyMessageTag {
    MESSAGE_BUF.with(|cell| {
        let msg = cell.borrow();
        anymessage_tag(&msg)
    })
}
#[no_mangle]
/// Read AnyMessage::Just32 message; panics if the buffer holds another tag.
pub extern "C" fn sim_msg_get_just_u32(val: &mut u32) {
    MESSAGE_BUF.with(|cell| {
        if let AnyMessage::Just32(v) = &*cell.borrow() {
            *val = *v;
        } else {
            panic!("expected Just32 message");
        }
    });
}
#[no_mangle]
/// Read AnyMessage::LSN message; panics if the buffer holds another tag.
pub extern "C" fn sim_msg_get_lsn(val: &mut u64) {
    MESSAGE_BUF.with(|cell| {
        if let AnyMessage::LSN(v) = &*cell.borrow() {
            *val = *v;
        } else {
            panic!("expected LSN message");
        }
    });
}
#[no_mangle]
/// Write AnyMessage::ReplCell message into the staging buffer.
pub extern "C" fn sim_msg_set_repl_cell(value: u32, client_id: u32, seqno: u32) {
    let msg = AnyMessage::ReplCell(ReplCell {
        value,
        client_id,
        seqno,
    });
    MESSAGE_BUF.with(|cell| {
        cell.replace(msg);
    });
}
#[no_mangle]
/// Write AnyMessage::Bytes message: copy `len` bytes from `bytes` into the
/// thread-local staging buffer.
pub extern "C" fn sim_msg_set_bytes(bytes: *const c_char, len: usize) {
    // Guard len == 0 so a null/dangling pointer from C is never touched.
    let v: Vec<u8> = if len == 0 {
        Vec::new()
    } else {
        // SAFETY: the C caller guarantees `bytes` points to at least `len`
        // readable bytes. Building the Vec from a slice replaces the old
        // `set_len`-before-copy pattern, which created a Vec whose contents
        // were uninitialized memory (undefined behavior) before the copy.
        unsafe { std::slice::from_raw_parts(bytes.cast::<u8>(), len) }.to_vec()
    };
    MESSAGE_BUF.with(|cell| {
        *cell.borrow_mut() = AnyMessage::Bytes(v.into());
    });
}
#[no_mangle]
/// Read AnyMessage::Bytes message: write the payload length to `len` and
/// return a pointer to the bytes.
///
/// The returned pointer borrows the thread-local message buffer — it is
/// valid only until the buffer is next overwritten (another received
/// message or a sim_msg_set_* call). Panics on tag mismatch.
pub extern "C" fn sim_msg_get_bytes(len: *mut usize) -> *const c_char {
    MESSAGE_BUF.with(|cell| match &*cell.borrow() {
        AnyMessage::Bytes(v) => {
            // SAFETY: the C caller guarantees `len` is a valid out-pointer.
            unsafe { *len = v.len() };
            // `as *const c_char` instead of `as *const i8`: stays correct on
            // targets where c_char is unsigned.
            v.as_ptr() as *const c_char
        }
        _ => panic!("expected Bytes message"),
    })
}
#[repr(C)]
/// Event returned by epoll_recv.
pub struct Event {
    pub tag: EventTag,
    /// TCP handle (id in the thread-local TCP cache); 0 for events without
    /// an associated connection.
    pub tcp: i64,
    /// Tag of the payload placed in MESSAGE_BUF (Message/Internal events).
    pub any_message: AnyMessageTag,
}
#[repr(u8)]
/// List of all possible NodeEvent.
pub enum EventTag {
    /// The wait ended without an event.
    Timeout,
    Accept,
    Closed,
    Message,
    Internal,
}
#[repr(u8)]
/// List of all possible AnyMessage.
/// Must stay in sync with `anymessage_tag`.
pub enum AnyMessageTag {
    None,
    InternalConnect,
    Just32,
    ReplCell,
    Bytes,
    LSN,
}

View File

@@ -0,0 +1,88 @@
use std::collections::HashMap;
use std::sync::Arc;
use safekeeper::safekeeper::SafeKeeperState;
use safekeeper::simlib::sync::Mutex;
use utils::id::TenantTimelineId;
/// Per-node virtual disk: persistent state for every timeline this
/// safekeeper knows about.
pub struct Disk {
    pub timelines: Mutex<HashMap<TenantTimelineId, Arc<TimelineDisk>>>,
}

impl Disk {
    /// Create an empty disk with no timelines.
    pub fn new() -> Self {
        Disk {
            timelines: Mutex::new(HashMap::new()),
        }
    }

    /// Store `state` for `ttid`, creating the timeline (with empty WAL) on
    /// first use, and return its disk handle.
    pub fn put_state(&self, ttid: &TenantTimelineId, state: SafeKeeperState) -> Arc<TimelineDisk> {
        let mut timelines = self.timelines.lock();
        if let Some(existing) = timelines.get(ttid) {
            // Timeline already exists: only replace its control-file state.
            *existing.state.lock() = state;
            return existing.clone();
        }
        let created = Arc::new(TimelineDisk {
            state: Mutex::new(state),
            wal: Mutex::new(BlockStorage::new()),
        });
        timelines.insert(ttid.clone(), created.clone());
        created
    }
}
/// Persistent state of a single timeline: control file plus WAL, each
/// behind its own lock so they can be accessed independently.
pub struct TimelineDisk {
    pub state: Mutex<SafeKeeperState>,
    pub wal: Mutex<BlockStorage>,
}
/// Size of one storage block, in bytes.
const BLOCK_SIZE: usize = 8192;

/// Sparse, block-addressed in-memory byte store used as a virtual disk.
///
/// Only blocks that have been written are allocated; unwritten regions
/// read back as zeroes, like a freshly zeroed file.
pub struct BlockStorage {
    blocks: HashMap<u64, [u8; BLOCK_SIZE]>,
}

// `Default` so the type composes with derives and `..Default::default()`
// (clippy: new_without_default).
impl Default for BlockStorage {
    fn default() -> Self {
        Self::new()
    }
}

impl BlockStorage {
    /// Create an empty storage; every position reads as zero.
    pub fn new() -> Self {
        BlockStorage {
            blocks: HashMap::new(),
        }
    }

    /// Fill `buf` with the bytes stored at `pos..pos + buf.len()`.
    /// Holes (never-written blocks) read as zeroes.
    pub fn read(&self, pos: u64, buf: &mut [u8]) {
        // Shared zero block for holes; avoids materializing one per miss.
        const ZEROES: [u8; BLOCK_SIZE] = [0; BLOCK_SIZE];
        let mut buf_offset = 0;
        let mut storage_pos = pos;
        while buf_offset < buf.len() {
            let block_id = storage_pos / BLOCK_SIZE as u64;
            let block = self.blocks.get(&block_id).unwrap_or(&ZEROES);
            let block_offset = (storage_pos % BLOCK_SIZE as u64) as usize;
            // Copy to the end of the current block or of `buf`, whichever
            // comes first.
            let copy_len = std::cmp::min(BLOCK_SIZE - block_offset, buf.len() - buf_offset);
            buf[buf_offset..buf_offset + copy_len]
                .copy_from_slice(&block[block_offset..block_offset + copy_len]);
            buf_offset += copy_len;
            storage_pos += copy_len as u64;
        }
    }

    /// Write `buf` at `pos`, allocating (zero-filled) any block it touches
    /// for the first time.
    pub fn write(&mut self, pos: u64, buf: &[u8]) {
        let mut buf_offset = 0;
        let mut storage_pos = pos;
        while buf_offset < buf.len() {
            let block_id = storage_pos / BLOCK_SIZE as u64;
            let block = self.blocks.entry(block_id).or_insert([0; BLOCK_SIZE]);
            let block_offset = (storage_pos % BLOCK_SIZE as u64) as usize;
            let copy_len = std::cmp::min(BLOCK_SIZE - block_offset, buf.len() - buf_offset);
            block[block_offset..block_offset + copy_len]
                .copy_from_slice(&buf[buf_offset..buf_offset + copy_len]);
            buf_offset += copy_len;
            storage_pos += copy_len as u64;
        }
    }
}

View File

@@ -0,0 +1,61 @@
use std::{sync::Arc, fmt};
use safekeeper::simlib::{world::World, sync::Mutex};
use tracing_subscriber::fmt::{time::FormatTime, format::Writer};
use utils::logging;
use crate::bindings;
/// Log-timestamp source that reads the current simulation time from an
/// optionally attached `World`. Clones share the same world pointer.
#[derive(Clone)]
pub struct SimClock {
    // None until set_world() is called.
    world_ptr: Arc<Mutex<Option<Arc<World>>>>,
}
impl Default for SimClock {
    /// Clock with no world attached; it renders "[?]" until `set_world`.
    fn default() -> Self {
        let world_ptr = Arc::new(Mutex::new(None));
        SimClock { world_ptr }
    }
}
impl SimClock {
    /// Attach the world whose virtual time will be used for log timestamps.
    pub fn set_world(&self, world: Arc<World>) {
        *self.world_ptr.lock() = Some(world);
    }
}
impl FormatTime for SimClock {
    /// Render the simulation time as "[<now>]", or "[?]" before a world is
    /// attached.
    fn format_time(&self, w: &mut Writer<'_>) -> fmt::Result {
        // Clone out of the lock first so the guard is released before writing.
        let world = self.world_ptr.lock().clone();
        match world {
            Some(world) => write!(w, "[{}]", world.now()),
            None => write!(w, "[?]"),
        }
    }
}
/// Install the global tracing subscriber with simulation-time timestamps.
/// Returns the clock so a `World` can be attached later via `set_world`.
pub fn init_logger() -> SimClock {
    // debug_enabled is a C-side global (see enable_debug()).
    let debug_enabled = unsafe { bindings::debug_enabled };
    let clock = SimClock::default();
    let base_logger = tracing_subscriber::fmt()
        .with_target(false)
        .with_timer(clock.clone())
        .with_ansi(true)
        .with_max_level(match debug_enabled {
            true => tracing::Level::DEBUG,
            false => tracing::Level::INFO,
        })
        .with_writer(std::io::stdout);
    base_logger.init();
    // logging::replace_panic_hook_with_tracing_panic_hook().forget();
    // Suppress default panic output — presumably because nodes terminate via
    // intentional panics (see sim_exit), which would otherwise flood the log.
    // NOTE(review): this also hides backtraces of genuine bugs — confirm.
    std::panic::set_hook(Box::new(|_| {}));
    clock
}

View File

@@ -0,0 +1,11 @@
#[cfg(test)]
pub mod simple_client;
#[cfg(test)]
pub mod wp_sk;
pub mod disk;
pub mod safekeeper;
pub mod storage;
pub mod log;
pub mod util;

View File

@@ -0,0 +1,372 @@
//! Safekeeper communication endpoint to WAL proposer (compute node).
//! Gets messages from the network, passes them down to consensus module and
//! sends replies back.
use std::{collections::HashMap, path::PathBuf, sync::Arc, time::Duration};
use anyhow::{anyhow, bail, Result};
use bytes::{Bytes, BytesMut};
use hyper::Uri;
use log::info;
use safekeeper::{
safekeeper::{
ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, ServerInfo, UNKNOWN_SERVER_VERSION,
},
simlib::{network::TCP, node_os::NodeOs, proto::AnyMessage, world::NodeEvent},
timeline::TimelineError,
SafeKeeperConf, wal_storage::Storage,
};
use tracing::{debug, info_span};
use utils::{
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
lsn::Lsn,
};
use crate::simtest::storage::DiskStateStorage;
use super::{
disk::{Disk, TimelineDisk},
storage::DiskWALStorage,
};
/// State of one incoming walproposer connection.
struct ConnState {
    tcp: TCP,
    /// True once the Greeting message has been processed.
    greeting: bool,
    /// Timeline this connection talks to; set from the greeting.
    ttid: TenantTimelineId,
    /// True when an AppendRequest was processed and a FlushWAL is still owed.
    flush_pending: bool,
}
/// Per-timeline state shared between connections.
struct SharedState {
    sk: SafeKeeper<DiskStateStorage, DiskWALStorage>,
    disk: Arc<TimelineDisk>,
}
/// All timelines served by this simulated safekeeper process.
struct GlobalMap {
    timelines: HashMap<TenantTimelineId, SharedState>,
    conf: SafeKeeperConf,
    disk: Arc<Disk>,
}
impl GlobalMap {
fn new(disk: Arc<Disk>, conf: SafeKeeperConf) -> Result<Self> {
let mut timelines = HashMap::new();
for (&ttid, disk) in disk.timelines.lock().iter() {
debug!("loading timeline {}", ttid);
let state = disk.state.lock().clone();
if state.server.wal_seg_size == 0 {
bail!(TimelineError::UninitializedWalSegSize(ttid));
}
if state.server.pg_version == UNKNOWN_SERVER_VERSION {
bail!(TimelineError::UninitialinzedPgVersion(ttid));
}
if state.commit_lsn < state.local_start_lsn {
bail!(
"commit_lsn {} is higher than local_start_lsn {}",
state.commit_lsn,
state.local_start_lsn
);
}
let control_store = DiskStateStorage::new(disk.clone());
let wal_store = DiskWALStorage::new(disk.clone(), &control_store)?;
let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
timelines.insert(
ttid.clone(),
SharedState {
sk,
disk: disk.clone(),
},
);
}
Ok(Self {
timelines,
conf,
disk,
})
}
fn create(&mut self, ttid: TenantTimelineId, server_info: ServerInfo) -> Result<()> {
if self.timelines.contains_key(&ttid) {
bail!("timeline {} already exists", ttid);
}
debug!("creating new timeline {}", ttid);
let commit_lsn = Lsn::INVALID;
let local_start_lsn = Lsn::INVALID;
// TODO: load state from in-memory storage
let state = SafeKeeperState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
if state.server.wal_seg_size == 0 {
bail!(TimelineError::UninitializedWalSegSize(ttid));
}
if state.server.pg_version == UNKNOWN_SERVER_VERSION {
bail!(TimelineError::UninitialinzedPgVersion(ttid));
}
if state.commit_lsn < state.local_start_lsn {
bail!(
"commit_lsn {} is higher than local_start_lsn {}",
state.commit_lsn,
state.local_start_lsn
);
}
let disk_timeline = self.disk.put_state(&ttid, state);
let control_store = DiskStateStorage::new(disk_timeline.clone());
let wal_store = DiskWALStorage::new(disk_timeline.clone(), &control_store)?;
let sk = SafeKeeper::new(control_store, wal_store, self.conf.my_id)?;
self.timelines.insert(
ttid.clone(),
SharedState {
sk,
disk: disk_timeline,
},
);
Ok(())
}
fn get(&mut self, ttid: &TenantTimelineId) -> &mut SharedState {
self.timelines.get_mut(ttid).expect("timeline must exist")
}
fn has_tli(&self, ttid: &TenantTimelineId) -> bool {
self.timelines.contains_key(ttid)
}
}
/// Entry point of a simulated safekeeper node: load all timelines from
/// `disk`, then serve walproposer connections until the node is stopped
/// (the loop never exits normally).
pub fn run_server(os: NodeOs, disk: Arc<Disk>) -> Result<()> {
    let _enter = info_span!("safekeeper", id = os.id()).entered();
    debug!("started server");
    os.log_event("started;safekeeper".to_owned());
    // Most fields are irrelevant in simulation; only my_id matters here.
    let conf = SafeKeeperConf {
        workdir: PathBuf::from("."),
        my_id: NodeId(os.id() as u64),
        listen_pg_addr: String::new(),
        listen_http_addr: String::new(),
        no_sync: false,
        broker_endpoint: "/".parse::<Uri>().unwrap(),
        broker_keepalive_interval: Duration::from_secs(0),
        heartbeat_timeout: Duration::from_secs(0),
        remote_storage: None,
        max_offloader_lag_bytes: 0,
        backup_runtime_threads: None,
        wal_backup_enabled: false,
        auth: None,
    };
    let mut global = GlobalMap::new(disk, conf.clone())?;
    // Active connections keyed by TCP id.
    let mut conns: HashMap<i64, ConnState> = HashMap::new();
    // Announce loaded timelines for log validation.
    for (&ttid, shared_state) in global.timelines.iter_mut() {
        let flush_lsn = shared_state.sk.wal_store.flush_lsn();
        let commit_lsn = shared_state.sk.state.commit_lsn;
        os.log_event(format!("tli_loaded;{};{}", flush_lsn.0, commit_lsn.0));
    }
    let epoll = os.epoll();
    loop {
        // waiting for the next message
        let mut next_event = Some(epoll.recv());
        // Drain every event available this tick before flushing.
        loop {
            let event = match next_event {
                Some(event) => event,
                None => break,
            };
            match event {
                NodeEvent::Accept(tcp) => {
                    conns.insert(
                        tcp.id(),
                        ConnState {
                            tcp,
                            greeting: false,
                            ttid: TenantTimelineId::empty(),
                            flush_pending: false,
                        },
                    );
                }
                NodeEvent::Message((msg, tcp)) => {
                    let conn = conns.get_mut(&tcp.id());
                    if let Some(conn) = conn {
                        let res = conn.process_any(msg, &mut global);
                        if res.is_err() {
                            // Drop the connection on any protocol error;
                            // ConnState::drop closes the TCP.
                            debug!("conn {:?} error: {:#}", tcp, res.unwrap_err());
                            conns.remove(&tcp.id());
                        }
                    } else {
                        debug!("conn {:?} was closed, dropping msg {:?}", tcp, msg);
                    }
                }
                NodeEvent::Internal(_) => {}
                NodeEvent::Closed(_) => {}
                NodeEvent::WakeTimeout(_) => {}
            }
            // TODO: make simulator support multiple events per tick
            next_event = epoll.try_recv();
        }
        // Issue pending FlushWALs on every connection; drop ones that fail.
        conns.retain(|_, conn| {
            let res = conn.flush(&mut global);
            if res.is_err() {
                debug!("conn {:?} error: {:?}", conn.tcp, res);
            }
            res.is_ok()
        });
    }
}
impl ConnState {
    /// Dispatch one raw network message: either a START_REPLICATION request
    /// (before the greeting) or a serialized ProposerAcceptorMessage.
    fn process_any(&mut self, any: AnyMessage, global: &mut GlobalMap) -> Result<()> {
        if let AnyMessage::Bytes(copy_data) = any {
            let repl_prefix = b"START_REPLICATION ";
            if !self.greeting && copy_data.starts_with(repl_prefix) {
                self.process_start_replication(copy_data.slice(repl_prefix.len()..), global)?;
                // bail on purpose: replication is one-shot, so the caller
                // drops (and thereby closes) this connection.
                bail!("finished processing START_REPLICATION")
            }
            let msg = ProposerAcceptorMessage::parse(copy_data)?;
            debug!("got msg: {:?}", msg);
            return self.process(msg, global);
        } else {
            bail!("unexpected message, expected AnyMessage::Bytes");
        }
    }

    /// Serve a one-shot replication request: read `[start_lsn, end_lsn)`
    /// from the virtual WAL and send it back as a single Bytes message.
    fn process_start_replication(
        &mut self,
        copy_data: Bytes,
        global: &mut GlobalMap,
    ) -> Result<()> {
        // format is "<tenant_id> <timeline_id> <start_lsn> <end_lsn>"
        let str = String::from_utf8(copy_data.to_vec())?;
        let mut parts = str.split(' ');
        let tenant_id = parts.next().unwrap().parse::<TenantId>()?;
        let timeline_id = parts.next().unwrap().parse::<TimelineId>()?;
        let start_lsn = parts.next().unwrap().parse::<u64>()?;
        let end_lsn = parts.next().unwrap().parse::<u64>()?;
        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
        let shared_state = global.get(&ttid);
        // read bytes from start_lsn to end_lsn
        let mut buf = vec![0; (end_lsn - start_lsn) as usize];
        shared_state.disk.wal.lock().read(start_lsn, &mut buf);
        // send bytes to the client
        self.tcp.send(AnyMessage::Bytes(Bytes::from(buf)));
        Ok(())
    }

    /// Bind this connection to `ttid`, creating the timeline if needed.
    fn init_timeline(
        &mut self,
        ttid: TenantTimelineId,
        server_info: ServerInfo,
        global: &mut GlobalMap,
    ) -> Result<()> {
        self.ttid = ttid;
        if global.has_tli(&ttid) {
            return Ok(());
        }
        global.create(ttid, server_info)
    }

    /// Handle one parsed walproposer message. The first message must be a
    /// Greeting; after that, messages are fed to the SafeKeeper, with
    /// AppendRequest fsyncs deferred to `flush()`.
    fn process(&mut self, msg: ProposerAcceptorMessage, global: &mut GlobalMap) -> Result<()> {
        if !self.greeting {
            self.greeting = true;
            match msg {
                ProposerAcceptorMessage::Greeting(ref greeting) => {
                    debug!(
                        "start handshake with walproposer {:?}",
                        self.tcp,
                    );
                    let server_info = ServerInfo {
                        pg_version: greeting.pg_version,
                        system_id: greeting.system_id,
                        wal_seg_size: greeting.wal_seg_size,
                    };
                    let ttid = TenantTimelineId::new(greeting.tenant_id, greeting.timeline_id);
                    self.init_timeline(ttid, server_info, global)?
                }
                _ => {
                    bail!("unexpected message {msg:?} instead of greeting");
                }
            }
        }
        let tli = global.get(&self.ttid);
        match msg {
            ProposerAcceptorMessage::AppendRequest(append_request) => {
                // Convert to NoFlushAppendRequest and remember that a single
                // FlushWAL is owed — batches the fsync per server tick.
                self.flush_pending = true;
                self.process_sk_msg(
                    tli,
                    &ProposerAcceptorMessage::NoFlushAppendRequest(append_request),
                )?;
            }
            other => {
                self.process_sk_msg(tli, &other)?;
            }
        }
        Ok(())
    }

    /// Process FlushWAL if needed.
    // TODO: add extra flushes, to verify that extra flushes don't break anything
    fn flush(&mut self, global: &mut GlobalMap) -> Result<()> {
        if !self.flush_pending {
            return Ok(());
        }
        self.flush_pending = false;
        let shared_state = global.get(&self.ttid);
        self.process_sk_msg(shared_state, &ProposerAcceptorMessage::FlushWAL)
    }

    /// Make safekeeper process a message and send a reply to the TCP
    fn process_sk_msg(
        &mut self,
        shared_state: &mut SharedState,
        msg: &ProposerAcceptorMessage,
    ) -> Result<()> {
        let mut reply = shared_state.sk.process_msg(msg)?;
        if let Some(reply) = &mut reply {
            // // if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn
            // if let AcceptorProposerMessage::AppendResponse(ref mut resp) = reply {
            //     // TODO:
            // }
            let mut buf = BytesMut::with_capacity(128);
            reply.serialize(&mut buf)?;
            self.tcp.send(AnyMessage::Bytes(buf.into()));
        }
        Ok(())
    }
}
impl Drop for ConnState {
    /// Close the TCP on normal drop. Skipped while panicking — presumably
    /// because the world may already be mid-teardown; confirm.
    fn drop(&mut self) {
        debug!("dropping conn: {:?}", self.tcp);
        if !std::thread::panicking() {
            self.tcp.close();
        }
        // TODO: clean up non-fsynced WAL
    }
}

View File

@@ -0,0 +1,38 @@
use std::sync::Arc;
use safekeeper::{
simlib::{
network::{Delay, NetworkOptions},
world::World,
},
simtest::{start_simulation, Options},
};
use crate::{bindings::RunClientC, c_context};
/// Smoke test: drive the C client (`RunClientC`) inside a simulated world
/// with a very lossy network (50% failure probability) and a fixed seed,
/// so the run is deterministic.
#[test]
fn run_rust_c_test() {
    let delay = Delay {
        min: 1,
        max: 5,
        fail_prob: 0.5,
    };
    let network = NetworkOptions {
        keepalive_timeout: Some(50),
        connect_delay: delay.clone(),
        send_delay: delay.clone(),
    };
    // payload handed to the simulation harness for the client to deliver
    let u32_data: [u32; 5] = [1, 2, 3, 4, 5];
    let world = Arc::new(World::new(1337, Arc::new(network), c_context()));
    start_simulation(Options {
        world,
        time_limit: 1_000_000,
        client_fn: Box::new(move |_, server_id| unsafe {
            RunClientC(server_id);
        }),
        u32_data,
    });
}

View File

@@ -0,0 +1,234 @@
use std::{ops::Deref, sync::Arc};
use anyhow::Result;
use bytes::{Buf, BytesMut};
use log::{debug, info};
use postgres_ffi::{waldecoder::WalStreamDecoder, XLogSegNo};
use safekeeper::{control_file, safekeeper::SafeKeeperState, wal_storage};
use utils::lsn::Lsn;
use super::disk::TimelineDisk;
/// Control-file (SafeKeeperState) storage backed by the virtual TimelineDisk.
pub struct DiskStateStorage {
    /// In-memory copy of the last persisted state, served via Deref.
    persisted_state: SafeKeeperState,
    disk: Arc<TimelineDisk>,
}
impl DiskStateStorage {
    /// Snapshot the state currently on `disk` as the initial persisted state.
    pub fn new(disk: Arc<TimelineDisk>) -> Self {
        // The lock guard is a temporary here, so it is released as soon as
        // the clone completes.
        let persisted_state = disk.state.lock().clone();
        DiskStateStorage {
            persisted_state,
            disk,
        }
    }
}
impl control_file::Storage for DiskStateStorage {
    /// Persist the state both in memory (for Deref reads) and on the shared
    /// virtual disk.
    fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
        self.persisted_state = s.clone();
        *self.disk.state.lock() = s.clone();
        Ok(())
    }
}
/// Expose the last persisted state via Deref, as control_file::Storage
/// consumers expect.
impl Deref for DiskStateStorage {
    type Target = SafeKeeperState;
    fn deref(&self) -> &Self::Target {
        &self.persisted_state
    }
}
/// WAL storage stub: tracks only the flush LSN and stores no bytes.
pub struct DummyWalStore {
    lsn: Lsn,
}
impl DummyWalStore {
    pub fn new() -> Self {
        DummyWalStore { lsn: Lsn::INVALID }
    }
}
impl wal_storage::Storage for DummyWalStore {
    fn flush_lsn(&self) -> Lsn {
        self.lsn
    }
    /// Pretend the write succeeded; just advance the LSN past `buf`.
    fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
        self.lsn = startpos + buf.len() as u64;
        Ok(())
    }
    fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
        self.lsn = end_pos;
        Ok(())
    }
    /// Nothing is buffered, so flushing is a no-op.
    fn flush_wal(&mut self) -> Result<()> {
        Ok(())
    }
    /// Segment removal is a no-op for the dummy store.
    fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
        Box::new(move |_segno_up_to: XLogSegNo| Ok(()))
    }
    /// Metrics are not collected in simulation.
    fn get_metrics(&self) -> safekeeper::metrics::WalStorageMetrics {
        safekeeper::metrics::WalStorageMetrics::default()
    }
}
/// WAL storage backed by the in-memory BlockStorage of a TimelineDisk.
pub struct DiskWALStorage {
    /// Written to disk, but possibly still in the cache and not fully persisted.
    /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
    write_lsn: Lsn,
    /// The LSN of the last WAL record written to disk. Still can be not fully flushed.
    write_record_lsn: Lsn,
    /// The LSN of the last WAL record flushed to disk.
    flush_record_lsn: Lsn,
    /// Decoder is required for detecting boundaries of WAL records.
    decoder: WalStreamDecoder,
    /// Bytes written after flush_record_lsn that are not yet on the disk.
    unflushed_bytes: BytesMut,
    /// Shared virtual disk this storage writes through to.
    disk: Arc<TimelineDisk>,
}
impl DiskWALStorage {
    /// Open WAL storage for a timeline. If the timeline has committed WAL,
    /// scan the disk from commit_lsn to find the true end of WAL.
    pub fn new(disk: Arc<TimelineDisk>, state: &SafeKeeperState) -> Result<Self> {
        let write_lsn = if state.commit_lsn == Lsn(0) {
            Lsn(0)
        } else {
            Self::find_end_of_wal(disk.clone(), state.commit_lsn)?
        };
        let flush_lsn = write_lsn;
        Ok(DiskWALStorage {
            write_lsn,
            write_record_lsn: flush_lsn,
            flush_record_lsn: flush_lsn,
            // 15 — presumably the Postgres major version expected by the
            // decoder; confirm against WalStreamDecoder::new.
            decoder: WalStreamDecoder::new(flush_lsn, 15),
            unflushed_bytes: BytesMut::new(),
            disk,
        })
    }

    /// Decode records from `start_lsn` until decoding fails (zeroed or
    /// garbage bytes past the end), returning the end LSN of the last
    /// complete record. Termination relies on the decoder eventually
    /// erroring out on unwritten (all-zero) disk content.
    fn find_end_of_wal(disk: Arc<TimelineDisk>, start_lsn: Lsn) -> Result<Lsn> {
        let mut buf = [0; 8192];
        let mut pos = start_lsn.0;
        let mut decoder = WalStreamDecoder::new(start_lsn, 15);
        let mut result = start_lsn;
        loop {
            disk.wal.lock().read(pos, &mut buf);
            pos += buf.len() as u64;
            decoder.feed_bytes(&buf);
            loop {
                match decoder.poll_decode() {
                    Ok(Some(record)) => result = record.0,
                    Err(e) => {
                        debug!(
                            "find_end_of_wal reached end at {:?}, decode error: {:?}",
                            result, e
                        );
                        return Ok(result);
                    }
                    Ok(None) => break, // need more data
                }
            }
        }
    }
}
impl wal_storage::Storage for DiskWALStorage {
    /// LSN of the last record that is both complete and flushed to disk.
    fn flush_lsn(&self) -> Lsn {
        self.flush_record_lsn
    }

    /// Append WAL at `startpos` (must equal the current write position).
    /// Bytes are buffered in memory; the decoder tracks where the last
    /// complete record ends so flushes never split a record.
    fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
        if self.write_lsn != startpos {
            panic!("write_wal called with wrong startpos");
        }
        self.unflushed_bytes.extend_from_slice(buf);
        self.write_lsn += buf.len() as u64;
        // Re-seed the decoder if the stream position moved under it
        // (e.g. after a truncate).
        if self.decoder.available() != startpos {
            info!(
                "restart decoder from {} to {}",
                self.decoder.available(),
                startpos,
            );
            self.decoder = WalStreamDecoder::new(startpos, 15);
        }
        self.decoder.feed_bytes(buf);
        // Advance write_record_lsn to the end of the last complete record.
        loop {
            match self.decoder.poll_decode()? {
                None => break, // no full record yet
                Some((lsn, _rec)) => {
                    self.write_record_lsn = lsn;
                }
            }
        }
        Ok(())
    }

    /// Discard WAL after `end_pos`: flush what we have, zero the tail on
    /// disk, and reset all positions and the decoder to `end_pos`.
    fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
        if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
            panic!(
                "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
                self.write_lsn, end_pos
            );
        }
        self.flush_wal()?;
        // write zeroes to disk from end_pos until self.write_lsn
        let buf = [0; 8192];
        let mut pos = end_pos.0;
        while pos < self.write_lsn.0 {
            self.disk.wal.lock().write(pos, &buf);
            pos += buf.len() as u64;
        }
        self.write_lsn = end_pos;
        self.write_record_lsn = end_pos;
        self.flush_record_lsn = end_pos;
        self.unflushed_bytes.clear();
        self.decoder = WalStreamDecoder::new(end_pos, 15);
        Ok(())
    }

    /// Persist buffered bytes up to the last complete record boundary.
    fn flush_wal(&mut self) -> Result<()> {
        if self.flush_record_lsn == self.write_record_lsn {
            // no need to do extra flush
            return Ok(());
        }
        let num_bytes = self.write_record_lsn.0 - self.flush_record_lsn.0;
        self.disk.wal.lock().write(
            self.flush_record_lsn.0,
            &self.unflushed_bytes[..num_bytes as usize],
        );
        self.unflushed_bytes.advance(num_bytes as usize);
        self.flush_record_lsn = self.write_record_lsn;
        Ok(())
    }

    /// Segment removal is a no-op on the virtual disk.
    fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
        Box::new(move |_segno_up_to: XLogSegNo| Ok(()))
    }

    /// Metrics are not collected in simulation.
    fn get_metrics(&self) -> safekeeper::metrics::WalStorageMetrics {
        safekeeper::metrics::WalStorageMetrics::default()
    }
}

View File

@@ -0,0 +1,610 @@
use std::{ffi::CString, path::Path, str::FromStr, sync::Arc, collections::HashMap};
use rand::{Rng, SeedableRng};
use safekeeper::simlib::{
network::{Delay, NetworkOptions},
proto::AnyMessage,
time::EmptyEvent,
world::World,
world::{Node, NodeEvent, SEvent, NodeId},
};
use tracing::{debug, error, info, warn};
use utils::{id::TenantTimelineId, lsn::Lsn};
use crate::{
bindings::{
neon_tenant_walproposer, neon_timeline_walproposer, sim_redo_start_lsn, syncSafekeepers,
wal_acceptor_connection_timeout, wal_acceptor_reconnect_timeout, wal_acceptors_list,
MyInsertRecord, WalProposerCleanup, WalProposerRust,
},
c_context,
simtest::{
log::{init_logger, SimClock},
safekeeper::run_server,
},
};
use super::disk::Disk;
/// A simulated safekeeper: a world node plus its persistent virtual disk.
pub struct SkNode {
    pub node: Arc<Node>,
    pub id: u32,
    /// Survives crash_stop/restart, like a real machine's disk.
    pub disk: Arc<Disk>,
}
impl SkNode {
    /// Create a safekeeper node with a fresh empty disk and start its
    /// server thread immediately.
    pub fn new(node: Arc<Node>) -> Self {
        let disk = Arc::new(Disk::new());
        let res = Self {
            id: node.id,
            node,
            disk,
        };
        res.launch();
        res
    }

    /// Spawn the safekeeper server loop on this node.
    pub fn launch(&self) {
        let id = self.id;
        let disk = self.disk.clone();
        // start the server thread
        self.node.launch(move |os| {
            let res = run_server(os, disk);
            debug!("server {} finished: {:?}", id, res);
        });
    }

    /// Simulate a crash-restart: kill the node, then relaunch it on the
    /// same disk (only the persisted state survives).
    pub fn restart(&self) {
        self.node.crash_stop();
        self.launch();
    }
}
/// Knobs for constructing a simulation `Test`: network behavior, the
/// simulation-time test timeout, and an optional log clock.
pub struct TestConfig {
    pub network: NetworkOptions,
    pub timeout: u64,
    pub clock: Option<SimClock>,
}
impl TestConfig {
pub fn new(clock: Option<SimClock>) -> Self {
Self {
network: NetworkOptions {
keepalive_timeout: Some(2000),
connect_delay: Delay {
min: 1,
max: 5,
fail_prob: 0.0,
},
send_delay: Delay {
min: 1,
max: 5,
fail_prob: 0.0,
},
},
timeout: 1_000 * 10,
clock,
}
}
pub fn start(&self, seed: u64) -> Test {
let world = Arc::new(World::new(
seed,
Arc::new(self.network.clone()),
c_context(),
));
world.register_world();
if let Some(clock) = &self.clock {
clock.set_world(world.clone());
}
let servers = [
SkNode::new(world.new_node()),
SkNode::new(world.new_node()),
SkNode::new(world.new_node()),
];
let server_ids = [servers[0].id, servers[1].id, servers[2].id];
let safekeepers_guc = server_ids.map(|id| format!("node:{}", id)).join(",");
let ttid = TenantTimelineId::generate();
// wait init for all servers
world.await_all();
// clean up pgdata directory
self.init_pgdata();
Test {
world,
servers,
safekeepers_guc,
ttid,
timeout: self.timeout,
}
}
pub fn init_pgdata(&self) {
let pgdata = Path::new("/home/admin/simulator/libs/walproposer/pgdata");
if pgdata.exists() {
std::fs::remove_dir_all(pgdata).unwrap();
}
std::fs::create_dir(pgdata).unwrap();
// create empty pg_wal and pg_notify subdirs
std::fs::create_dir(pgdata.join("pg_wal")).unwrap();
std::fs::create_dir(pgdata.join("pg_notify")).unwrap();
// write postgresql.conf
let mut conf = std::fs::File::create(pgdata.join("postgresql.conf")).unwrap();
let content = "
wal_log_hints=off
hot_standby=on
fsync=off
wal_level=replica
restart_after_crash=off
shared_preload_libraries=neon
neon.pageserver_connstring=''
neon.tenant_id=cc6e67313d57283bad411600fbf5c142
neon.timeline_id=de6fa815c1e45aa61491c3d34c4eb33e
synchronous_standby_names=walproposer
neon.safekeepers='node:1,node:2,node:3'
max_connections=100
";
std::io::Write::write_all(&mut conf, content.as_bytes()).unwrap();
}
}
/// A running simulation: the world, three safekeepers, and everything
/// needed to launch walproposer processes against them.
pub struct Test {
    pub world: Arc<World>,
    pub servers: [SkNode; 3],
    /// Value for neon.safekeepers: "node:<id>,node:<id>,node:<id>".
    pub safekeepers_guc: String,
    pub ttid: TenantTimelineId,
    /// Simulation-time budget used by sync_safekeepers/poll_for_duration.
    pub timeout: u64,
}
impl Test {
    /// Spawn a `sync-safekeepers` walproposer process on a fresh node and
    /// return that node.
    fn launch_sync(&self) -> Arc<Node> {
        let client_node = self.world.new_node();
        debug!("sync-safekeepers started at node {}", client_node.id);
        // start the client thread
        let guc = self.safekeepers_guc.clone();
        let ttid = self.ttid.clone();
        client_node.launch(move |_| {
            let list = CString::new(guc).unwrap();
            // Configure the C walproposer globals, then run it. The leaked
            // CStrings (into_raw) are intentionally owned by the C side.
            unsafe {
                WalProposerCleanup();
                syncSafekeepers = true;
                wal_acceptors_list = list.into_raw();
                wal_acceptor_reconnect_timeout = 1000;
                wal_acceptor_connection_timeout = 5000;
                neon_tenant_walproposer =
                    CString::new(ttid.tenant_id.to_string()).unwrap().into_raw();
                neon_timeline_walproposer = CString::new(ttid.timeline_id.to_string())
                    .unwrap()
                    .into_raw();
                WalProposerRust();
            }
        });
        self.world.await_all();
        client_node
    }

    /// Run sync-safekeepers to completion and return the LSN it reported,
    /// or an error on timeout / non-zero exit code.
    pub fn sync_safekeepers(&self) -> anyhow::Result<Lsn> {
        let client_node = self.launch_sync();
        // poll until exit or timeout
        let time_limit = self.timeout;
        while self.world.step() && self.world.now() < time_limit && !client_node.is_finished() {}
        if !client_node.is_finished() {
            anyhow::bail!("timeout or idle stuck");
        }
        let res = client_node.result.lock().clone();
        if res.0 != 0 {
            anyhow::bail!("non-zero exitcode: {:?}", res);
        }
        let lsn = Lsn::from_str(&res.1)?;
        Ok(lsn)
    }

    /// Start a full walproposer at `lsn` (or at the usual post-basebackup
    /// LSN when `lsn` is 0).
    pub fn launch_walproposer(&self, lsn: Lsn) -> WalProposer {
        let client_node = self.world.new_node();
        let lsn = if lsn.0 == 0 {
            // usual LSN after basebackup
            Lsn(21623024)
        } else {
            lsn
        };
        // start the client thread
        let guc = self.safekeepers_guc.clone();
        let ttid = self.ttid.clone();
        client_node.launch(move |_| {
            let list = CString::new(guc).unwrap();
            unsafe {
                WalProposerCleanup();
                sim_redo_start_lsn = lsn.0;
                syncSafekeepers = false;
                wal_acceptors_list = list.into_raw();
                wal_acceptor_reconnect_timeout = 1000;
                wal_acceptor_connection_timeout = 5000;
                neon_tenant_walproposer =
                    CString::new(ttid.tenant_id.to_string()).unwrap().into_raw();
                neon_timeline_walproposer = CString::new(ttid.timeline_id.to_string())
                    .unwrap()
                    .into_raw();
                WalProposerRust();
            }
        });
        self.world.await_all();
        WalProposer {
            node: client_node,
        }
    }

    /// Advance simulation time by `duration`, capped at the test timeout.
    pub fn poll_for_duration(&self, duration: u64) {
        let time_limit = std::cmp::min(self.world.now() + duration, self.timeout);
        while self.world.step() && self.world.now() < time_limit {}
    }

    /// Drive a full test schedule: run sync-safekeepers, promote to a real
    /// walproposer, and apply timed actions (tx writes, restarts) in order.
    pub fn run_schedule(&self, schedule: &Schedule) -> anyhow::Result<()> {
        // Schedule a no-op event at every action time — presumably so
        // world.step() has wakeups at those points instead of idling past.
        {
            let empty_event = Box::new(EmptyEvent);
            let now = self.world.now();
            for (time, _) in schedule {
                if *time < now {
                    continue;
                }
                self.world.schedule(*time - now, empty_event.clone())
            }
        }
        let mut wait_node = self.launch_sync();
        // fake walproposer
        let mut wp = WalProposer {
            node: wait_node.clone(),
        };
        // true while sync-safekeepers runs (no real walproposer yet)
        let mut sync_in_progress = true;
        let mut skipped_tx = 0;
        let mut started_tx = 0;
        let mut schedule_ptr = 0;
        loop {
            // When sync finishes OK, promote to a real walproposer;
            // on a non-zero exit, retry the sync.
            if sync_in_progress && wait_node.is_finished() {
                let res = wait_node.result.lock().clone();
                if res.0 != 0 {
                    warn!("sync non-zero exitcode: {:?}", res);
                    debug!("restarting walproposer");
                    wait_node = self.launch_sync();
                    continue;
                }
                let lsn = Lsn::from_str(&res.1)?;
                debug!("sync-safekeepers finished at LSN {}", lsn);
                wp = self.launch_walproposer(lsn);
                wait_node = wp.node.clone();
                debug!("walproposer started at node {}", wait_node.id);
                sync_in_progress = false;
            }
            // Apply every schedule entry that is due (warn if overshot).
            let now = self.world.now();
            while schedule_ptr < schedule.len() && schedule[schedule_ptr].0 <= now {
                if now != schedule[schedule_ptr].0 {
                    warn!("skipped event {:?} at {}", schedule[schedule_ptr], now);
                }
                let action = &schedule[schedule_ptr].1;
                match action {
                    TestAction::WriteTx(size) => {
                        if !sync_in_progress && !wait_node.is_finished() {
                            started_tx += *size;
                            wp.write_tx(*size);
                            debug!("written {} transactions", size);
                        } else {
                            // no live walproposer to accept the txs
                            skipped_tx += size;
                            debug!("skipped {} transactions", size);
                        }
                    }
                    TestAction::RestartSafekeeper(id) => {
                        debug!("restarting safekeeper {}", id);
                        self.servers[*id as usize].restart();
                    }
                    TestAction::RestartWalProposer => {
                        debug!("restarting walproposer");
                        wait_node.crash_stop();
                        sync_in_progress = true;
                        wait_node = self.launch_sync();
                    }
                }
                schedule_ptr += 1;
            }
            if schedule_ptr == schedule.len() {
                break;
            }
            let next_event_time = schedule[schedule_ptr].0;
            // poll until the next event
            if wait_node.is_finished() {
                while self.world.step() && self.world.now() < next_event_time {}
            } else {
                while self.world.step()
                    && self.world.now() < next_event_time
                    && !wait_node.is_finished()
                {}
            }
        }
        debug!("finished schedule");
        debug!("skipped_tx: {}", skipped_tx);
        debug!("started_tx: {}", started_tx);
        Ok(())
    }
}
/// Handle to a walproposer running on a simulation node.
pub struct WalProposer {
    pub node: Arc<Node>,
}
impl WalProposer {
    /// Ask the walproposer to generate `cnt` transactions by delivering an
    /// internal Just32 message to its node.
    // NOTE(review): `cnt as u32` truncates on 64-bit; callers pass small
    // counts, but worth confirming.
    pub fn write_tx(&mut self, cnt: usize) {
        self.node
            .network_chan()
            .send(NodeEvent::Internal(AnyMessage::Just32(cnt as u32)));
    }
    /// Abruptly stop the walproposer node (crash, no cleanup).
    pub fn stop(&self) {
        self.node.crash_stop();
    }
}
/// A single action applied during a scheduled test run.
#[derive(Debug, Clone)]
pub enum TestAction {
    /// Write the given number of transactions through the walproposer.
    WriteTx(usize),
    /// Restart the safekeeper with the given index.
    RestartSafekeeper(usize),
    /// Crash-stop the walproposer; sync-safekeepers will run again.
    RestartWalProposer,
}
/// A test schedule: (simulation timestamp, action) pairs ordered by time.
pub type Schedule = Vec<(u64, TestAction)>;
/// Build a random but seed-deterministic schedule of writes and restarts.
pub fn generate_schedule(seed: u64) -> Schedule {
    let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
    // Keep original RNG draw order: count first, then per-entry draws.
    let cnt: i32 = rng.gen_range(1..100);
    let mut time: u64 = 0;
    (0..cnt)
        .map(|_| {
            time += rng.gen_range(0..500);
            let action = match rng.gen_range(0..3) {
                0 => TestAction::WriteTx(rng.gen_range(1..10)),
                1 => TestAction::RestartSafekeeper(rng.gen_range(0..3)),
                2 => TestAction::RestartWalProposer,
                _ => unreachable!(),
            };
            (time, action)
        })
        .collect()
}
/// Build random but seed-deterministic network options (delays, failure
/// probabilities, keepalive timeout) for a simulation run.
pub fn generate_network_opts(seed: u64) -> NetworkOptions {
    let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
    let timeout = rng.gen_range(100..2000);
    let max_delay = rng.gen_range(1..2 * timeout);
    let min_delay = rng.gen_range(1..=max_delay);
    let max_fail_prob = rng.gen_range(0.0..0.9);
    // gen_range panics on an empty range, and a float draw from 0.0..x can
    // legitimately return exactly 0.0 — guard those (rare) cases instead of
    // panicking. The guard leaves the draw sequence unchanged whenever the
    // upper bound is positive, so existing seeds reproduce the same options.
    let connect_fail_prob = if max_fail_prob > 0.0 {
        rng.gen_range(0.0..max_fail_prob)
    } else {
        0.0
    };
    let send_fail_prob = if connect_fail_prob > 0.0 {
        rng.gen_range(0.0..connect_fail_prob)
    } else {
        0.0
    };
    NetworkOptions {
        keepalive_timeout: Some(timeout),
        connect_delay: Delay {
            min: min_delay,
            max: max_delay,
            fail_prob: connect_fail_prob,
        },
        send_delay: Delay {
            min: min_delay,
            max: max_delay,
            fail_prob: send_fail_prob,
        },
    }
}
/// Role of a node as learned from its "started;<kind>" event.
/// Replaces the manual `impl Default` with the derive + `#[default]`
/// attribute (stable since Rust 1.62) — same behavior, less code.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
enum NodeKind {
    /// No "started" event observed for this node yet.
    #[default]
    Unknown,
    Safekeeper,
    WalProposer,
}
/// Per-node validation state, rebuilt while replaying recorded events.
#[derive(Clone, Debug, Default)]
struct NodeInfo {
    kind: NodeKind,
    // walproposer
    is_sync: bool,  // true when running in sync-safekeepers mode
    term: u64,      // term of the last won election
    epoch_lsn: u64, // LSN at which the proposer was elected
    // safekeeper
    commit_lsn: u64,
    flush_lsn: u64,
}
impl NodeInfo {
    /// Record the node's kind on first sighting; once known it must
    /// never change.
    fn init_kind(&mut self, kind: NodeKind) {
        match self.kind {
            NodeKind::Unknown => self.kind = kind,
            _ => assert!(self.kind == kind),
        }
    }

    /// Parse a "started;<kind>[;<is_sync>]" event and update this node.
    fn started(&mut self, data: &str) {
        let mut fields = data.split(';');
        let tag = fields.next().unwrap();
        assert!(tag == "started");
        let kind = fields.next().unwrap();
        if kind == "safekeeper" {
            self.init_kind(NodeKind::Safekeeper);
        } else if kind == "walproposer" {
            self.init_kind(NodeKind::WalProposer);
            // walproposer events carry a third field: nonzero means
            // sync-safekeepers mode.
            let sync_flag: u8 = fields.next().unwrap().parse().unwrap();
            self.is_sync = sync_flag != 0;
        } else {
            unreachable!();
        }
    }
}
/// Cluster-wide invariants tracked while validating recorded events.
#[derive(Debug, Default)]
struct GlobalState {
    nodes: Vec<NodeInfo>,
    commit_lsn: u64,      // highest commit LSN observed so far
    write_lsn: u64,       // current global write position
    max_write_lsn: u64,   // highest LSN ever written
    written_wal: u64,     // total WAL bytes written
    written_records: u64, // total WAL records written
}
impl GlobalState {
fn new() -> Self {
Default::default()
}
fn get(&mut self, id: u32) -> &mut NodeInfo {
let id = id as usize;
if id >= self.nodes.len() {
self.nodes.resize(id + 1, NodeInfo::default());
}
&mut self.nodes[id]
}
}
/// Replay recorded simulation events and assert WAL-safety invariants:
/// commit LSN monotonicity, contiguity of WAL writes, and election rules.
/// Panics (via assert!) on any violation.
pub fn validate_events(events: Vec<SEvent>) {
    // LSN right after initdb/basebackup; WAL positions start here.
    const INITDB_LSN: u64 = 21623024;
    // Restore the original panic hook only if validation succeeds; on a
    // failed assert the current hook stays installed for diagnostics.
    let hook = std::panic::take_hook();
    scopeguard::defer_on_success! {
        std::panic::set_hook(hook);
    };
    let mut state = GlobalState::new();
    state.max_write_lsn = INITDB_LSN;
    for event in events {
        debug!("{:?}", event);
        let node = state.get(event.node);
        if event.data.starts_with("started;") {
            node.started(&event.data);
            continue;
        }
        // Any non-"started" event requires the node kind to be known.
        assert!(node.kind != NodeKind::Unknown);
        // drop reference to unlock state
        let mut node = node.clone();
        let mut parts = event.data.split(';');
        match node.kind {
            NodeKind::Safekeeper => {
                match parts.next().unwrap() {
                    // "tli_loaded;<flush_lsn>;<commit_lsn>" — timeline
                    // state loaded from disk on safekeeper start.
                    "tli_loaded" => {
                        let flush_lsn: u64 = parts.next().unwrap().parse().unwrap();
                        let commit_lsn: u64 = parts.next().unwrap().parse().unwrap();
                        node.flush_lsn = flush_lsn;
                        node.commit_lsn = commit_lsn;
                    }
                    _ => unreachable!(),
                }
            }
            NodeKind::WalProposer => {
                match parts.next().unwrap() {
                    // "prop_elected;<lsn>;<term>;<prev_lsn>;<prev_term>"
                    "prop_elected" => {
                        let prop_lsn: u64 = parts.next().unwrap().parse().unwrap();
                        let prop_term: u64 = parts.next().unwrap().parse().unwrap();
                        let prev_lsn: u64 = parts.next().unwrap().parse().unwrap();
                        let prev_term: u64 = parts.next().unwrap().parse().unwrap();
                        // Election point may never regress, and must not
                        // lose anything already committed.
                        assert!(prop_lsn >= prev_lsn);
                        assert!(prop_term >= prev_term);
                        assert!(prop_lsn >= state.commit_lsn);
                        if prop_lsn > state.write_lsn {
                            // An election LSN past the current write
                            // position must still lie within WAL that was
                            // written at some point.
                            assert!(prop_lsn <= state.max_write_lsn);
                            debug!("moving write_lsn up from {} to {}", state.write_lsn, prop_lsn);
                            state.write_lsn = prop_lsn;
                        }
                        if prop_lsn < state.write_lsn {
                            // presumably the new proposer truncates the
                            // unacknowledged tail — NOTE(review): confirm.
                            debug!("moving write_lsn down from {} to {}", state.write_lsn, prop_lsn);
                            state.write_lsn = prop_lsn;
                        }
                        node.epoch_lsn = prop_lsn;
                        node.term = prop_term;
                    }
                    // "write_wal;<start_lsn>;<end_lsn>;<count>"
                    "write_wal" => {
                        assert!(!node.is_sync);
                        let start_lsn: u64 = parts.next().unwrap().parse().unwrap();
                        let end_lsn: u64 = parts.next().unwrap().parse().unwrap();
                        let cnt: u64 = parts.next().unwrap().parse().unwrap();
                        let size = end_lsn - start_lsn;
                        state.written_wal += size;
                        state.written_records += cnt;
                        // TODO: If we allow writing WAL before winning the election
                        assert!(start_lsn >= state.commit_lsn);
                        assert!(end_lsn >= start_lsn);
                        // Writes must be contiguous with the global write
                        // position.
                        assert!(start_lsn == state.write_lsn);
                        state.write_lsn = end_lsn;
                        if end_lsn > state.max_write_lsn {
                            state.max_write_lsn = end_lsn;
                        }
                    }
                    // "commit_lsn;<lsn>" — must be monotonic.
                    "commit_lsn" => {
                        let lsn: u64 = parts.next().unwrap().parse().unwrap();
                        assert!(lsn >= state.commit_lsn);
                        state.commit_lsn = lsn;
                    }
                    _ => unreachable!(),
                }
            }
            _ => unreachable!(),
        }
        // update the node in the state struct
        *state.get(event.node) = node;
    }
}

View File

@@ -0,0 +1,265 @@
use std::{ffi::CString, path::Path, str::FromStr, sync::Arc};
use rand::Rng;
use safekeeper::simlib::{
network::{Delay, NetworkOptions},
proto::AnyMessage,
world::World,
world::{Node, NodeEvent},
};
use tracing::{info, warn};
use utils::{id::TenantTimelineId, lsn::Lsn};
use crate::{
bindings::{
neon_tenant_walproposer, neon_timeline_walproposer, sim_redo_start_lsn, syncSafekeepers,
wal_acceptor_connection_timeout, wal_acceptor_reconnect_timeout, wal_acceptors_list,
MyInsertRecord, WalProposerCleanup, WalProposerRust,
},
c_context,
simtest::{
log::{init_logger, SimClock},
safekeeper::run_server,
util::{generate_schedule, TestConfig, generate_network_opts, validate_events},
}, enable_debug,
};
use super::{
disk::Disk,
util::{Schedule, TestAction},
};
/// Syncing a cluster of empty safekeepers must terminate at LSN 0 and be
/// idempotent. (Also fixes the "Sucessfully" typo in the log messages.)
#[test]
fn sync_empty_safekeepers() {
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));
    let test = config.start(1337);
    let lsn = test.sync_safekeepers().unwrap();
    assert_eq!(lsn, Lsn(0));
    info!("Successfully synced empty safekeepers at 0/0");
    // Running sync a second time must give the same result.
    let lsn = test.sync_safekeepers().unwrap();
    assert_eq!(lsn, Lsn(0));
    info!("Successfully synced (again) empty safekeepers at 0/0");
}
/// Start a walproposer against empty safekeepers and push 100 single-
/// transaction WAL writes. Fixes the unused loop variable warning
/// (`i` -> `_`) and the "Sucessfully" typo.
#[test]
fn run_walproposer_generate_wal() {
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));
    let test = config.start(1337);
    let lsn = test.sync_safekeepers().unwrap();
    assert_eq!(lsn, Lsn(0));
    info!("Successfully synced empty safekeepers at 0/0");
    let mut wp = test.launch_walproposer(lsn);
    test.poll_for_duration(30);
    // Generate 100 transactions, letting the simulation run between each.
    for _ in 0..100 {
        wp.write_tx(1);
        test.poll_for_duration(5);
    }
}
/// Crash (restart) a safekeeper while transactions are in flight and let
/// the cluster recover. Fixes the "Sucessfully" typo in the log message.
#[test]
fn crash_safekeeper() {
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));
    let test = config.start(1337);
    let lsn = test.sync_safekeepers().unwrap();
    assert_eq!(lsn, Lsn(0));
    info!("Successfully synced empty safekeepers at 0/0");
    let mut wp = test.launch_walproposer(lsn);
    test.poll_for_duration(30);
    // Write while safekeeper 0 goes down, then give the cluster time to
    // reconnect and recover.
    wp.write_tx(3);
    test.servers[0].restart();
    test.poll_for_duration(100);
    test.poll_for_duration(1000);
}
/// Stop the walproposer after some writes and verify sync-safekeepers still
/// completes. Fixes the "Sucessfully" typo in both log messages.
#[test]
fn test_simple_restart() {
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));
    let test = config.start(1337);
    let lsn = test.sync_safekeepers().unwrap();
    assert_eq!(lsn, Lsn(0));
    info!("Successfully synced empty safekeepers at 0/0");
    let mut wp = test.launch_walproposer(lsn);
    test.poll_for_duration(30);
    wp.write_tx(3);
    test.poll_for_duration(100);
    // Crash-stop the walproposer, then sync must still succeed.
    wp.stop();
    drop(wp);
    let lsn = test.sync_safekeepers().unwrap();
    info!("Successfully synced safekeepers at {}", lsn);
}
/// Run a small hand-written schedule mixing transaction writes with
/// safekeeper and walproposer restarts, including a burst where all three
/// safekeepers restart at the same timestamp.
#[test]
fn test_simple_schedule() -> anyhow::Result<()> {
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));
    config.network.keepalive_timeout = Some(100);
    let test = config.start(1337);
    let schedule: Schedule = vec![
        (0, TestAction::RestartWalProposer),
        (50, TestAction::WriteTx(5)),
        (100, TestAction::RestartSafekeeper(0)),
        (100, TestAction::WriteTx(5)),
        (110, TestAction::RestartSafekeeper(1)),
        (110, TestAction::WriteTx(5)),
        (120, TestAction::RestartSafekeeper(2)),
        (120, TestAction::WriteTx(5)),
        (201, TestAction::RestartWalProposer),
        (251, TestAction::RestartSafekeeper(0)),
        (251, TestAction::RestartSafekeeper(1)),
        (251, TestAction::RestartSafekeeper(2)),
        (251, TestAction::WriteTx(5)),
        (255, TestAction::WriteTx(5)),
        (1000, TestAction::WriteTx(5)),
    ];
    test.run_schedule(&schedule)?;
    info!("Test finished, stopping all threads");
    test.world.deallocate();
    Ok(())
}
/// Push 1000 transactions through the cluster and check that the final
/// commit LSN advanced far enough past the initdb LSN.
#[test]
fn test_many_tx() -> anyhow::Result<()> {
    enable_debug();
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));
    let test = config.start(1337);

    // 100 batches of 10 transactions, one batch every 10 time units.
    let schedule: Schedule = (0..100)
        .map(|i| (i * 10, TestAction::WriteTx(10)))
        .collect();
    test.run_schedule(&schedule)?;
    info!("Test finished, stopping all threads");
    test.world.stop_all();

    let events = test.world.take_events();
    info!("Events: {:?}", events);

    // The most recent commit_lsn event tells how far WAL actually advanced.
    let last_commit_lsn = events
        .iter()
        .rev()
        .find_map(|event| {
            if event.data.starts_with("commit_lsn;") {
                let lsn: u64 = event.data.split(';').nth(1).unwrap().parse().unwrap();
                Some(lsn)
            } else {
                None
            }
        })
        .unwrap();

    let initdb_lsn = 21623024;
    let diff = last_commit_lsn - initdb_lsn;
    info!("Last commit lsn: {}, diff: {}", last_commit_lsn, diff);
    assert!(diff > 1000 * 8);
    Ok(())
}
/// Fuzz the cluster: run many randomly generated schedules with random
/// network conditions and validate the recorded events after each run.
/// Fixes the unused loop variable warning (`i` -> `_`).
#[test]
fn test_random_schedules() -> anyhow::Result<()> {
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));
    config.network.keepalive_timeout = Some(100);
    for _ in 0..30000 {
        // A fresh random seed per iteration; it is logged so failures can
        // be reproduced with test_one_schedule.
        let seed: u64 = rand::thread_rng().gen();
        config.network = generate_network_opts(seed);
        let test = config.start(seed);
        warn!("Running test with seed {}", seed);
        let schedule = generate_schedule(seed);
        test.run_schedule(&schedule).unwrap();
        validate_events(test.world.take_events());
        test.world.deallocate();
    }
    Ok(())
}
/// Reproduce a single seed found failing by `test_random_schedules`.
/// Replace `seed` below to debug a different failure. (Removed the stale
/// commented-out copy of the same steps for a previous seed.)
#[test]
fn test_one_schedule() -> anyhow::Result<()> {
    enable_debug();
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));
    config.network.keepalive_timeout = Some(100);
    let seed = 3649773280641776194;
    config.network = generate_network_opts(seed);
    info!("network: {:?}", config.network);
    let test = config.start(seed);
    warn!("Running test with seed {}", seed);
    let schedule = generate_schedule(seed);
    info!("schedule: {:?}", schedule);
    test.run_schedule(&schedule).unwrap();
    validate_events(test.world.take_events());
    test.world.deallocate();
    Ok(())
}
/// Check resource cleanup: after stop_all and dropping the test handle,
/// deallocate() should release the world's resources (the strong counts
/// are logged for manual inspection of leaks / fd usage).
#[test]
fn test_res_dealloc() -> anyhow::Result<()> {
    // enable_debug();
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));
    // print pid — handy for attaching external tools (e.g. lsof) while
    // the test runs
    let pid = unsafe { libc::getpid() };
    info!("pid: {}", pid);
    let seed = 123456;
    config.network = generate_network_opts(seed);
    let test = config.start(seed);
    warn!("Running test with seed {}", seed);
    let schedule = generate_schedule(seed);
    info!("schedule: {:?}", schedule);
    test.run_schedule(&schedule).unwrap();
    test.world.stop_all();
    // Keep one clone of the world alive to observe the refcount before
    // and after deallocate().
    let world = test.world.clone();
    drop(test);
    info!("world strong count: {}", Arc::strong_count(&world));
    world.deallocate();
    info!("world strong count: {}", Arc::strong_count(&world));
    Ok(())
}

View File

@@ -0,0 +1,31 @@
use tracing::info;
use crate::bindings::{TestFunc, MyContextInit};
/// Smoke-test the Rust -> C FFI: initialize the C context on a dedicated
/// thread and call TestFunc. Fixes the redundant `let res = ...; res`
/// (clippy::let_and_return) inside the closure.
#[test]
fn test_rust_c_calls() {
    // A separate thread is used because MyContextInit sets up
    // process/thread-affine postgres state.
    let res = std::thread::spawn(|| unsafe {
        MyContextInit();
        TestFunc(1, 2)
    })
    .join()
    .unwrap();
    info!("res: {}", res);
}
/// Call MyContextInit + TestFunc from two threads, one after the other,
/// to check that repeated initialization is safe.
#[test]
fn test_sim_bindings() {
    for _ in 0..2 {
        std::thread::spawn(|| unsafe {
            MyContextInit();
            TestFunc(1, 2)
        })
        .join()
        .unwrap();
    }
}

100
libs/walproposer/test.c Normal file
View File

@@ -0,0 +1,100 @@
#include "bindgen_deps.h"
#include "rust_bindings.h"
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include "postgres.h"
#include "utils/memutils.h"
#include "utils/guc.h"
#include "miscadmin.h"
#include "common/pg_prng.h"
// From src/backend/main/main.c
const char *progname = "fakepostgres";
/*
 * Smoke-test entry point called from Rust: exercises printf, a callback
 * back into Rust (rust_function) and postgres elog, then returns a + b.
 */
int TestFunc(int a, int b) {
	printf("TestFunc: %d + %d = %d\n", a, b, a + b);
	rust_function(0);
	elog(LOG, "postgres elog test");
	printf("After rust_function\n");
	return a + b;
}
// This is a quick experiment with rewriting existing Rust code in C.
/*
 * Simulated client: opens a TCP connection to `serverId` and sends 5
 * replication cells, reopening the connection and resending on close,
 * until every cell has been acknowledged.
 */
void RunClientC(uint32_t serverId) {
	uint32_t clientId = sim_id();
	elog(LOG, "started client");
	int data_len = 5;
	int delivered = 0;
	int tcp = sim_open_tcp(serverId);
	while (delivered < data_len) {
		/* (re)send the next cell; the server acks with delivered + 1 */
		sim_msg_set_repl_cell(delivered+1, clientId, delivered);
		sim_tcp_send(tcp);
		Event event = sim_epoll_rcv(-1);
		switch (event.tag)
		{
		case Closed:
			/* connection dropped: reopen; the loop will resend */
			elog(LOG, "connection closed");
			tcp = sim_open_tcp(serverId);
			break;
		case Message:
			Assert(event.any_message == Just32);
			uint32_t val;
			sim_msg_get_just_u32(&val);
			/* an ack for the current cell advances the window */
			if (val == delivered + 1) {
				delivered += 1;
			}
			break;
		default:
			Assert(false);
		}
	}
}
bool debug_enabled = false;
bool initializedMemoryContext = false;
// pthread_mutex_init(&lock, NULL)?
pthread_mutex_t lock;
void MyContextInit() {
// initializes global variables, TODO how to make them thread-local?
pthread_mutex_lock(&lock);
if (!initializedMemoryContext) {
initializedMemoryContext = true;
MemoryContextInit();
pg_prng_seed(&pg_global_prng_state, 0);
setenv("PGDATA", "/home/admin/simulator/libs/walproposer/pgdata", 1);
/*
* Set default values for command-line options.
*/
InitializeGUCOptions();
/* Acquire configuration parameters */
if (!SelectConfigFiles(NULL, progname))
exit(1);
if (debug_enabled) {
log_min_messages = LOG;
} else {
log_min_messages = FATAL;
}
Log_line_prefix = "[%p] ";
InitializeMaxBackends();
ChangeToDataDir();
CreateSharedMemoryAndSemaphores();
SetInstallXLogFileSegmentActive();
// CreateAuxProcessResourceOwner();
// StartupXLOG();
}
pthread_mutex_unlock(&lock);
}

View File

@@ -13,14 +13,13 @@ OBJS = \
walproposer.o \
walproposer_utils.o
PG_CPPFLAGS = -I$(libpq_srcdir)
SHLIB_LINK_INTERNAL = $(libpq)
PG_CPPFLAGS = -I$(libpq_srcdir) -DSIMLIB
PG_LIBS = $(libpq)
EXTENSION = neon
DATA = neon--1.0.sql
PGFILEDESC = "neon - cloud storage for PostgreSQL"
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

1
pgxn/neon/rust_bindings.h Symbolic link
View File

@@ -0,0 +1 @@
../../libs/walproposer/rust_bindings.h

View File

@@ -71,7 +71,7 @@
#include "walproposer.h"
#include "walproposer_utils.h"
static bool syncSafekeepers = false;
bool syncSafekeepers = false;
char *wal_acceptors_list;
int wal_acceptor_reconnect_timeout;
@@ -84,6 +84,11 @@ char *neon_safekeeper_token_walproposer = NULL;
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
#ifdef SIMLIB
#include "rust_bindings.h"
#define GetCurrentTimestamp() ((TimestampTz) sim_now())
#endif
static int n_safekeepers = 0;
static int quorum = 0;
static Safekeeper safekeeper[MAX_SAFEKEEPERS];
@@ -316,6 +321,84 @@ nwp_shmem_startup_hook(void)
WalproposerShmemInit();
}
void WalProposerCleanup()
{
for (int i = 0; i < n_safekeepers; i++)
{
if (safekeeper[i].xlogreader)
{
XLogReaderFree(safekeeper[i].xlogreader);
safekeeper[i].xlogreader = NULL;
}
}
n_safekeepers = 0;
quorum = 0;
lastSentCommitLsn = 0;
availableLsn = 0;
lastSentCommitLsn = 0;
truncateLsn = 0;
propTerm = 0;
propTermHistory.entries = NULL;
propTermHistory.n_entries = 0;
propEpochStartLsn = 0;
donorEpoch = 0;
donor = 0;
timelineStartLsn = 0;
n_votes = 0;
n_connected = 0;
last_reconnect_attempt = 0;
walprop_shared = palloc(WalproposerShmemSize());
if (walprop_shared != NULL)
{
memset(walprop_shared, 0, WalproposerShmemSize());
SpinLockInit(&walprop_shared->mutex);
pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
}
XLogWalPropClose(0);
}
void InitMyInsert();
void WalProposerRust()
{
struct stat stat_buf;
walprop_log(LOG, "WalProposerRust");
InitMyInsert();
sim_log("started;walproposer;%d", (int) syncSafekeepers);
#if PG_VERSION_NUM < 150000
ThisTimeLineID = 1;
#endif
ChangeToDataDir();
/* Create pg_wal directory, if it doesn't exist */
if (stat(XLOGDIR, &stat_buf) != 0)
{
ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR)));
if (MakePGDirectory(XLOGDIR) < 0)
{
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create directory \"%s\": %m",
XLOGDIR)));
exit(1);
}
}
WalProposerInit(0, 0);
BackgroundWorkerUnblockSignals();
WalProposerStart();
}
/*
* WAL proposer bgworker entry point.
*/
@@ -377,6 +460,68 @@ WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos)
BroadcastAppendRequest();
}
#ifdef SIMLIB
XLogRecPtr MyInsertRecord();
int
SimWaitEventSetWait(Safekeeper **sk, long timeout, WaitEvent *occurred_events)
{
Event event = sim_epoll_peek(timeout);
if (event.tag == Closed) {
sim_epoll_rcv(0);
for (int i = 0; i < n_safekeepers; i++) {
if (safekeeper[i].conn && ((int64_t) walprop_socket(safekeeper[i].conn)) == event.tcp) {
walprop_log(LOG, "connection to %s:%s is closed", safekeeper[i].host, safekeeper[i].port);
ResetConnection(&safekeeper[i]);
}
}
return 0;
} else if (event.tag == Message && event.any_message == Bytes) {
// !!! code must read the message
for (int i = 0; i < n_safekeepers; i++) {
if (safekeeper[i].conn && ((int64_t) walprop_socket(safekeeper[i].conn)) == event.tcp) {
*occurred_events = (WaitEvent) {
.events = WL_SOCKET_READABLE,
};
*sk = &safekeeper[i];
return 1;
}
}
walprop_log(FATAL, "unknown tcp connection");
} else if (event.tag == Internal && event.any_message == Just32) {
uint32_t tx_count;
XLogRecPtr start_lsn = sim_latest_available_lsn;
XLogRecPtr finish_lsn = sim_latest_available_lsn;
Assert(!syncSafekeepers);
sim_epoll_rcv(0);
sim_msg_get_just_u32(&tx_count);
// don't write WAL before winning the election
if (propEpochStartLsn != 0)
{
for (uint32_t i = 0; i < tx_count; i++)
{
finish_lsn = MyInsertRecord();
}
sim_log("write_wal;%lu;%lu;%d", start_lsn, finish_lsn, (int) tx_count);
sim_latest_available_lsn = finish_lsn;
}
*occurred_events = (WaitEvent) {
.events = WL_LATCH_SET,
};
return 1;
} else if (event.tag == Timeout) {
return 0;
} else {
Assert(false);
}
}
#endif
/*
* Advance the WAL proposer state machine, waiting each time for events to occur.
* Will exit only when latch is set, i.e. new WAL should be pushed from walsender
@@ -392,16 +537,25 @@ WalProposerPoll(void)
WaitEvent event;
TimestampTz now = GetCurrentTimestamp();
#ifndef SIMLIB
rc = WaitEventSetWait(waitEvents, TimeToReconnect(now),
&event, 1, WAIT_EVENT_WAL_SENDER_MAIN);
sk = (Safekeeper *) event.user_data;
#else
rc = SimWaitEventSetWait(&sk, TimeToReconnect(now), &event);
#endif
/*
* If the event contains something that one of our safekeeper states
* was waiting for, we'll advance its state.
*/
if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)))
{
AdvancePollState(sk, event.events);
#ifdef SIMLIB
// TODO: assert that code consumed incoming message
#endif
}
/*
* If the timeout expired, attempt to reconnect to any safekeepers
@@ -416,7 +570,9 @@ WalProposerPoll(void)
*/
if (rc != 0 && (event.events & WL_LATCH_SET))
{
#ifndef SIMLIB
ResetLatch(MyLatch);
#endif
break;
}
@@ -431,6 +587,7 @@ WalProposerPoll(void)
*/
if (availableLsn != InvalidXLogRecPtr)
{
walprop_log(LOG, "no WAL generated during timeout, sending pool message");
BroadcastAppendRequest();
}
@@ -445,7 +602,7 @@ WalProposerPoll(void)
if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
wal_acceptor_connection_timeout))
{
elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms",
walprop_log(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms",
sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout);
ShutdownConnection(sk);
}
@@ -486,16 +643,18 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
char *sep;
char *port;
#ifndef SIMLIB
load_file("libpqwalreceiver", false);
if (WalReceiverFunctions == NULL)
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
walprop_log(ERROR, "libpqwalreceiver didn't initialize correctly");
#endif
for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep)
{
port = strchr(host, ':');
if (port == NULL)
{
elog(FATAL, "port is not specified");
walprop_log(FATAL, "port is not specified");
}
*port++ = '\0';
sep = strchr(port, ',');
@@ -503,8 +662,11 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
*sep++ = '\0';
if (n_safekeepers + 1 >= MAX_SAFEKEEPERS)
{
elog(FATAL, "Too many safekeepers");
walprop_log(FATAL, "Too many safekeepers");
}
memset(&safekeeper[n_safekeepers], 0, sizeof(Safekeeper));
safekeeper[n_safekeepers].host = host;
safekeeper[n_safekeepers].port = port;
safekeeper[n_safekeepers].state = SS_OFFLINE;
@@ -526,13 +688,13 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
}
if (written > MAXCONNINFO || written < 0)
elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
}
initStringInfo(&safekeeper[n_safekeepers].outbuf);
safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
if (safekeeper[n_safekeepers].xlogreader == NULL)
elog(FATAL, "Failed to allocate xlog reader");
walprop_log(FATAL, "Failed to allocate xlog reader");
safekeeper[n_safekeepers].flushWrite = false;
safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr;
@@ -540,7 +702,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
}
if (n_safekeepers < 1)
{
elog(FATAL, "Safekeepers addresses are not specified");
walprop_log(FATAL, "Safekeepers addresses are not specified");
}
quorum = n_safekeepers / 2 + 1;
@@ -551,15 +713,15 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId));
greetRequest.systemId = systemId;
if (!neon_timeline_walproposer)
elog(FATAL, "neon.timeline_id is not provided");
walprop_log(FATAL, "neon.timeline_id is not provided");
if (*neon_timeline_walproposer != '\0' &&
!HexDecodeString(greetRequest.timeline_id, neon_timeline_walproposer, 16))
elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer);
walprop_log(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer);
if (!neon_tenant_walproposer)
elog(FATAL, "neon.tenant_id is not provided");
walprop_log(FATAL, "neon.tenant_id is not provided");
if (*neon_tenant_walproposer != '\0' &&
!HexDecodeString(greetRequest.tenant_id, neon_tenant_walproposer, 16))
elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer);
walprop_log(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer);
#if PG_VERSION_NUM >= 150000
/* FIXME don't use hardcoded timeline id */
@@ -592,12 +754,14 @@ WalProposerLoop(void)
WalProposerPoll();
}
#ifndef SIMLIB
/* Initializes the internal event set, provided that it is currently null */
static void
InitEventSet(void)
{
if (waitEvents)
elog(FATAL, "double-initialization of event set");
walprop_log(FATAL, "double-initialization of event set");
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers);
AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
@@ -663,6 +827,26 @@ HackyRemoveWalProposerEvent(Safekeeper *to_remove)
}
}
#else
static void
InitEventSet(void)
{
walprop_log(DEBUG5, "InitEventSet");
}
static void
UpdateEventSet(Safekeeper *sk, uint32 events)
{
walprop_log(DEBUG5, "UpdateEventSet");
}
static void
HackyRemoveWalProposerEvent(Safekeeper *to_remove)
{
walprop_log(DEBUG5, "HackyRemoveWalProposerEvent");
}
#endif
/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */
static void
ShutdownConnection(Safekeeper *sk)
@@ -707,7 +891,7 @@ ResetConnection(Safekeeper *sk)
* PGconn structure"
*/
if (!sk->conn)
elog(FATAL, "failed to allocate new PGconn object");
walprop_log(FATAL, "failed to allocate new PGconn object");
/*
* PQconnectStart won't actually start connecting until we run
@@ -725,7 +909,7 @@ ResetConnection(Safekeeper *sk)
*
* https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
*/
elog(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
sk->host, sk->port, walprop_error_message(sk->conn));
/*
@@ -750,13 +934,18 @@ ResetConnection(Safekeeper *sk)
* (see libpqrcv_connect, defined in
* src/backend/replication/libpqwalreceiver/libpqwalreceiver.c)
*/
elog(LOG, "connecting with node %s:%s", sk->host, sk->port);
walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port);
sk->state = SS_CONNECTING_WRITE;
sk->latestMsgReceivedAt = GetCurrentTimestamp();
#ifndef SIMLIB
sock = walprop_socket(sk->conn);
sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk);
#else
HandleConnectionEvent(sk);
RecvStartWALPushResult(sk);
#endif
return;
}
@@ -819,7 +1008,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
* ResetConnection
*/
case SS_OFFLINE:
elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
sk->host, sk->port);
break; /* actually unreachable, but prevents
* -Wimplicit-fallthrough */
@@ -855,7 +1044,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
* requests.
*/
case SS_VOTING:
elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
sk->port, FormatSafekeeperState(sk->state));
ResetConnection(sk);
return;
@@ -884,7 +1073,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
* Idle state for waiting votes from quorum.
*/
case SS_IDLE:
elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
sk->port, FormatSafekeeperState(sk->state));
ResetConnection(sk);
return;
@@ -909,7 +1098,7 @@ HandleConnectionEvent(Safekeeper *sk)
switch (result)
{
case WP_CONN_POLLING_OK:
elog(LOG, "connected with node %s:%s", sk->host,
walprop_log(LOG, "connected with node %s:%s", sk->host,
sk->port);
sk->latestMsgReceivedAt = GetCurrentTimestamp();
/*
@@ -932,7 +1121,7 @@ HandleConnectionEvent(Safekeeper *sk)
break;
case WP_CONN_POLLING_FAILED:
elog(WARNING, "failed to connect to node '%s:%s': %s",
walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
sk->host, sk->port, walprop_error_message(sk->conn));
/*
@@ -945,12 +1134,14 @@ HandleConnectionEvent(Safekeeper *sk)
return;
}
#ifndef SIMLIB
/*
* Because PQconnectPoll can change the socket, we have to un-register the
* old event and re-register an event on the new socket.
*/
HackyRemoveWalProposerEvent(sk);
sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk);
#endif
/* If we successfully connected, send START_WAL_PUSH query */
if (result == WP_CONN_POLLING_OK)
@@ -967,7 +1158,7 @@ SendStartWALPush(Safekeeper *sk)
{
if (!walprop_send_query(sk->conn, "START_WAL_PUSH"))
{
elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
sk->host, sk->port, walprop_error_message(sk->conn));
ShutdownConnection(sk);
return;
@@ -1002,7 +1193,7 @@ RecvStartWALPushResult(Safekeeper *sk)
break;
case WP_EXEC_FAILED:
elog(WARNING, "Failed to send query to safekeeper %s:%s: %s",
walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
sk->host, sk->port, walprop_error_message(sk->conn));
ShutdownConnection(sk);
return;
@@ -1013,7 +1204,7 @@ RecvStartWALPushResult(Safekeeper *sk)
* wrong"
*/
case WP_EXEC_UNEXPECTED_SUCCESS:
elog(WARNING, "Received bad response from safekeeper %s:%s query execution",
walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
sk->host, sk->port);
ShutdownConnection(sk);
return;
@@ -1060,7 +1251,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
if (n_connected == quorum)
{
propTerm++;
elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm);
walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm);
voteRequest = (VoteRequest)
{
@@ -1073,7 +1264,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
else if (sk->greetResponse.term > propTerm)
{
/* Another compute with higher term is running. */
elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
sk->host, sk->port,
sk->greetResponse.term, propTerm);
}
@@ -1113,7 +1304,7 @@ static void
SendVoteRequest(Safekeeper *sk)
{
/* We have quorum for voting, send our vote request */
elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term);
walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term);
/* On failure, logging & resetting is handled */
if (!BlockingWrite(sk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT))
return;
@@ -1128,7 +1319,7 @@ RecvVoteResponse(Safekeeper *sk)
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->voteResponse))
return;
elog(LOG,
walprop_log(LOG,
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
@@ -1143,7 +1334,7 @@ RecvVoteResponse(Safekeeper *sk)
if ((!sk->voteResponse.voteGiven) &&
(sk->voteResponse.term > propTerm || n_votes < quorum))
{
elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
sk->host, sk->port,
sk->voteResponse.term, propTerm);
}
@@ -1188,17 +1379,24 @@ HandleElectedProposer(void)
*/
if (truncateLsn < propEpochStartLsn)
{
elog(LOG,
walprop_log(LOG,
"start recovery because truncateLsn=%X/%X is not "
"equal to epochStartLsn=%X/%X",
LSN_FORMAT_ARGS(truncateLsn),
LSN_FORMAT_ARGS(propEpochStartLsn));
/* Perform recovery */
if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn))
elog(FATAL, "Failed to recover state");
walprop_log(FATAL, "Failed to recover state");
}
else if (syncSafekeepers)
{
#ifdef SIMLIB
char lsn_str[8 + 1 + 8 + 1];
snprintf(lsn_str, sizeof(lsn_str), "%X/%X", LSN_FORMAT_ARGS(propEpochStartLsn));
sim_exit(0, lsn_str);
#endif
/* Sync is not needed: just exit */
fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn));
exit(0);
@@ -1275,6 +1473,7 @@ static void
DetermineEpochStartLsn(void)
{
TermHistory *dth;
int n_ready = 0;
propEpochStartLsn = InvalidXLogRecPtr;
donorEpoch = 0;
@@ -1285,6 +1484,8 @@ DetermineEpochStartLsn(void)
{
if (safekeeper[i].state == SS_IDLE)
{
n_ready++;
if (GetEpoch(&safekeeper[i]) > donorEpoch ||
(GetEpoch(&safekeeper[i]) == donorEpoch &&
safekeeper[i].voteResponse.flushLsn > propEpochStartLsn))
@@ -1301,7 +1502,7 @@ DetermineEpochStartLsn(void)
if (timelineStartLsn != InvalidXLogRecPtr &&
timelineStartLsn != safekeeper[i].voteResponse.timelineStartLsn)
{
elog(WARNING,
walprop_log(WARNING,
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
LSN_FORMAT_ARGS(timelineStartLsn),
LSN_FORMAT_ARGS(safekeeper[i].voteResponse.timelineStartLsn));
@@ -1311,6 +1512,9 @@ DetermineEpochStartLsn(void)
}
}
if (n_ready < quorum)
walprop_log(FATAL, "missing majority of votes, expected %d, got %d", n_votes, n_ready);
/*
* If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing
* was committed yet. Start streaming then from the basebackup LSN.
@@ -1322,7 +1526,18 @@ DetermineEpochStartLsn(void)
{
timelineStartLsn = GetRedoStartLsn();
}
elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn));
walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn));
}
if (truncateLsn == InvalidXLogRecPtr && timelineStartLsn == propEpochStartLsn)
{
/*
* If truncateLsn is 0 everywhere, we are bootstrapping -- nothing was
* committed yet. But if timelineStartLsn is not 0, we already know
* the first record location, so we can bump truncateLsn to it.
*/
truncateLsn = timelineStartLsn;
walprop_log(LOG, "bumped truncateLsn to timelineStartLsn %X/%X", LSN_FORMAT_ARGS(truncateLsn));
}
/*
@@ -1349,13 +1564,24 @@ DetermineEpochStartLsn(void)
propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm;
propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn;
elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
quorum,
propTerm,
LSN_FORMAT_ARGS(propEpochStartLsn),
safekeeper[donor].host, safekeeper[donor].port,
LSN_FORMAT_ARGS(truncateLsn));
{
XLogRecPtr prev_lsn = 0;
term_t prev_term = 0;
if (propTermHistory.n_entries > 1)
{
prev_lsn = propTermHistory.entries[propTermHistory.n_entries - 2].lsn;
prev_term = propTermHistory.entries[propTermHistory.n_entries - 2].term;
}
sim_log("prop_elected;%lu;%lu;%lu;%lu", propEpochStartLsn, propTerm, prev_lsn, prev_term);
}
/*
* Ensure the basebackup we are running (at RedoStartLsn) matches LSN
* since which we are going to write according to the consensus. If not,
@@ -1379,7 +1605,7 @@ DetermineEpochStartLsn(void)
if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
walprop_shared->mineLastElectedTerm)))
{
elog(PANIC,
walprop_log(PANIC,
"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
LSN_FORMAT_ARGS(propEpochStartLsn),
LSN_FORMAT_ARGS(GetRedoStartLsn()));
@@ -1389,6 +1615,60 @@ DetermineEpochStartLsn(void)
}
}
#ifdef SIMLIB
/*
 * Simulation-only variant of WalProposerRecovery: pull the missing WAL range
 * [startpos, endpos) from the donor safekeeper over a simulated TCP
 * connection and append it locally via XLogWalPropWrite().
 *
 * `timeline` is unused in the simulated variant (the donor is addressed by
 * node id parsed out of its conninfo instead).
 *
 * Returns true iff recovery reached exactly endpos; a premature Closed event
 * on the virtual connection yields false.
 */
static bool
WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos)
{
	int			node_id;
	int64_t		tcp_id;
	char		startcmd[1024];
	int			len;
	XLogRecPtr	pos = startpos;
	const char *connstr_prefix = "host=node port=";

	/* In the simulator every conninfo looks like "host=node port=<node_id>". */
	Assert(strncmp(safekeeper[donor].conninfo, connstr_prefix, strlen(connstr_prefix)) == 0);
	node_id = atoi(safekeeper[donor].conninfo + strlen(connstr_prefix));
	tcp_id = sim_open_tcp_nopoll(node_id);

	/*
	 * Use INT64_FORMAT instead of a hard-coded %ld: on LLP64 platforms
	 * "long" is 32 bits and %ld would truncate the LSNs. The emitted text
	 * (a decimal integer) is unchanged.
	 */
	len = snprintf(
		startcmd,
		sizeof(startcmd),
		"START_REPLICATION %s %s " INT64_FORMAT " " INT64_FORMAT,
		neon_tenant_walproposer,
		neon_timeline_walproposer,
		(int64_t) startpos,
		(int64_t) endpos
	);
	/* Cast avoids a signed/unsigned comparison; len > 0 is checked first. */
	Assert(len > 0 && (size_t) len < sizeof(startcmd));
	sim_msg_set_bytes(startcmd, len);
	sim_tcp_send(tcp_id);

	while (pos < endpos)
	{
		uintptr_t	msg_len;
		char	   *msg;
		Event		event = sim_tcp_recv(tcp_id);

		/* Donor went away before we reached endpos: report partial recovery. */
		if (event.tag == Closed)
			break;
		Assert(event.tag == Message);
		walprop_log(LOG, "recovery received event %d", (int) event.any_message);
		Assert(event.any_message == Bytes);
		msg = (char *) sim_msg_get_bytes(&msg_len);
		XLogWalPropWrite(msg, msg_len, pos);
		pos += msg_len;
	}
	walprop_log(LOG, "recovery finished at %X/%X, from %X/%X to %X/%X",
				LSN_FORMAT_ARGS(pos),
				LSN_FORMAT_ARGS(startpos),
				LSN_FORMAT_ARGS(endpos));
	return pos == endpos;
}
#else
/*
* Receive WAL from most advanced safekeeper
*/
@@ -1408,7 +1688,7 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
err)));
return false;
}
elog(LOG,
walprop_log(LOG,
"start recovery from %s:%s starting from %X/%08X till %X/%08X timeline "
"%d",
safekeeper[donor].host, safekeeper[donor].port, (uint32) (startpos >> 32),
@@ -1474,6 +1754,7 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
return true;
}
#endif
/*
* Determine for sk the starting streaming point and send it message
@@ -1542,7 +1823,7 @@ SendProposerElected(Safekeeper *sk)
*/
sk->startStreamingAt = truncateLsn;
elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
sk->host, sk->port, LSN_FORMAT_ARGS(propTermHistory.entries[0].lsn),
LSN_FORMAT_ARGS(sk->startStreamingAt));
}
@@ -1577,7 +1858,7 @@ SendProposerElected(Safekeeper *sk)
msg.timelineStartLsn = timelineStartLsn;
lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0;
elog(LOG,
walprop_log(LOG,
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
@@ -1607,7 +1888,7 @@ WalProposerStartStreaming(XLogRecPtr startpos)
{
StartReplicationCmd cmd;
elog(LOG, "WAL proposer starts streaming at %X/%X",
walprop_log(LOG, "WAL proposer starts streaming at %X/%X",
LSN_FORMAT_ARGS(startpos));
cmd.slotname = WAL_PROPOSER_SLOT_NAME;
cmd.timeline = greetRequest.timeline;
@@ -1809,7 +2090,7 @@ SendAppendRequests(Safekeeper *sk)
return true;
case PG_ASYNC_WRITE_FAIL:
elog(WARNING, "Failed to send to node %s:%s in %s state: %s",
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk->state),
walprop_error_message(sk->conn));
ShutdownConnection(sk);
@@ -1858,7 +2139,7 @@ RecvAppendResponses(Safekeeper *sk)
if (sk->appendResponse.term > propTerm)
{
/* Another compute with higher term is running. */
elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
sk->host, sk->port,
sk->appendResponse.term, propTerm);
}
@@ -1877,6 +2158,7 @@ RecvAppendResponses(Safekeeper *sk)
minQuorumLsn = GetAcknowledgedByQuorumWALPosition();
if (minQuorumLsn > lastSentCommitLsn)
{
sim_log("commit_lsn;%lu", minQuorumLsn);
BroadcastAppendRequest();
lastSentCommitLsn = minQuorumLsn;
}
@@ -1904,7 +2186,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->currentClusterSize = pq_getmsgint64(reply_message);
elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu",
walprop_log(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu",
rf->currentClusterSize);
}
else if (strcmp(key, "ps_writelsn") == 0)
@@ -1912,7 +2194,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->ps_writelsn = pq_getmsgint64(reply_message);
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X",
walprop_log(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X",
LSN_FORMAT_ARGS(rf->ps_writelsn));
}
else if (strcmp(key, "ps_flushlsn") == 0)
@@ -1920,7 +2202,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->ps_flushlsn = pq_getmsgint64(reply_message);
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X",
walprop_log(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X",
LSN_FORMAT_ARGS(rf->ps_flushlsn));
}
else if (strcmp(key, "ps_applylsn") == 0)
@@ -1928,7 +2210,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->ps_applylsn = pq_getmsgint64(reply_message);
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X",
walprop_log(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X",
LSN_FORMAT_ARGS(rf->ps_applylsn));
}
else if (strcmp(key, "ps_replytime") == 0)
@@ -1941,7 +2223,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
/* Copy because timestamptz_to_str returns a static buffer */
replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime));
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s",
walprop_log(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s",
rf->ps_replytime, replyTimeStr);
pfree(replyTimeStr);
@@ -1956,7 +2238,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
* Skip unknown keys to support backward compatibile protocol
* changes
*/
elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len);
walprop_log(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len);
pq_getmsgbytes(reply_message, len);
};
}
@@ -2107,7 +2389,7 @@ GetLatestNeonFeedback(ReplicationFeedback * rf)
rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn;
rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime;
elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
walprop_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
" ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu",
rf->currentClusterSize,
LSN_FORMAT_ARGS(rf->ps_writelsn),
@@ -2133,7 +2415,9 @@ HandleSafekeeperResponse(void)
{
/* Get ReplicationFeedback fields from the most advanced safekeeper */
GetLatestNeonFeedback(&quorumFeedback.rf);
#ifndef SIMLIB
SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
#endif
}
if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn)
@@ -2142,6 +2426,7 @@ HandleSafekeeperResponse(void)
if (minQuorumLsn > quorumFeedback.flushLsn)
quorumFeedback.flushLsn = minQuorumLsn;
#ifndef SIMLIB
/* advance the replication slot */
if (!syncSafekeepers)
ProcessStandbyReply(
@@ -2156,18 +2441,31 @@ HandleSafekeeperResponse(void)
*/
quorumFeedback.rf.ps_flushlsn,
GetCurrentTimestamp(), false);
#endif
#ifdef SIMLIB
if (!syncSafekeepers)
{
char lsn_str[8 + 1 + 8 + 1];
snprintf(lsn_str, sizeof(lsn_str), "%X/%X", LSN_FORMAT_ARGS(quorumFeedback.flushLsn));
sim_set_result(1, lsn_str);
}
#endif
}
CombineHotStanbyFeedbacks(&hsFeedback);
if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0)
{
quorumFeedback.hs = hsFeedback;
#ifndef SIMLIB
if (!syncSafekeepers)
ProcessStandbyHSFeedback(hsFeedback.ts,
XidFromFullTransactionId(hsFeedback.xmin),
EpochFromFullTransactionId(hsFeedback.xmin),
XidFromFullTransactionId(hsFeedback.catalog_xmin),
EpochFromFullTransactionId(hsFeedback.catalog_xmin));
#endif
}
/*
@@ -2227,6 +2525,13 @@ HandleSafekeeperResponse(void)
}
if (n_synced >= quorum)
{
#ifdef SIMLIB
char lsn_str[8 + 1 + 8 + 1];
snprintf(lsn_str, sizeof(lsn_str), "%X/%X", LSN_FORMAT_ARGS(propEpochStartLsn));
sim_exit(0, lsn_str);
#endif
/* All safekeepers synced! */
fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn));
exit(0);
@@ -2251,7 +2556,7 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
return false;
case PG_ASYNC_READ_FAIL:
elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
sk->port, FormatSafekeeperState(sk->state),
walprop_error_message(sk->conn));
ShutdownConnection(sk);
@@ -2281,7 +2586,12 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg)
if (!(AsyncRead(sk, &buf, &buf_size)))
return false;
/* parse it */
// for (int i = 0; i < buf_size; i++) {
// fprintf(stderr, "%02x", buf[i]);
// }
// fprintf(stderr, "\n");
/* parse it */
s.data = buf;
s.len = buf_size;
s.cursor = 0;
@@ -2289,7 +2599,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg)
tag = pq_getmsgint64_le(&s);
if (tag != anymsg->tag)
{
elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
sk->port, FormatSafekeeperState(sk->state));
ResetConnection(sk);
return false;
@@ -2364,7 +2674,7 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
if (!walprop_blocking_write(sk->conn, msg, msg_size))
{
elog(WARNING, "Failed to send to node %s:%s in %s state: %s",
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk->state),
walprop_error_message(sk->conn));
ShutdownConnection(sk);
@@ -2409,7 +2719,7 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
UpdateEventSet(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
return false;
case PG_ASYNC_WRITE_FAIL:
elog(WARNING, "Failed to send to node %s:%s in %s state: %s",
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk->state),
walprop_error_message(sk->conn));
ShutdownConnection(sk);
@@ -2446,7 +2756,7 @@ AsyncFlush(Safekeeper *sk)
/* Nothing to do; try again when the socket's ready */
return false;
case -1:
elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk->state),
walprop_error_message(sk->conn));
ResetConnection(sk);
@@ -2474,7 +2784,7 @@ backpressure_lag_impl(void)
replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
#define MB ((XLogRecPtr)1024 * 1024)
elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X",
walprop_log(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X",
LSN_FORMAT_ARGS(myFlushLsn),
LSN_FORMAT_ARGS(writePtr),
LSN_FORMAT_ARGS(flushPtr),
@@ -2522,7 +2832,7 @@ backpressure_throttling_impl(void)
/* Suspend writers until replicas catch up */
set_ps_display("backpressure throttling");
elog(DEBUG2, "backpressure throttling: lag %lu", lag);
walprop_log(DEBUG2, "backpressure throttling: lag %lu", lag);
start = GetCurrentTimestamp();
pg_usleep(BACK_PRESSURE_DELAY);
stop = GetCurrentTimestamp();

View File

@@ -10,6 +10,37 @@
#include "utils/uuid.h"
#include "replication/walreceiver.h"
#define WALPROPOSER_TAG "[WALPROPOSER] "
#ifdef SIMLIB
#define walprop_log(tag, fmt, ...) do { \
MyProcPid = sim_now(); \
ereport((tag > WARNING ? WARNING : tag), \
(errmsg(fmt, ##__VA_ARGS__), \
errhidestmt(true), errhidecontext(true), internalerrposition(0))); \
if (tag > WARNING) \
sim_exit(tag, "walprop_log error macros"); \
} while (0)
#define exit(code) sim_exit(code, "exit()")
#define sim_log(fmt, ...) do { \
char buf[1024]; \
snprintf(buf, sizeof(buf), fmt, ##__VA_ARGS__); \
sim_log_event(buf); \
} while (0)
#else
#define walprop_log(tag, fmt, ...) ereport(tag, \
(errmsg(WALPROPOSER_TAG fmt, ##__VA_ARGS__), \
errhidestmt(true), errhidecontext(true), internalerrposition(0)))
#endif
#ifdef SIMLIB
extern uint64 sim_redo_start_lsn;
#define GetRedoStartLsn() sim_redo_start_lsn
extern XLogRecPtr sim_latest_available_lsn;
#endif
#define SK_MAGIC 0xCafeCeefu
#define SK_PROTOCOL_VERSION 2
@@ -28,6 +59,8 @@
*/
#define WL_NO_EVENTS 0
extern bool syncSafekeepers;
extern char *wal_acceptors_list;
extern int wal_acceptor_reconnect_timeout;
extern int wal_acceptor_connection_timeout;
@@ -374,8 +407,11 @@ typedef struct Safekeeper
XLogRecPtr streamingAt; /* current streaming position */
AppendRequestHeader appendRequest; /* request for sending to safekeeper */
#ifndef SIMLIB
int eventPos; /* position in wait event set. Equal to -1 if*
* no event */
#endif
SafekeeperState state; /* safekeeper state machine state */
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
AcceptorGreeting greetResponse; /* acceptor greeting */

View File

@@ -121,11 +121,11 @@ CompareLsn(const void *a, const void *b)
*
* The strings are intended to be used as a prefix to "state", e.g.:
*
* elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
* walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
*
* If this sort of phrasing doesn't fit the message, instead use something like:
*
* elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
* walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
*/
char *
FormatSafekeeperState(SafekeeperState state)
@@ -192,10 +192,10 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
if (!events_ok_for_state)
{
/*
* To give a descriptive message in the case of failure, we use elog
* To give a descriptive message in the case of failure, we use walprop_log
* and then an assertion that's guaranteed to fail.
*/
elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state));
Assert(events_ok_for_state);
}
@@ -298,7 +298,7 @@ FormatEvents(uint32 events)
if (events & (~all_flags))
{
elog(WARNING, "Event formatting found unexpected component %d",
walprop_log(WARNING, "Event formatting found unexpected component %d",
events & (~all_flags));
return_str[6] = '*';
return_str[7] = '\0';
@@ -486,9 +486,9 @@ XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr)
void
XLogWalPropClose(XLogRecPtr recptr)
{
Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size));
// Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size));
if (close(walpropFile) != 0)
if (walpropFile >= 0 && close(walpropFile) != 0)
{
char xlogfname[MAXFNAMELEN];
@@ -505,6 +505,8 @@ XLogWalPropClose(XLogRecPtr recptr)
/* START of cloned functions from walsender.c */
void sim_start_replication(XLogRecPtr startpoint);
/*
* Handle START_REPLICATION command.
*
@@ -517,6 +519,11 @@ StartProposerReplication(StartReplicationCmd *cmd)
XLogRecPtr FlushPtr;
TimeLineID currTLI;
#ifdef SIMLIB
sim_start_replication(cmd->startpoint);
return;
#endif
#if PG_VERSION_NUM < 150000
if (ThisTimeLineID == 0)
ereport(ERROR,
@@ -1111,7 +1118,7 @@ XLogSendPhysical(void)
WalSndCaughtUp = true;
elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
walprop_log(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
LSN_FORMAT_ARGS(sendTimeLineValidUpto),
LSN_FORMAT_ARGS(sentPtr));
return;

View File

@@ -42,8 +42,11 @@ remote_storage.workspace = true
safekeeper_api.workspace = true
storage_broker.workspace = true
utils.workspace = true
scopeguard.workspace = true
workspace_hack.workspace = true
crossbeam = "0.8.2"
rand.workspace = true
[dev-dependencies]
tempfile.workspace = true

View File

@@ -19,6 +19,8 @@ pub mod receive_wal;
pub mod remove_wal;
pub mod safekeeper;
pub mod send_wal;
pub mod simlib;
pub mod simtest;
pub mod timeline;
pub mod wal_backup;
pub mod wal_service;
@@ -75,8 +77,7 @@ impl SafeKeeperConf {
}
impl SafeKeeperConf {
#[cfg(test)]
fn dummy() -> Self {
pub fn dummy() -> Self {
SafeKeeperConf {
workdir: PathBuf::from("./"),
no_sync: false,

View File

@@ -650,7 +650,7 @@ where
self.state.persist(&state)?;
}
info!(
debug!(
"processed greeting from walproposer {}, sending term {:?}",
msg.proposer_id.map(|b| format!("{:X}", b)).join(""),
self.state.acceptor_state.term
@@ -695,7 +695,7 @@ where
resp.term = self.state.acceptor_state.term;
resp.vote_given = true as u64;
}
info!("processed VoteRequest for term {}: {:?}", msg.term, &resp);
debug!("processed VoteRequest for term {}: {:?}", msg.term, &resp);
Ok(Some(AcceptorProposerMessage::VoteResponse(resp)))
}
@@ -714,7 +714,7 @@ where
}
fn handle_elected(&mut self, msg: &ProposerElected) -> Result<Option<AcceptorProposerMessage>> {
info!("received ProposerElected {:?}", msg);
debug!("received ProposerElected {:?}", msg);
if self.state.acceptor_state.term < msg.term {
let mut state = self.state.clone();
state.acceptor_state.term = msg.term;
@@ -760,14 +760,14 @@ where
if state.timeline_start_lsn == Lsn(0) {
// Remember point where WAL begins globally.
state.timeline_start_lsn = msg.timeline_start_lsn;
info!(
debug!(
"setting timeline_start_lsn to {:?}",
state.timeline_start_lsn
);
}
if state.local_start_lsn == Lsn(0) {
state.local_start_lsn = msg.start_streaming_at;
info!("setting local_start_lsn to {:?}", state.local_start_lsn);
debug!("setting local_start_lsn to {:?}", state.local_start_lsn);
}
// Initializing commit_lsn before acking first flushed record is
// important to let find_end_of_wal skip the hole in the beginning
@@ -789,7 +789,7 @@ where
self.persist_control_file(state)?;
}
info!("start receiving WAL since {:?}", msg.start_streaming_at);
debug!("start receiving WAL since {:?}", msg.start_streaming_at);
Ok(None)
}

View File

@@ -0,0 +1,78 @@
use std::{collections::VecDeque, sync::Arc};
use super::sync::{Condvar, Mutex, Park};
/// FIFO channel with blocking send and receive. Can be cloned and shared between threads.
#[derive(Clone)]
pub struct Chan<T: Clone> {
    // Shared by all clones of this channel handle.
    shared: Arc<ChanState<T>>,
}

/// State shared by all handles: the message queue plus a condvar used to
/// wake receivers blocked in `recv()`/`peek()`.
struct ChanState<T> {
    queue: Mutex<VecDeque<T>>,
    condvar: Condvar,
}
impl<T: Clone> Chan<T> {
    /// Create a new, empty channel.
    pub fn new() -> Chan<T> {
        let state = ChanState {
            queue: Mutex::new(VecDeque::new()),
            condvar: Condvar::new(),
        };
        Chan {
            shared: Arc::new(state),
        }
    }

    /// Append a message to the end of the queue and wake one waiter.
    /// Can be called from any thread.
    pub fn send(&self, t: T) {
        {
            let mut guard = self.shared.queue.lock();
            guard.push_back(t);
        }
        self.shared.condvar.notify_one();
    }

    /// Pop a message from the front of the queue, blocking while it is empty.
    /// Can be called only from the node thread.
    pub fn recv(&self) -> T {
        // Interrupt the receiver so a single thread can't drain everything at once.
        Park::yield_thread();
        let mut guard = self.shared.queue.lock();
        loop {
            match guard.pop_front() {
                Some(msg) => return msg,
                None => self.shared.condvar.wait(&mut guard),
            }
        }
    }

    /// Same as `recv`, but leaves the message in the queue (returns a clone).
    pub fn peek(&self) -> T {
        // Interrupt the receiver so a single thread can't drain everything at once.
        Park::yield_thread();
        let mut guard = self.shared.queue.lock();
        loop {
            match guard.front().cloned() {
                Some(msg) => return msg,
                None => self.shared.condvar.wait(&mut guard),
            }
        }
    }

    /// Non-blocking `recv`: `None` when the queue is empty.
    pub fn try_recv(&self) -> Option<T> {
        self.shared.queue.lock().pop_front()
    }

    /// Non-blocking `peek`: clone of the front message, `None` when empty.
    pub fn try_peek(&self) -> Option<T> {
        self.shared.queue.lock().front().cloned()
    }

    /// Discard every queued message.
    pub fn clear(&self) {
        self.shared.queue.lock().clear();
    }
}

View File

@@ -0,0 +1,8 @@
// Deterministic discrete-event simulation library.
pub mod chan; // blocking FIFO channel shared between simulated threads
pub mod network; // virtual TCP connections with configurable delays/drops
pub mod node_os; // per-node OS facade (open_tcp, network_epoll, ...)
pub mod proto; // message types exchanged between nodes (AnyMessage, ...)
pub mod sync; // simulation-aware Mutex/Condvar/Park primitives
pub mod time; // event scheduling (NetworkEvent lives here)
pub mod wait_group; // presumably WaitGroup-style synchronization -- TODO confirm
pub mod world; // the simulated world: nodes, scheduler, clock, RNG

View File

@@ -0,0 +1,413 @@
use std::{
collections::VecDeque,
fmt::{self, Debug},
ops::DerefMut,
sync::Arc,
};
use rand::{rngs::StdRng, Rng};
use tracing::debug;
use super::{
chan::Chan,
proto::AnyMessage,
sync::Mutex,
time::NetworkEvent,
world::{Node, NodeEvent, World},
};
#[derive(Clone, Debug)]
pub struct Delay {
    pub min: u64, // minimum delivery delay, ms
    pub max: u64, // maximum delivery delay, ms (inclusive bound)
    pub fail_prob: f64, // probability of dropping a message, in [0; 1]
}
impl Delay {
/// No delay, no failures.
pub fn empty() -> Delay {
Delay {
min: 0,
max: 0,
fail_prob: 0.0,
}
}
/// Fixed delay.
pub fn fixed(ms: u64) -> Delay {
Delay {
min: ms,
max: ms,
fail_prob: 0.0,
}
}
/// Generate a random delay in range [min, max]. Return None if the
/// message should be dropped.
pub fn delay(&self, rng: &mut StdRng) -> Option<u64> {
if rng.gen_bool(self.fail_prob) {
return None;
}
Some(rng.gen_range(self.min..=self.max))
}
}
#[derive(Clone, Debug)]
pub struct NetworkOptions {
    /// Connection will be automatically closed after this many virtual-time
    /// units without a delivered message; `None` disables the keepalive.
    pub keepalive_timeout: Option<u64>,
    /// Delay/drop distribution applied to the initial connect handshake.
    pub connect_delay: Delay,
    /// Delay/drop distribution applied to every subsequent message.
    pub send_delay: Delay,
}
// Direction index = index of the SENDING node:
// 0 - from node(0) to node(1)
// 1 - from node(1) to node(0)
type MessageDirection = u8;

/// Virtual connection between two nodes.
/// Node 0 is the creator of the connection (client),
/// and node 1 is the acceptor (server).
pub struct VirtualConnection {
    /// Connection id, used for logging and debugging and C API.
    pub connection_id: u64,
    pub world: Arc<World>,
    /// `[client, server]` endpoints.
    pub nodes: [Arc<Node>; 2],
    /// Event sinks indexed by node (0 = client, 1 = server); messages sent
    /// in direction `d` are delivered to `dst_sockets[d ^ 1]`.
    dst_sockets: [Chan<NodeEvent>; 2],
    /// Mutable connection state (in-flight buffers + RNG), see below.
    state: Mutex<ConnectionState>,
    options: Arc<NetworkOptions>,
}

/// Mutable half of a connection, guarded by `VirtualConnection::state`.
struct ConnectionState {
    /// `buffers[d]` holds in-flight messages travelling in direction `d`.
    buffers: [NetworkBuffer; 2],
    rng: StdRng,
}
impl VirtualConnection {
    /// Create a connection between `src` (client) and `dst` (server),
    /// register it in the world, arm the keepalive timer, and queue the
    /// handshake message towards the server.
    pub fn new(
        id: u64,
        world: Arc<World>,
        src_sink: Chan<NodeEvent>,
        dst_sink: Chan<NodeEvent>,
        src: Arc<Node>,
        dst: Arc<Node>,
        options: Arc<NetworkOptions>,
    ) -> Arc<Self> {
        let now = world.now();
        let rng = world.new_rng();
        let conn = Arc::new(Self {
            connection_id: id,
            world,
            dst_sockets: [src_sink, dst_sink],
            nodes: [src, dst],
            state: Mutex::new(ConnectionState {
                // Client->server buffer starts with last_recv = None (the
                // server has not seen the Accept yet, see NetworkBuffer);
                // server->client starts "received" at the current time.
                buffers: [NetworkBuffer::new(None), NetworkBuffer::new(Some(now))],
                rng,
            }),
            options,
        });
        conn.world.add_conn(conn.clone());
        conn.schedule_timeout();
        conn.send_connect();
        // TODO: add connection to the dst node
        // conn.dst_sockets[1].send(NodeEvent::Connection(conn.clone()));
        conn
    }

    /// Notify the future about the possible timeout.
    fn schedule_timeout(self: &Arc<Self>) {
        if let Some(timeout) = self.options.keepalive_timeout {
            self.world.schedule(timeout, self.as_event());
        }
    }

    /// Transmit some of the messages from the buffer to the nodes.
    /// Called by the scheduler when this connection's event fires.
    pub fn process(self: &Arc<Self>) {
        let now = self.world.now();
        let mut state = self.state.lock();
        for direction in 0..2 {
            // Direction d is delivered to the sink of node d ^ 1.
            self.process_direction(
                state.deref_mut(),
                now,
                direction as MessageDirection,
                &self.dst_sockets[direction ^ 1],
            );
        }
        // Close the one side of the connection by timeout if the node
        // has not received any messages for a long time.
        if let Some(timeout) = self.options.keepalive_timeout {
            let mut to_close = [false, false];
            for direction in 0..2 {
                let node_idx = direction ^ 1;
                let node = &self.nodes[node_idx];
                let buffer = &mut state.buffers[direction];
                if buffer.recv_closed {
                    continue;
                }
                if let Some(last_recv) = buffer.last_recv {
                    if now - last_recv >= timeout {
                        debug!(
                            "NET: connection {} timed out at node {}",
                            self.connection_id, node.id
                        );
                        to_close[node_idx] = true;
                    }
                }
            }
            // Release the lock first: close() re-acquires it.
            drop(state);
            for node_idx in 0..2 {
                if to_close[node_idx] {
                    self.close(node_idx);
                }
            }
        }
    }

    /// Process messages in the buffer in the given direction.
    /// Delivers every queued message whose scheduled delivery time has come.
    fn process_direction(
        self: &Arc<Self>,
        state: &mut ConnectionState,
        now: u64,
        direction: MessageDirection,
        to_socket: &Chan<NodeEvent>,
    ) {
        let buffer = &mut state.buffers[direction as usize];
        if buffer.recv_closed {
            // close() drains the buffer, so a closed receiver has nothing queued.
            assert!(buffer.buf.is_empty());
        }
        while !buffer.buf.is_empty() && buffer.buf.front().unwrap().0 <= now {
            let msg = buffer.buf.pop_front().unwrap().1;
            // Reply handle for the receiver: it sends in the opposite direction.
            let callback = TCP::new(self.clone(), direction ^ 1);
            // debug!(
            //     "NET: {:?} delivered, {}=>{}",
            //     msg, from_node.id, to_node.id
            // );
            buffer.last_recv = Some(now);
            self.schedule_timeout();
            if let AnyMessage::InternalConnect = msg {
                // The handshake surfaces as an Accept event on the server.
                to_socket.send(NodeEvent::Accept(callback));
            } else {
                to_socket.send(NodeEvent::Message((msg, callback)));
            }
        }
    }

    /// Queue a message into the buffer for the given direction, to be
    /// delivered after a randomized delay (or dropped, breaking the pipe).
    pub fn send(self: &Arc<Self>, direction: MessageDirection, msg: AnyMessage) {
        let now = self.world.now();
        let mut state = self.state.lock();
        // NOTE: the RNG is consumed before the closed-checks below, so the
        // random sequence advances even for messages that get dropped.
        let (delay, close) = if let Some(ms) = self.options.send_delay.delay(&mut state.rng) {
            (ms, false)
        } else {
            (0, true)
        };
        let buffer = &mut state.buffers[direction as usize];
        if buffer.send_closed {
            debug!(
                "NET: TCP #{} dropped message {:?} (broken pipe)",
                self.connection_id, msg
            );
            return;
        }
        if close {
            // Random drop: this direction stays broken for all later sends.
            debug!(
                "NET: TCP #{} dropped message {:?} (pipe just broke)",
                self.connection_id, msg
            );
            buffer.send_closed = true;
            return;
        }
        if buffer.recv_closed {
            debug!(
                "NET: TCP #{} dropped message {:?} (recv closed)",
                self.connection_id, msg
            );
            return;
        }
        // Send a message into the future.
        buffer.buf.push_back((now + delay, msg));
        self.world.schedule(delay, self.as_event());
    }

    /// Send the handshake (Accept) to the server.
    fn send_connect(self: &Arc<Self>) {
        let now = self.world.now();
        let mut state = self.state.lock();
        let delay = self.options.connect_delay.delay(&mut state.rng);
        // The handshake always travels client->server (direction 0) and must
        // be the very first thing in that buffer.
        let buffer = &mut state.buffers[0];
        assert!(buffer.buf.is_empty());
        assert!(!buffer.recv_closed);
        assert!(!buffer.send_closed);
        assert!(buffer.last_recv.is_none());
        let delay = if let Some(ms) = delay {
            ms
        } else {
            // Connect dropped: the client->server direction is dead from birth.
            debug!("NET: TCP #{} dropped connect", self.connection_id);
            buffer.send_closed = true;
            return;
        };
        // Send a message into the future.
        buffer
            .buf
            .push_back((now + delay, AnyMessage::InternalConnect));
        self.world.schedule(delay, self.as_event());
    }

    /// Blocking receive used by nopoll sockets.
    fn internal_recv(self: &Arc<Self>, node_idx: usize) -> NodeEvent {
        // Only src node can receive messages.
        assert!(node_idx == 0);
        return self.dst_sockets[node_idx].recv();
    }

    /// Close the connection. Only one side of the connection will be closed,
    /// and no further messages will be delivered. The other side will not be notified.
    pub fn close(self: &Arc<Self>, node_idx: usize) {
        let node = &self.nodes[node_idx];
        let mut state = self.state.lock();
        // Messages travelling TOWARDS node_idx go in direction 1 ^ node_idx.
        let recv_buffer = &mut state.buffers[1 ^ node_idx];
        if recv_buffer.recv_closed {
            debug!(
                "NET: TCP #{} closed twice at node {}",
                self.connection_id, node.id
            );
            return;
        }
        debug!(
            "NET: TCP #{} closed at node {}",
            self.connection_id, node.id
        );
        recv_buffer.recv_closed = true;
        // Anything still in flight towards this node is discarded.
        for msg in recv_buffer.buf.drain(..) {
            debug!(
                "NET: TCP #{} dropped message {:?} (closed)",
                self.connection_id, msg
            );
        }
        let send_buffer = &mut state.buffers[node_idx];
        send_buffer.send_closed = true;
        // Release the lock before pushing into the node's event queue.
        drop(state);
        // TODO: notify the other side?
        self.dst_sockets[node_idx].send(NodeEvent::Closed(TCP::new(self.clone(), node_idx as u8)));
    }

    /// Get an event suitable for scheduling.
    fn as_event(self: &Arc<Self>) -> Box<NetworkEvent> {
        Box::new(NetworkEvent(self.clone()))
    }

    /// Drop all queued events on both sinks — presumably to break reference
    /// cycles through queued TCP callbacks; TODO confirm against callers.
    pub fn deallocate(&self) {
        self.dst_sockets[0].clear();
        self.dst_sockets[1].clear();
    }
}
/// One direction's in-flight message queue and its open/closed flags.
struct NetworkBuffer {
    /// Messages paired with their (virtual) time of delivery.
    buf: VecDeque<(u64, AnyMessage)>,
    /// True if the connection is closed on the receiving side,
    /// i.e. no more messages from the buffer will be delivered.
    recv_closed: bool,
    /// True if the connection is closed on the sending side,
    /// i.e. no more messages will be added to the buffer.
    send_closed: bool,
    /// Last time a message was delivered from the buffer.
    /// If None, it means that the server is the receiver and
    /// it has not yet aware of this connection (i.e. has not
    /// received the Accept).
    last_recv: Option<u64>,
}
impl NetworkBuffer {
fn new(last_recv: Option<u64>) -> Self {
Self {
buf: VecDeque::new(),
recv_closed: false,
send_closed: false,
last_recv,
}
}
}
/// Simplistic simulation of a bidirectional network stream without reordering (TCP).
/// There are almost no errors, writes are always successful (but may end up in void).
/// Reads are implemented as a messages in a shared queue, refer to [`NodeOs::network_epoll`]
/// for details.
///
/// TCP struct is just a one side of a connection. To create a connection, use [`NodeOs::open_tcp`].
#[derive(Clone)]
pub struct TCP {
    // Shared connection state; both TCP handles of a connection point here.
    conn: Arc<VirtualConnection>,
    // Which side of the connection this handle is (0 = src->dst).
    dir: MessageDirection,
}
impl Debug for TCP {
    /// Renders as `TCP #<conn_id> {<from>=><to>}` using the node ids of the
    /// two endpoints.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let from = self.dir as usize;
        let to = 1 - from;
        write!(
            f,
            "TCP #{} {{{}=>{}}}",
            self.conn.connection_id, self.conn.nodes[from].id, self.conn.nodes[to].id
        )
    }
}
impl TCP {
    pub fn new(conn: Arc<VirtualConnection>, dir: MessageDirection) -> TCP {
        TCP { dir, conn }
    }

    /// Send a message to the other side. It's guaranteed that it will not arrive
    /// before the arrival of all messages sent earlier.
    pub fn send(&self, msg: AnyMessage) {
        self.conn.send(self.dir, msg)
    }

    /// Receive a message. Blocks until a message is available. Can be used only
    /// with sockets opened with [`NodeOs::open_tcp_nopoll`].
    pub fn recv(&self) -> NodeEvent {
        // TODO: handle closed connection
        let side = self.dir as usize;
        self.conn.internal_recv(side)
    }

    /// Signed per-handle identifier: positive for the 0-direction handle,
    /// negative for the opposite one. `+1` keeps connection 0 distinguishable.
    pub fn id(&self) -> i64 {
        let magnitude = (self.conn.connection_id + 1) as i64;
        if self.dir == 0 {
            magnitude
        } else {
            -magnitude
        }
    }

    pub fn connection_id(&self) -> u64 {
        self.conn.connection_id
    }

    /// Close this side of the connection; the peer is not notified.
    pub fn close(&self) {
        self.conn.close(self.dir as usize)
    }
}

View File

@@ -0,0 +1,171 @@
use std::sync::Arc;
use rand::Rng;
use super::{
chan::Chan,
network::TCP,
time::SendMessageEvent,
world::{Node, NodeEvent, NodeId, World},
};
/// Abstraction with all functions (aka syscalls) available to the node.
#[derive(Clone)]
pub struct NodeOs {
    // Shared world simulation this node lives in.
    world: Arc<World>,
    // This node's internal state (id, rng, network channel, result slot).
    internal: Arc<Node>,
}
impl NodeOs {
    pub fn new(world: Arc<World>, internal: Arc<Node>) -> NodeOs {
        NodeOs { world, internal }
    }

    /// Get the node id.
    pub fn id(&self) -> NodeId {
        self.internal.id
    }

    /// Current virtual (world) time in milliseconds.
    pub fn now(&self) -> u64 {
        self.world.now()
    }

    /// Returns a writable pipe. All incoming messages should be polled
    /// with [`network_epoll`]. Always successful.
    pub fn open_tcp(&self, dst: NodeId) -> TCP {
        self.world.open_tcp(&self.internal, dst)
    }

    /// Returns a readable and writable pipe. All incoming messages should
    /// be read from [`TCP`] object.
    pub fn open_tcp_nopoll(&self, dst: NodeId) -> TCP {
        self.world.open_tcp_nopoll(&self.internal, dst)
    }

    /// Returns a channel to receive timers and events from the network.
    pub fn epoll(&self) -> Chan<NodeEvent> {
        self.internal.network_chan()
    }

    /// Returns next event from the epoll channel with timeout.
    /// Returns `None` if timeout is reached.
    /// -1 wait forever.
    /// 0 - poll, return immediately.
    /// >0 - wait for timeout milliseconds.
    pub fn epoll_recv(&self, timeout: i64) -> Option<NodeEvent> {
        let epoll = self.epoll();
        // First drain any stale WakeTimeout events left over from earlier
        // calls, stopping at the first real event (or an empty queue).
        let ready_event = loop {
            let event = epoll.try_recv();
            if let Some(NodeEvent::WakeTimeout(_)) = event {
                continue;
            }
            break event;
        };
        if let Some(event) = ready_event {
            // return event if it's ready
            return Some(event);
        }
        if timeout == 0 {
            // poll, return immediately
            return None;
        }
        // or wait for timeout.
        // The random nonce ties the scheduled WakeTimeout to THIS call, so we
        // can tell our own timeout apart from ones left by earlier calls.
        let rand_nonce = self.random(u64::MAX);
        if timeout > 0 {
            self.world.schedule(
                timeout as u64,
                SendMessageEvent::new(epoll.clone(), NodeEvent::WakeTimeout(rand_nonce)),
            );
        }
        // timeout < 0: no timer was scheduled, so this loop waits forever
        // for a real event.
        loop {
            match epoll.recv() {
                NodeEvent::WakeTimeout(nonce) if nonce == rand_nonce => {
                    return None;
                }
                // A timeout scheduled by some earlier call; ignore it.
                NodeEvent::WakeTimeout(_) => {}
                event => {
                    return Some(event);
                }
            }
        }
    }

    /// Same as epoll_recv, but does not remove the event from the queue.
    pub fn epoll_peek(&self, timeout: i64) -> Option<NodeEvent> {
        let epoll = self.epoll();
        let ready_event = loop {
            let event = epoll.try_peek();
            if let Some(NodeEvent::WakeTimeout(_)) = event {
                // Consume the stale timeout we just peeked at.
                assert!(epoll.try_recv().is_some());
                continue;
            }
            break event;
        };
        if let Some(event) = ready_event {
            // return event if it's ready
            return Some(event);
        }
        if timeout == 0 {
            // poll, return immediately
            return None;
        }
        // or wait for timeout (same nonce mechanism as epoll_recv).
        let rand_nonce = self.random(u64::MAX);
        if timeout > 0 {
            self.world.schedule(
                timeout as u64,
                SendMessageEvent::new(epoll.clone(), NodeEvent::WakeTimeout(rand_nonce)),
            );
        }
        loop {
            match epoll.peek() {
                NodeEvent::WakeTimeout(nonce) if nonce == rand_nonce => {
                    // Our own timeout fired; remove it and report timeout.
                    assert!(epoll.try_recv().is_some());
                    return None;
                }
                NodeEvent::WakeTimeout(_) => {
                    // Foreign timeout; remove it and keep waiting.
                    assert!(epoll.try_recv().is_some());
                }
                event => {
                    // Real event is left in the queue (peek semantics).
                    return Some(event);
                }
            }
        }
    }

    /// Sleep for a given number of milliseconds.
    /// Currently matches the global virtual time, TODO may be good to
    /// introduce a separate clocks for each node.
    pub fn sleep(&self, ms: u64) {
        // A throwaway channel: block until the scheduler delivers to it.
        let chan: Chan<()> = Chan::new();
        self.world
            .schedule(ms, SendMessageEvent::new(chan.clone(), ()));
        chan.recv();
    }

    /// Generate a random number in range [0, max).
    /// Note: panics when max == 0 (empty range, per rand's gen_range).
    pub fn random(&self, max: u64) -> u64 {
        self.internal.rng.lock().gen_range(0..max)
    }

    /// Set the result for the current node.
    pub fn set_result(&self, code: i32, result: String) {
        *self.internal.result.lock() = (code, result);
    }

    /// Append a line to the world's internal event log (for test assertions).
    pub fn log_event(&self, data: String) {
        self.world.add_event(self.id(), data)
    }
}

View File

@@ -0,0 +1,38 @@
use std::fmt::Debug;
use bytes::Bytes;
use utils::lsn::Lsn;
/// All possible flavours of messages.
/// Grouped by the receiver node.
#[derive(Clone)]
pub enum AnyMessage {
    /// Not used, empty placeholder.
    None,
    /// Used internally for notifying node about new incoming connection.
    InternalConnect,
    /// A bare 32-bit value (used e.g. as a flush-position ack in tests).
    Just32(u32),
    /// A replication cell test payload (value + client id + sequence number).
    ReplCell(ReplCell),
    /// Raw byte payload.
    Bytes(Bytes),
    /// A log sequence number carried as a raw u64.
    LSN(u64),
}
impl Debug for AnyMessage {
    /// Human-readable rendering: bytes are hex-encoded, LSNs are formatted
    /// via [`Lsn`]'s Display.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let rendered = match self {
            AnyMessage::None => "None".to_string(),
            AnyMessage::InternalConnect => "InternalConnect".to_string(),
            AnyMessage::Just32(value) => format!("Just32({})", value),
            AnyMessage::ReplCell(cell) => format!("ReplCell({:?})", cell),
            AnyMessage::Bytes(bytes) => format!("Bytes({})", hex::encode(bytes)),
            AnyMessage::LSN(lsn) => format!("LSN({})", Lsn(*lsn)),
        };
        f.write_str(&rendered)
    }
}
/// A single replicated test record, identified by (client_id, seqno).
#[derive(Clone, Debug)]
pub struct ReplCell {
    // Payload value being replicated.
    pub value: u32,
    // Which simulated client produced this cell.
    pub client_id: u32,
    // Per-client sequence number.
    pub seqno: u32,
}

View File

@@ -0,0 +1,273 @@
use std::{backtrace::Backtrace, sync::Arc};
use tracing::debug;
use super::world::{Node, NodeId, World};
// Shared alias so the lock implementation can be swapped in one place.
pub type Mutex<T> = parking_lot::Mutex<T>;

/// More deterministic condvar. Determinism comes from the fact that
/// at all times there is at most one running thread.
pub struct Condvar {
    waiters: Mutex<CondvarState>,
}

struct CondvarState {
    // Parked threads waiting for a notification, in arrival order.
    waiters: Vec<Arc<Park>>,
}
impl Condvar {
    pub fn new() -> Condvar {
        Condvar {
            waiters: Mutex::new(CondvarState {
                waiters: Vec::new(),
            }),
        }
    }

    /// Blocks the current thread until this condition variable receives a notification.
    pub fn wait<'a, T>(&self, guard: &mut parking_lot::MutexGuard<'a, T>) {
        // can_continue=false: the thread must be explicitly woken.
        let park = Park::new(false);
        // add the waiter to the list
        self.waiters.lock().waiters.push(park.clone());
        // The caller's mutex is released for the duration of the park and
        // reacquired afterwards (parking_lot's `unlocked`).
        parking_lot::MutexGuard::unlocked(guard, || {
            // park the thread, it will be woken up by notify_one or notify_all
            park.park();
        });
    }

    /// Wakes up all blocked threads on this condvar, can be called only from the node thread.
    pub fn notify_all(&self) {
        // TODO: check that it's waked up in random order and yield to the scheduler
        // Take the whole waiter list in one go so the condvar lock is not
        // held while waking.
        let mut waiters = self.waiters.lock().waiters.drain(..).collect::<Vec<_>>();
        for waiter in waiters.drain(..) {
            // block (park) the current thread, wake the other thread
            waiter.wake();
        }
    }

    /// Wakes up one blocked thread on this condvar. Usually can be called only from the node thread,
    /// because we have a global running threads counter and we transfer it from the current thread
    /// to the woken up thread. But we have a HACK here to allow calling it from the world thread.
    pub fn notify_one(&self) {
        // TODO: wake up random thread
        let to_wake = self.waiters.lock().waiters.pop();
        if Node::is_node_thread() {
            if let Some(waiter) = to_wake {
                // block (park) the current thread, wake the other thread
                waiter.wake();
            } else {
                // block (park) the current thread just in case
                Park::yield_thread()
            }
        } else {
            // HACK: custom notify_one implementation for the world thread
            if let Some(waiter) = to_wake {
                // wake the other thread without parking (we are not a node)
                waiter.external_wake();
            }
        }
    }
}
}
/// A tool to block (park) a current thread until it will be woken up.
pub struct Park {
    lock: Mutex<ParkState>,
    // Real condvar used to block the underlying OS thread.
    cvar: parking_lot::Condvar,
}

struct ParkState {
    /// False means that thread cannot continue without external signal,
    /// i.e. waiting for some event to happen.
    can_continue: bool,
    /// False means that thread is unconditionally parked and waiting for
    /// world simulation to wake it up. True means that the parking is
    /// finished and the thread can continue.
    finished: bool,
    /// True means that the thread should wake up and panic.
    panic: bool,
    // Owning node, filled in when parking begins (for diagnostics).
    node_id: Option<NodeId>,
    // Captured at park time for debugging stuck threads.
    backtrace: Option<Backtrace>,
}
impl Park {
    pub fn new(can_continue: bool) -> Arc<Park> {
        Arc::new(Park {
            lock: Mutex::new(ParkState {
                can_continue,
                finished: false,
                panic: false,
                node_id: None,
                backtrace: None,
            }),
            cvar: parking_lot::Condvar::new(),
        })
    }

    // Record the owning node and a backtrace for diagnostics.
    fn init_state(state: &mut ParkState, node: &Arc<Node>) {
        state.node_id = Some(node.id);
        state.backtrace = Some(Backtrace::capture());
    }

    /// Should be called once by the waiting thread. Blocks the thread until wake() is called,
    /// and until the thread is woken up by the world simulation.
    pub fn park(self: &Arc<Self>) {
        let node = Node::current();
        // start blocking
        let mut state = self.lock.lock();
        Self::init_state(&mut state, &node);
        if state.can_continue {
            // unconditional parking
            parking_lot::MutexGuard::unlocked(&mut state, || {
                // first put to world parking, then decrease the running threads counter
                node.internal_parking_middle(self.clone());
            });
        } else {
            parking_lot::MutexGuard::unlocked(&mut state, || {
                // conditional parking, decrease the running threads counter without parking
                node.internal_parking_start(self.clone());
            });
            // wait for condition
            while !state.can_continue {
                self.cvar.wait(&mut state);
            }
            if state.panic {
                panic!("thread was crashed by the simulation");
            }
            // condition is met, we are now running instead of the waker thread.
            // the next thing is to park the thread in the world, then decrease
            // the running threads counter
            node.internal_parking_middle(self.clone());
        }
        self.park_wait_the_world(node, &mut state);
    }

    // Second phase of parking: block until the world scheduler picks this
    // thread to run (sets `finished`), then mark the node Running again.
    fn park_wait_the_world(&self, node: Arc<Node>, state: &mut parking_lot::MutexGuard<ParkState>) {
        // condition is met, wait for world simulation to wake us up
        while !state.finished {
            self.cvar.wait(state);
        }
        if state.panic {
            panic!("node {} was crashed by the simulation", node.id);
        }
        // We are the only running thread now, we just need to update the state,
        // and continue the execution.
        node.internal_parking_end();
    }

    /// Hacky way to register parking before the thread is actually blocked.
    fn park_ahead_now() -> Arc<Park> {
        let park = Park::new(true);
        let node = Node::current();
        Self::init_state(&mut park.lock.lock(), &node);
        node.internal_parking_ahead(park.clone());
        park
    }

    /// Will wake up the thread that is currently conditionally parked. Can be called only
    /// from the node thread, because it will block the caller thread. What it will do:
    /// 1. Park the thread that called wake() in the world
    /// 2. Wake up the waiting thread (it will also park in the world)
    /// 3. Block the thread that called wake()
    pub fn wake(&self) {
        // parking the thread that called wake()
        let self_park = Park::park_ahead_now();
        let mut state = self.lock.lock();
        if state.can_continue {
            debug!(
                "WARN wake() called on a thread that is already waked, node {:?}",
                state.node_id
            );
        } else {
            state.can_continue = true;
            // and here we park the waiting thread
            self.cvar.notify_all();
        }
        drop(state);
        // and here we block the thread that called wake() by defer
        let node = Node::current();
        let mut state = self_park.lock.lock();
        self_park.park_wait_the_world(node, &mut state);
    }

    /// Will wake up the thread that is currently conditionally parked. Can be called only
    /// from the world threads. What it will do:
    /// 1. Increase the running threads counter
    /// 2. Wake up the waiting thread (it will park itself in the world)
    pub fn external_wake(&self) {
        let world = World::current();
        let mut state = self.lock.lock();
        if state.can_continue {
            debug!(
                "WARN external_wake() called on a thread that is already waked, node {:?}",
                state.node_id
            );
            return;
        }
        // Account for the soon-to-be-running thread before signalling it.
        world.internal_parking_wake();
        state.can_continue = true;
        // and here we park the waiting thread
        self.cvar.notify_all();
        drop(state);
    }

    /// Will wake up the thread that is currently unconditionally parked.
    pub fn internal_world_wake(&self) {
        let mut state = self.lock.lock();
        if state.finished {
            debug!(
                "WARN internal_world_wake() called on a thread that is already waked, node {:?}",
                state.node_id
            );
            return;
        }
        state.finished = true;
        self.cvar.notify_all();
    }

    /// Will wake up thread to panic instantly.
    pub fn crash_panic(&self) {
        let mut state = self.lock.lock();
        // Setting all three flags lets the thread pass both wait loops and
        // hit the panic check regardless of which phase it is parked in.
        state.can_continue = true;
        state.finished = true;
        state.panic = true;
        self.cvar.notify_all();
        drop(state);
    }

    /// Print debug info about the parked thread.
    pub fn debug_print(&self) {
        // let state = self.lock.lock();
        // debug!("PARK: node {:?} wake1={} wake2={}", state.node_id, state.can_continue, state.finished);
        // debug!("DEBUG: node {:?} wake1={} wake2={}, trace={:?}", state.node_id, state.can_continue, state.finished, state.backtrace);
    }

    /// It feels that this function can cause deadlocks.
    pub fn node_id(&self) -> Option<NodeId> {
        let state = self.lock.lock();
        state.node_id
    }

    /// Yield the current thread to the world simulation.
    pub fn yield_thread() {
        let park = Park::new(true);
        park.park();
    }
}

View File

@@ -0,0 +1,155 @@
use std::{cmp::Ordering, collections::BinaryHeap, fmt::Debug, sync::Arc};
use super::{chan::Chan, network::VirtualConnection};
/// Virtual clock plus the pending-timer queue driving the simulation.
pub struct Timing {
    /// Current world's time.
    current_time: u64,
    /// Pending timers.
    timers: BinaryHeap<Pending>,
    /// Global nonce.
    // Monotonic tiebreaker, making heap order deterministic for equal times.
    nonce: u32,
}
impl Timing {
    pub fn new() -> Timing {
        Timing {
            current_time: 0,
            timers: BinaryHeap::new(),
            nonce: 0,
        }
    }

    /// Return the current world's time.
    pub fn now(&self) -> u64 {
        self.current_time
    }

    /// Pop the next pending event, first advancing the clock to its deadline
    /// when nothing is due yet. Returns None when no events are scheduled.
    pub fn step(&mut self) -> Option<Pending> {
        // `?` handles the empty-heap case.
        let deadline = self.timers.peek()?.time;
        if !self.is_event_ready() {
            // Jump virtual time forward to the earliest deadline.
            self.current_time = deadline;
            assert!(self.is_event_ready());
        }
        self.timers.pop()
    }

    /// Schedule `event` to fire `ms` virtual milliseconds from now.
    pub fn schedule_future(&mut self, ms: u64, event: Box<dyn Event + Send + Sync>) {
        // The incremented nonce keeps ordering deterministic when several
        // events share a deadline.
        self.nonce += 1;
        self.timers.push(Pending {
            time: self.current_time + ms,
            nonce: self.nonce,
            event,
        })
    }

    /// Return true if there is a ready event.
    fn is_event_ready(&self) -> bool {
        matches!(self.timers.peek(), Some(pending) if pending.time <= self.current_time)
    }

    /// Drop all pending timers (used on world teardown).
    pub fn clear(&mut self) {
        self.timers.clear();
    }
}
/// A timer queue entry: an event plus its deadline and insertion nonce.
pub struct Pending {
    // Virtual time at which the event fires.
    pub time: u64,
    // Insertion order, used as a deterministic tiebreaker.
    pub nonce: u32,
    pub event: Box<dyn Event + Send + Sync>,
}

impl Pending {
    /// Run the wrapped event.
    pub fn process(&self) {
        self.event.process();
    }
}
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
// to get that.
impl PartialOrd for Pending {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        // Delegate to Ord (the canonical impl) so partial_cmp and cmp can
        // never disagree; the reversal lives in cmp alone.
        Some(self.cmp(other))
    }
}

impl Ord for Pending {
    fn cmp(&self, other: &Self) -> Ordering {
        // Reversed: compare other against self to turn the max-heap into a
        // min-heap on (time, nonce).
        (other.time, other.nonce).cmp(&(self.time, self.nonce))
    }
}

impl PartialEq for Pending {
    fn eq(&self, other: &Self) -> bool {
        (other.time, other.nonce) == (self.time, self.nonce)
    }
}

impl Eq for Pending {}
/// A schedulable action; `process` runs when the timer queue reaches it.
pub trait Event: Debug {
    fn process(&self);
}
/// Timer event that delivers a prepared message into a channel when fired.
pub struct SendMessageEvent<T: Debug + Clone> {
    chan: Chan<T>,
    msg: T,
}

impl<T: Debug + Clone> SendMessageEvent<T> {
    /// Boxed constructor, ready to hand to the scheduler.
    pub fn new(chan: Chan<T>, msg: T) -> Box<SendMessageEvent<T>> {
        Box::new(SendMessageEvent { chan, msg })
    }
}

impl<T: Debug + Clone> Event for SendMessageEvent<T> {
    fn process(&self) {
        // Clone because the event may only borrow itself here.
        self.chan.send(self.msg.clone());
    }
}

impl<T: Debug + Clone> Debug for SendMessageEvent<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // TODO: add more context about receiver channel
        f.debug_struct("SendMessageEvent")
            .field("msg", &self.msg)
            .finish()
    }
}
/// Timer event that advances a virtual connection (message delivery, etc.).
pub struct NetworkEvent(pub Arc<VirtualConnection>);

impl Event for NetworkEvent {
    fn process(&self) {
        self.0.process();
    }
}

impl Debug for NetworkEvent {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Network")
            .field("conn", &self.0.connection_id)
            .field("node[0]", &self.0.nodes[0].id)
            .field("node[1]", &self.0.nodes[1].id)
            .finish()
    }
}
/// No-op event: useful to wake the scheduler without any side effect.
#[derive(Copy, Clone, Debug)]
pub struct EmptyEvent;

impl Event for EmptyEvent {
    fn process(&self) {}
}

View File

@@ -0,0 +1,54 @@
use std::sync::{Arc, Condvar, Mutex};
/// This is a custom waitgroup for internal use, shouldn't be used by the custom code.
#[derive(Clone)]
pub struct WaitGroup {
    inner: Arc<Inner>,
}

/// Inner state of a `WaitGroup`.
struct Inner {
    // using std convar
    cvar: Condvar,
    // Signed so the counter can legitimately go below zero (see `add`).
    count: Mutex<i32>,
}
impl Default for WaitGroup {
fn default() -> Self {
Self {
inner: Arc::new(Inner {
cvar: Condvar::new(),
count: Mutex::new(0),
}),
}
}
}
impl WaitGroup {
    pub fn new() -> Self {
        Self::default()
    }

    /// Block until the counter is zero or below. Returns immediately if it
    /// already is.
    pub fn wait(&self) {
        // Take the lock once and re-check the predicate around every wakeup.
        // (The previous version had a separate fast-path that locked, checked
        // and unlocked the same mutex before re-locking it — redundant work
        // that the while-loop below already covers.)
        let mut count = self.inner.count.lock().unwrap();
        while *count > 0 {
            // Spurious wakeups are handled by the loop condition.
            count = self.inner.cvar.wait(count).unwrap();
        }
    }

    /// Add `delta` (possibly negative) to the counter and wake all waiters
    /// when it reaches zero or below.
    pub fn add(&self, delta: i32) {
        let mut count = self.inner.count.lock().unwrap();
        *count += delta;
        if *count <= 0 {
            self.inner.cvar.notify_all();
        }
    }

    /// Decrement the counter by one.
    pub fn done(&self) {
        self.add(-1);
    }
}

View File

@@ -0,0 +1,536 @@
use rand::{rngs::StdRng, Rng, SeedableRng};
use std::{
cell::RefCell,
ops::DerefMut,
panic::AssertUnwindSafe,
sync::{atomic::AtomicU64, Arc},
};
use tracing::{debug, info, trace};
use super::{
chan::Chan,
network::{NetworkOptions, VirtualConnection, TCP},
node_os::NodeOs,
proto::AnyMessage,
sync::{Mutex, Park},
time::{Event, Timing},
wait_group::WaitGroup,
};
// Node ids are dense indices into World::nodes.
pub type NodeId = u32;

/// Full world simulation state, shared between all nodes.
pub struct World {
    nodes: Mutex<Vec<Arc<Node>>>,
    /// List of parked threads, to be woken up by the world simulation.
    unconditional_parking: Mutex<Vec<Arc<Park>>>,
    /// Counter for running threads. Generally should not be more than 1, if you want
    /// to get a deterministic simulation. 0 means that all threads are parked or finished.
    wait_group: WaitGroup,
    /// Random number generator.
    rng: Mutex<StdRng>,
    /// Timers and stuff.
    timing: Mutex<Timing>,
    /// Network connection counter.
    connection_counter: AtomicU64,
    /// Network options.
    network_options: Arc<NetworkOptions>,
    /// Optional function to initialize nodes right after thread creation.
    nodes_init: Option<Box<dyn Fn(NodeOs) + Send + Sync>>,
    /// Internal event log.
    events: Mutex<Vec<SEvent>>,
    /// Connections.
    // Kept so deallocate() can release every connection's buffers.
    connections: Mutex<Vec<Arc<VirtualConnection>>>,
}
impl World {
    pub fn new(
        seed: u64,
        network_options: Arc<NetworkOptions>,
        nodes_init: Option<Box<dyn Fn(NodeOs) + Send + Sync>>,
    ) -> World {
        World {
            nodes: Mutex::new(Vec::new()),
            unconditional_parking: Mutex::new(Vec::new()),
            wait_group: WaitGroup::new(),
            // All randomness flows from this seed, making runs reproducible.
            rng: Mutex::new(StdRng::seed_from_u64(seed)),
            timing: Mutex::new(Timing::new()),
            connection_counter: AtomicU64::new(0),
            network_options,
            nodes_init,
            events: Mutex::new(Vec::new()),
            connections: Mutex::new(Vec::new()),
        }
    }

    /// Create a new random number generator.
    // Derived from the world rng, so per-node randomness stays deterministic.
    pub fn new_rng(&self) -> StdRng {
        let mut rng = self.rng.lock();
        StdRng::from_rng(rng.deref_mut()).unwrap()
    }

    /// Create a new node.
    pub fn new_node(self: &Arc<Self>) -> Arc<Node> {
        let mut nodes = self.nodes.lock();
        // Node id is simply its index in the list.
        let id = nodes.len() as NodeId;
        let node = Arc::new(Node::new(id, self.clone(), self.new_rng()));
        nodes.push(node.clone());
        node
    }

    /// Register world for the current thread. This is required before calling
    /// step().
    pub fn register_world(self: &Arc<Self>) {
        CURRENT_WORLD.with(|world| {
            *world.borrow_mut() = Some(self.clone());
        });
    }

    /// Get an internal node state by id.
    pub fn get_node(&self, id: NodeId) -> Option<Arc<Node>> {
        let nodes = self.nodes.lock();
        let num = id as usize;
        if num < nodes.len() {
            Some(nodes[num].clone())
        } else {
            None
        }
    }

    /// Crash-stop every node (see [`Node::crash_stop`]).
    pub fn stop_all(&self) {
        // Clone the list so the nodes lock is not held during crash_stop.
        let nodes = self.nodes.lock().clone();
        for node in nodes {
            node.crash_stop();
        }
    }

    /// Returns a writable end of a TCP connection, to send src->dst messages.
    pub fn open_tcp(self: &Arc<World>, src: &Arc<Node>, dst: NodeId) -> TCP {
        // TODO: replace unwrap() with /dev/null socket.
        let dst = self.get_node(dst).unwrap();
        let id = self
            .connection_counter
            .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
        let conn = VirtualConnection::new(
            id,
            self.clone(),
            src.network_chan(),
            dst.network_chan(),
            src.clone(),
            dst,
            self.network_options.clone(),
        );
        // MessageDirection(0) is src->dst
        TCP::new(conn, 0)
    }

    /// Like [`World::open_tcp`], but the src side reads replies from a
    /// dedicated channel instead of its epoll channel.
    pub fn open_tcp_nopoll(self: &Arc<World>, src: &Arc<Node>, dst: NodeId) -> TCP {
        // TODO: replace unwrap() with /dev/null socket.
        let dst = self.get_node(dst).unwrap();
        let id = self
            .connection_counter
            .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
        let conn = VirtualConnection::new(
            id,
            self.clone(),
            Chan::new(), // creating a new channel to read from
            dst.network_chan(),
            src.clone(),
            dst,
            self.network_options.clone(),
        );
        // MessageDirection(0) is src->dst
        TCP::new(conn, 0)
    }

    /// Blocks the current thread until all nodes will park or finish.
    pub fn await_all(&self) {
        self.wait_group.wait();
    }

    /// Take a random unconditionally parked thread and return it.
    fn thread_to_unpark(&self) -> Option<Arc<Park>> {
        let mut parking = self.unconditional_parking.lock();
        if parking.is_empty() {
            // nothing to do, all threads have finished
            return None;
        }
        // Random choice (seeded rng) drives the schedule exploration.
        let chosen_one = self.rng.lock().gen_range(0..parking.len());
        let park = parking.swap_remove(chosen_one);
        drop(parking);
        Some(park)
    }

    /// Advance the simulation by one scheduling decision. Returns false when
    /// there is nothing left to run or deliver.
    pub fn step(&self) -> bool {
        self.await_all();
        // First try to wake up unconditional thread.
        let to_resume = self.thread_to_unpark();
        if let Some(park) = to_resume {
            // debug!("Waking up park at node {:?}", park.node_id());
            // Wake up the chosen thread. To do that:
            // 1. Increment the counter of running threads.
            // 2. Send a signal to continue the thread.
            self.wait_group.add(1);
            park.internal_world_wake();
            // to have a clean state after each step, wait for all threads to finish
            self.await_all();
            return true;
        }
        // Otherwise, all threads are probably waiting for some event.
        // We'll try to advance virtual time to the next available event.
        //
        // This way all code running in simulation is considered to be
        // instant in terms of "virtual time", and time is advanced only
        // when code is waiting for external events.
        let time_event = self.timing.lock().step();
        if let Some(event) = time_event {
            // debug!("Processing event: {:?}", event.event);
            event.process();
            // to have a clean state after each step, wait for all threads to finish
            self.await_all();
            return true;
        }
        false
    }

    /// Print full world state to stdout.
    pub fn debug_print_state(&self) {
        debug!(
            "World state, nodes.len()={:?}, parking.len()={:?}",
            self.nodes.lock().len(),
            self.unconditional_parking.lock().len()
        );
        for node in self.nodes.lock().iter() {
            debug!("node id={:?} status={:?}", node.id, node.status.lock());
        }
        for park in self.unconditional_parking.lock().iter() {
            park.debug_print();
        }
    }

    /// Schedule an event to be processed after `ms` milliseconds of global time.
    pub fn schedule(&self, ms: u64, e: Box<dyn Event + Send + Sync>) {
        let mut timing = self.timing.lock();
        timing.schedule_future(ms, e);
    }

    /// Get current time.
    pub fn now(&self) -> u64 {
        let timing = self.timing.lock();
        timing.now()
    }

    /// Get the current world, panics if called from outside of a world thread.
    pub fn current() -> Arc<World> {
        CURRENT_WORLD.with(|world| {
            world
                .borrow()
                .as_ref()
                .expect("World::current() called from outside of a world thread")
                .clone()
        })
    }

    // Bump the running-thread counter on behalf of a conditionally-parked
    // thread about to be woken from outside a node thread.
    pub fn internal_parking_wake(&self) {
        // waking node with condition, increase the running threads counter
        self.wait_group.add(1);
    }

    // Find and remove the (at most one) unconditionally-parked thread that
    // belongs to `node`. Panics if more than one is found.
    fn find_parked_node(&self, node: &Node) -> Option<Arc<Park>> {
        let mut parking = self.unconditional_parking.lock();
        let mut found: Option<usize> = None;
        for (i, park) in parking.iter().enumerate() {
            if park.node_id() == Some(node.id) {
                if found.is_some() {
                    panic!("found more than one parked thread for node {}", node.id);
                }
                found = Some(i);
            }
        }
        Some(parking.swap_remove(found?))
    }

    /// Append an entry to the internal event log, timestamped with now().
    pub fn add_event(&self, node: NodeId, data: String) {
        let time = self.now();
        self.events.lock().push(SEvent { time, node, data });
    }

    /// Drain and return the internal event log.
    pub fn take_events(&self) -> Vec<SEvent> {
        let mut events = self.events.lock();
        let mut res = Vec::new();
        std::mem::swap(&mut res, &mut events);
        res
    }

    /// Track a connection so deallocate() can release it later.
    pub fn add_conn(&self, conn: Arc<VirtualConnection>) {
        self.connections.lock().push(conn);
    }

    /// Tear the world down: stop nodes, clear timers and release connection
    /// and node resources so Arc cycles don't leak across simulations.
    pub fn deallocate(&self) {
        self.stop_all();
        self.timing.lock().clear();
        self.unconditional_parking.lock().clear();
        let mut connections = Vec::new();
        std::mem::swap(&mut connections, &mut self.connections.lock());
        for conn in connections {
            conn.deallocate();
            trace!("conn strong count: {}", Arc::strong_count(&conn));
        }
        let mut nodes = Vec::new();
        std::mem::swap(&mut nodes, &mut self.nodes.lock());
        let mut weak_ptrs = Vec::new();
        for node in nodes {
            node.deallocate();
            weak_ptrs.push(Arc::downgrade(&node));
        }
        // Report nodes that are still alive after we dropped our references,
        // to help spot leaks.
        for weak_ptr in weak_ptrs {
            let node = weak_ptr.upgrade();
            if node.is_none() {
                trace!("node is already deallocated");
                continue;
            }
            let node = node.unwrap();
            debug!("node strong count: {}", Arc::strong_count(&node));
        }
        self.events.lock().clear();
    }
}
thread_local! {
    /// The simulation node bound to the current OS thread (set in `launch`).
    pub static CURRENT_NODE: RefCell<Option<Arc<Node>>> = RefCell::new(None);
    /// The world registered for the current thread via `register_world`.
    pub static CURRENT_WORLD: RefCell<Option<Arc<World>>> = RefCell::new(None);
}
/// Internal node state.
pub struct Node {
    pub id: NodeId,
    // Epoll channel delivering network events/timers; replaced on crash_stop.
    network: Mutex<Chan<NodeEvent>>,
    status: Mutex<NodeStatus>,
    // The Park of this node's thread while conditionally waiting.
    waiting_park: Mutex<Arc<Park>>,
    world: Arc<World>,
    join_handle: Mutex<Option<std::thread::JoinHandle<()>>>,
    pub rng: Mutex<StdRng>,
    /// Every node can set a result string, which can be read by the test.
    pub result: Mutex<(i32, String)>,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum NodeStatus {
    // launch() has not been called yet.
    NotStarted,
    // The node's thread is executing user code.
    Running,
    // Conditionally parked (waiting on a simulated condvar).
    Waiting,
    // Unconditionally parked, waiting for the world scheduler.
    Parked,
    // The node's closure returned (or panicked and was caught).
    Finished,
    Failed,
}
impl Node {
    pub fn new(id: NodeId, world: Arc<World>, rng: StdRng) -> Node {
        Node {
            id,
            network: Mutex::new(Chan::new()),
            status: Mutex::new(NodeStatus::NotStarted),
            waiting_park: Mutex::new(Park::new(false)),
            world,
            join_handle: Mutex::new(None),
            rng: Mutex::new(rng),
            // (-1, "") means "no result set yet".
            result: Mutex::new((-1, String::new())),
        }
    }

    /// Set a code to run in this node thread.
    pub fn launch(self: &Arc<Self>, f: impl FnOnce(NodeOs) + Send + 'static) {
        let node = self.clone();
        let world = self.world.clone();
        // Account for the new thread before spawning it.
        world.wait_group.add(1);
        let join_handle = std::thread::spawn(move || {
            CURRENT_NODE.with(|current_node| {
                *current_node.borrow_mut() = Some(node.clone());
            });
            let wg = world.wait_group.clone();
            // Guarantee the counter is decremented even if f panics.
            scopeguard::defer! {
                wg.done();
            }
            let mut status = node.status.lock();
            if *status != NodeStatus::NotStarted && *status != NodeStatus::Finished {
                // clearly a caller bug, should never happen
                panic!("node {} is already running", node.id);
            }
            *status = NodeStatus::Running;
            drop(status);
            // catch_unwind keeps a panicking node from tearing down the test.
            let res = std::panic::catch_unwind(AssertUnwindSafe(|| {
                // park the current thread, [`launch`] will wait until it's parked
                Park::yield_thread();
                if let Some(nodes_init) = world.nodes_init.as_ref() {
                    nodes_init(NodeOs::new(world.clone(), node.clone()));
                }
                f(NodeOs::new(world, node.clone()));
            }));
            match res {
                Ok(_) => {
                    debug!("Node {} finished successfully", node.id);
                }
                Err(e) => {
                    debug!("Node {} finished with panic: {:?}", node.id, e);
                }
            }
            let mut status = node.status.lock();
            *status = NodeStatus::Finished;
        });
        *self.join_handle.lock() = Some(join_handle);
        // we need to wait for the thread to park, to assure that threads
        // are parked in deterministic order
        self.world.wait_group.wait();
    }

    /// Returns a channel to receive events from the network.
    pub fn network_chan(&self) -> Chan<NodeEvent> {
        self.network.lock().clone()
    }

    pub fn internal_parking_start(&self, park: Arc<Park>) {
        // Node started parking (waiting for condition), and the current thread
        // is the only one running, so we need to do:
        // 1. Change the node status to Waiting
        // 2. Decrease the running threads counter
        // 3. Block the current thread until it's woken up (outside this function)
        *self.status.lock() = NodeStatus::Waiting;
        *self.waiting_park.lock() = park;
        self.world.wait_group.done();
    }

    pub fn internal_parking_middle(&self, park: Arc<Park>) {
        // [`park`] entered the unconditional_parking state, and the current thread
        // is the only one running, so we need to do:
        // 1. Change the node status to Parked
        // 2. Park in the world list
        // 3. Decrease the running threads counter
        // 4. Block the current thread until it's woken up (outside this function)
        *self.status.lock() = NodeStatus::Parked;
        self.world.unconditional_parking.lock().push(park);
        self.world.wait_group.done();
    }

    pub fn internal_parking_ahead(&self, park: Arc<Park>) {
        // [`park`] entered the unconditional_parking state, and the current thread
        // wants to transfer control to another thread, so we need to do:
        // 1. Change the node status to Parked
        // 2. Park in the world list
        // 3. Notify the other thread to continue
        // 4. Block the current thread until it's woken up (outside this function)
        // Note: the counter is NOT decremented here — it transfers to the
        // thread being woken.
        *self.status.lock() = NodeStatus::Parked;
        self.world.unconditional_parking.lock().push(park);
    }

    pub fn internal_parking_end(&self) {
        // node finished parking, now it's running again
        *self.status.lock() = NodeStatus::Running;
    }

    /// Get the current node, panics if called from outside of a node thread.
    pub fn current() -> Arc<Node> {
        CURRENT_NODE.with(|current_node| current_node.borrow().clone().unwrap())
    }

    /// True when the calling thread belongs to a simulated node.
    pub fn is_node_thread() -> bool {
        CURRENT_NODE.with(|current_node| current_node.borrow().is_some())
    }

    pub fn is_finished(&self) -> bool {
        let status = self.status.lock();
        *status == NodeStatus::Finished
    }

    /// Kill the node's thread by waking its Park with the panic flag set.
    pub fn crash_stop(self: &Arc<Self>) {
        // Make sure no thread is running before inspecting statuses.
        self.world.await_all();
        let status = self.status.lock().clone();
        match status {
            NodeStatus::NotStarted | NodeStatus::Finished | NodeStatus::Failed => return,
            NodeStatus::Running => {
                panic!("crash unexpected node state: Running")
            }
            NodeStatus::Waiting | NodeStatus::Parked => {}
        }
        debug!("Node {} is crashing, status={:?}", self.id, status);
        // Parked threads live in the world list; Waiting threads keep their
        // Park in waiting_park.
        let park = self.world.find_parked_node(self);
        let park = if park.is_some() {
            assert!(status == NodeStatus::Parked);
            park.unwrap()
        } else {
            assert!(status == NodeStatus::Waiting);
            self.waiting_park.lock().clone()
        };
        park.debug_print();
        // self.world.debug_print_state();
        // unplug old network socket, and create a new one
        *self.network.lock() = Chan::new();
        // Account for the thread we are about to wake (it will run just long
        // enough to panic), then wait for it to die.
        self.world.wait_group.add(1);
        park.crash_panic();
        // self.world.debug_print_state();
        self.world.wait_group.wait();
    }

    /// Drop queued network events so the node's channel doesn't pin memory.
    pub fn deallocate(&self) {
        self.network.lock().clear();
    }
}
/// Network events and timers.
#[derive(Clone, Debug)]
pub enum NodeEvent {
    // A new incoming connection; carries the accepting-side TCP handle.
    Accept(TCP),
    // The peer closed its side of this connection.
    Closed(TCP),
    // A payload delivered over a connection, paired with its TCP handle.
    Message((AnyMessage, TCP)),
    // A message delivered locally, not tied to any connection.
    Internal(AnyMessage),
    // Timeout wakeup carrying the nonce of the epoll call that scheduled it.
    WakeTimeout(u64),
    // TODO: close?
}
/// One entry of the world's internal event log.
#[derive(Debug)]
pub struct SEvent {
    // Virtual time when the event was logged.
    pub time: u64,
    // Node that logged it.
    pub node: NodeId,
    pub data: String,
}

View File

@@ -0,0 +1,48 @@
use tracing::info;
use crate::simlib::{
node_os::NodeOs,
proto::{AnyMessage, ReplCell},
world::{NodeEvent, NodeId},
};
/// Copy all data from array to the remote node.
pub fn run_client(os: NodeOs, data: &[ReplCell], dst: NodeId) {
    info!("started client");
    let epoll = os.epoll();
    let mut delivered = 0;
    let mut sock = os.open_tcp(dst);
    while delivered < data.len() {
        // Send the next undelivered cell; resent on every iteration until
        // the server acknowledges it.
        let num = &data[delivered];
        info!("sending data: {:?}", num.clone());
        sock.send(AnyMessage::ReplCell(num.clone()));
        // loop {
        let event = epoll.recv();
        match event {
            // Just32 is the server's flush position; advance only when it
            // confirms this cell.
            NodeEvent::Message((AnyMessage::Just32(flush_pos), _)) => {
                if flush_pos == 1 + delivered as u32 {
                    delivered += 1;
                }
            }
            NodeEvent::Closed(_) => {
                info!("connection closed, reestablishing");
                sock = os.open_tcp(dst);
            }
            // Other events (Accept, stray timeouts) are ignored.
            _ => {}
        }
        // }
    }
    // NOTE(review): after everything is delivered, the full data set is
    // re-sent on a fresh connection — presumably to exercise server-side
    // handling of duplicates; confirm this is intentional.
    let sock = os.open_tcp(dst);
    for num in data {
        info!("sending data: {:?}", num.clone());
        sock.send(AnyMessage::ReplCell(num.clone()));
    }
    info!("sent all data and finished client");
}

View File

@@ -0,0 +1,63 @@
use std::sync::Arc;
use anyhow::Result;
use crate::simlib::sync::{Mutex, Park};
/// Append-only storage with an explicit flush position.
pub trait Storage<T> {
    /// Number of entries considered durably flushed so far.
    fn flush_pos(&self) -> u32;
    /// Makes all previously written entries durable.
    fn flush(&mut self) -> Result<()>;
    /// Appends an entry; it is not durable until `flush` succeeds.
    fn write(&mut self, t: T);
}
/// Cloneable handle to an `InMemoryStorage` shared between threads,
/// e.g. a server node and the test harness that inspects its contents.
#[derive(Clone)]
pub struct SharedStorage<T> {
    // The shared underlying storage, guarded by the simulator's mutex.
    pub state: Arc<Mutex<InMemoryStorage<T>>>,
}
impl<T> SharedStorage<T> {
    /// Creates an empty shared storage with flush position 0.
    pub fn new() -> Self {
        Self {
            state: Arc::new(Mutex::new(InMemoryStorage::new())),
        }
    }
}

// `new()` takes no arguments, so provide `Default` as well
// (clippy::new_without_default).
impl<T> Default for SharedStorage<T> {
    fn default() -> Self {
        Self::new()
    }
}
impl<T> Storage<T> for SharedStorage<T> {
    /// Reads the flush position from the shared state.
    fn flush_pos(&self) -> u32 {
        let state = self.state.lock();
        state.flush_pos
    }

    /// Hits a scheduling yield point, then flushes the shared state.
    fn flush(&mut self) -> Result<()> {
        Park::yield_thread();
        let mut state = self.state.lock();
        state.flush()
    }

    /// Hits a scheduling yield point, then appends `t` to the shared state.
    fn write(&mut self, t: T) {
        Park::yield_thread();
        let mut state = self.state.lock();
        state.write(t);
    }
}
/// Simple vector-backed storage with an explicit flush position.
pub struct InMemoryStorage<T> {
    // All written entries, flushed or not.
    pub data: Vec<T>,
    // Number of entries considered flushed (a prefix of `data`).
    pub flush_pos: u32,
}
impl<T> InMemoryStorage<T> {
    /// Creates an empty storage with nothing flushed.
    pub fn new() -> Self {
        Self {
            data: Vec::new(),
            flush_pos: 0,
        }
    }

    /// Marks everything written so far as flushed. Always succeeds for
    /// the in-memory backend.
    pub fn flush(&mut self) -> Result<()> {
        self.flush_pos = self.data.len() as u32;
        Ok(())
    }

    /// Appends a value; it does not count towards `flush_pos` until the
    /// next `flush`.
    pub fn write(&mut self, t: T) {
        self.data.push(t);
    }
}

// `new()` takes no arguments, so provide `Default` as well
// (clippy::new_without_default).
impl<T> Default for InMemoryStorage<T> {
    fn default() -> Self {
        Self::new()
    }
}

View File

@@ -0,0 +1,109 @@
mod client;
mod disk;
mod server;
use std::sync::Arc;
use crate::{
simlib::{
proto::ReplCell,
world::World, node_os::NodeOs,
},
simtest::{disk::SharedStorage, server::run_server},
};
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use super::{client::run_client, start_simulation, u32_to_cells, Options};
    use crate::simlib::{network::{Delay, NetworkOptions}, world::World};

    /// Replicates a fixed 5-element payload over an unreliable simulated
    /// network, across 20 different random seeds.
    #[test]
    fn run_pure_rust_test() {
        // Unreliable link: 1..60 tick delays with a 40% failure chance.
        let delay = Delay {
            min: 1,
            max: 60,
            fail_prob: 0.4,
        };
        let net_opts = NetworkOptions {
            keepalive_timeout: Some(50),
            connect_delay: delay.clone(),
            send_delay: delay,
        };

        for seed in 0..20 {
            let payload: [u32; 5] = [1, 2, 3, 4, 5];
            let cells = u32_to_cells(&payload, 1);
            let world = Arc::new(World::new(seed, Arc::new(net_opts.clone()), None));
            start_simulation(Options {
                world,
                time_limit: 1_000_000,
                client_fn: Box::new(move |os, server_id| run_client(os, &cells, server_id)),
                u32_data: payload,
            });
        }
    }
}
/// Parameters for a single client/server simulation run.
pub struct Options {
    // The simulated world (nodes + network) to run in.
    pub world: Arc<World>,
    // Stop stepping the world once simulated time reaches this value.
    pub time_limit: u64,
    // The payload the client replicates; compared against the server's
    // disk contents at the end of the run.
    pub u32_data: [u32; 5],
    // Client body, invoked with the client's NodeOs and the server's node id.
    pub client_fn: Box<dyn FnOnce(NodeOs, u32) + Send + 'static>,
}
/// Runs a client/server pair inside `options.world` until the simulation
/// quiesces or the time limit is reached, then asserts that the server's
/// storage matches the client's source data.
///
/// # Panics
/// Panics (via `assert!`) when the replicated data does not match.
pub fn start_simulation(options: Options) {
    let world = options.world;
    world.register_world();
    // Node creation order fixes the node ids; the deterministic simulator
    // depends on this order, so don't reorder these calls.
    let client_node = world.new_node();
    let server_node = world.new_node();
    let server_id = server_node.id;
    // start the client thread
    client_node.launch(move |os| {
        let client_fn = options.client_fn;
        client_fn(os, server_id);
    });
    // start the server thread
    let shared_storage = SharedStorage::new();
    let server_storage = shared_storage.clone();
    server_node.launch(move |os| run_server(os, Box::new(server_storage)));
    world.await_all();
    // Drive the simulation until nothing is runnable or simulated time
    // runs out.
    while world.step() && world.now() < options.time_limit {}
    // The harness keeps its own handle to the shared storage, so it can
    // inspect what the server "persisted".
    let disk_data = shared_storage.state.lock().data.clone();
    assert!(verify_data(&disk_data, &options.u32_data[..]));
}
/// Wraps each value in a `ReplCell` tagged with `client_id`, using the
/// value's index as its sequence number.
pub fn u32_to_cells(data: &[u32], client_id: u32) -> Vec<ReplCell> {
    data.iter()
        .enumerate()
        .map(|(seqno, &value)| ReplCell {
            client_id,
            seqno: seqno as u32,
            value,
        })
        .collect()
}
/// Returns true iff the server's disk contents exactly match the source
/// data: same length, same values, same order.
fn verify_data(disk_data: &[u32], data: &[u32]) -> bool {
    // Slice equality already compares length and then each element.
    disk_data == data
}

View File

@@ -0,0 +1,53 @@
use tracing::info;
use crate::simlib::{node_os::NodeOs, proto::AnyMessage, world::NodeEvent};
use super::disk::Storage;
// pub struct DiskLog {
// pub map: HashMap<String, u32>,
// }
// impl DiskLog {
// pub fn new() -> Self {
// Self {
// map: HashMap::new(),
// }
// }
// pub fn get(&self, key: &str) -> u32 {
// self.map.get(key).copied().unwrap_or(0)
// }
// pub fn set(&mut self, key: &str, value: u32) {
// self.map.insert(key.to_string(), value);
// }
// }
/// Replication server loop: accepts cells strictly in seqno order,
/// persists each one, and acknowledges with the current flush position.
/// Never returns.
pub fn run_server(os: NodeOs, mut storage: Box<dyn Storage<u32>>) {
    info!("started server");
    let epoll = os.epoll();
    loop {
        let event = epoll.recv();
        info!("got event: {:?}", event);
        match event {
            // A cell is accepted only when it is the next expected one;
            // anything else is a duplicate or a gap and gets dropped.
            NodeEvent::Message((AnyMessage::ReplCell(cell), tcp)) => {
                if cell.seqno != storage.flush_pos() {
                    info!("got out of order data: {:?}", cell);
                    continue;
                }
                storage.write(cell.value);
                storage.flush().unwrap();
                tcp.send(AnyMessage::Just32(storage.flush_pos()));
            }
            // On connect, tell the client where to resume from.
            NodeEvent::Accept(tcp) => {
                tcp.send(AnyMessage::Just32(storage.flush_pos()));
            }
            // Ignore closes, timers, and any other message kinds.
            _ => {}
        }
    }
}