Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-18 02:42:56 +00:00)

Compare commits: 50 commits, proxy-cpla...arthur/sim
| SHA1 |
|---|
| 6a00ad3aab |
| 61e6b24cb2 |
| 44c7d96ed0 |
| 10ad3ae4eb |
| eb2886b401 |
| 0dc262a84a |
| d801ba7248 |
| 1effb586ba |
| 2fd351fd63 |
| 13e94bf687 |
| 41b9750e81 |
| f8729f046d |
| 420d3bc18f |
| 33f7877d1b |
| 7de94c959a |
| 731ed3bb64 |
| 413ce2cfe8 |
| 7f36028fab |
| cb6a8d3fe3 |
| 095747afc0 |
| 89bd7ab8a3 |
| 5034a8cca0 |
| 55e40d090e |
| d87e822169 |
| 296a0cbac2 |
| aed14f52d5 |
| 909d7fadb8 |
| 3840d6b18b |
| 65f92232e6 |
| 0d4f987fc8 |
| aa0763d49d |
| 7b5123edda |
| b6a80bc269 |
| ac82b34c64 |
| a77fc2c5ff |
| 9ccbec0e14 |
| b55005d2c4 |
| 6436432a77 |
| 1b8918e665 |
| 87c9edac7c |
| 5e0550a620 |
| 06f493f525 |
| f6b540ebfe |
| 83f87af02b |
| 79823c38cd |
| 072fb3d7e9 |
| f2fb9f6be9 |
| dd4c8fb568 |
| 9116c01614 |
| 17cd96e022 |
@@ -14,3 +14,6 @@ opt-level = 1
[alias]
build_testing = ["build", "--features", "testing"]

[build]
rustflags = ["-C", "default-linker-libraries"]
2 .gitignore vendored
@@ -18,3 +18,5 @@ test_output/
*.o
*.so
*.Po

tmp
81 Cargo.lock generated
@@ -679,6 +679,25 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"

[[package]]
name = "cbindgen"
version = "0.24.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b922faaf31122819ec80c4047cc684c6979a087366c069611e33649bf98e18d"
dependencies = [
 "clap 3.2.23",
 "heck",
 "indexmap",
 "log",
 "proc-macro2",
 "quote",
 "serde",
 "serde_json",
 "syn",
 "tempfile",
 "toml",
]

[[package]]
name = "cc"
version = "1.0.79"
@@ -757,9 +776,12 @@ version = "3.2.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5"
dependencies = [
 "atty",
 "bitflags",
 "clap_lex 0.2.4",
 "indexmap",
 "strsim",
 "termcolor",
 "textwrap",
]

@@ -1014,6 +1036,20 @@ version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52"

[[package]]
name = "crossbeam"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c"
dependencies = [
 "cfg-if",
 "crossbeam-channel",
 "crossbeam-deque",
 "crossbeam-epoch",
 "crossbeam-queue",
 "crossbeam-utils",
]

[[package]]
name = "crossbeam-channel"
version = "0.5.6"
@@ -1048,6 +1084,16 @@ dependencies = [
 "scopeguard",
]

[[package]]
name = "crossbeam-queue"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add"
dependencies = [
 "cfg-if",
 "crossbeam-utils",
]

[[package]]
name = "crossbeam-utils"
version = "0.8.14"
@@ -3311,6 +3357,7 @@ dependencies = [
 "clap 4.1.4",
 "const_format",
 "crc32c",
 "crossbeam",
 "fs2",
 "git-version",
 "hex",
@@ -3324,9 +3371,11 @@ dependencies = [
 "postgres-protocol",
 "postgres_ffi",
 "pq_proto",
 "rand",
 "regex",
 "remote_storage",
 "safekeeper_api",
 "scopeguard",
 "serde",
 "serde_json",
 "serde_with",
@@ -4588,6 +4637,38 @@ dependencies = [
 "winapi-util",
]

[[package]]
name = "walproposer"
version = "0.1.0"
dependencies = [
 "anyhow",
 "atty",
 "bindgen",
 "byteorder",
 "bytes",
 "cbindgen",
 "crc32c",
 "env_logger",
 "hex",
 "hyper",
 "libc",
 "log",
 "memoffset 0.8.0",
 "once_cell",
 "postgres",
 "postgres_ffi",
 "rand",
 "regex",
 "safekeeper",
 "scopeguard",
 "serde",
 "thiserror",
 "tracing",
 "tracing-subscriber",
 "utils",
 "workspace_hack",
]

[[package]]
name = "want"
version = "0.3.0"
@@ -138,10 +138,12 @@ postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
safekeeper = { path = "./safekeeper/" }
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }
walproposer = { version = "0.1", path = "./libs/walproposer/" }

## Common library dependency
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
8 Makefile
@@ -39,6 +39,8 @@ endif
# been no changes to the files. Changing the mtime triggers an
# unnecessary rebuild of 'postgres_ffi'.
PG_CONFIGURE_OPTS += INSTALL='$(ROOT_PROJECT_DIR)/scripts/ninstall.sh -C'
PG_CONFIGURE_OPTS += CC=clang
PG_CONFIGURE_OPTS += CCX=clang++

# Choose whether we should be silent or verbose
CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
@@ -134,6 +136,12 @@ neon-pg-ext-%: postgres-%
		-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install

.PHONY:
neon-pg-ext-walproposer:
	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
		-C $(POSTGRES_INSTALL_DIR)/build/neon-v15 \
		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install

.PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%:
	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
4 libs/walproposer/.gitignore vendored Normal file
@@ -0,0 +1,4 @@
*.a
*.o
*.tmp
pgdata
39 libs/walproposer/Cargo.toml Normal file
@@ -0,0 +1,39 @@
[package]
name = "walproposer"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[dependencies]
atty.workspace = true
rand.workspace = true
regex.workspace = true
bytes.workspace = true
byteorder.workspace = true
anyhow.workspace = true
crc32c.workspace = true
hex.workspace = true
once_cell.workspace = true
log.workspace = true
libc.workspace = true
memoffset.workspace = true
thiserror.workspace = true
tracing.workspace = true
tracing-subscriber = { workspace = true, features = ["json"] }
serde.workspace = true
scopeguard.workspace = true
utils.workspace = true
safekeeper.workspace = true
postgres_ffi.workspace = true
hyper.workspace = true

workspace_hack.workspace = true

[dev-dependencies]
env_logger.workspace = true
postgres.workspace = true

[build-dependencies]
anyhow.workspace = true
bindgen.workspace = true
cbindgen = "0.24.0"
16 libs/walproposer/README.md Normal file
@@ -0,0 +1,16 @@
# walproposer Rust module

## Rust -> C

We compile walproposer as a static library and generate Rust bindings for it using `bindgen`.
The entry point header file is `bindgen_deps.h`.

## C -> Rust

We use `cbindgen` to generate C bindings for the Rust code. They are stored in `rust_bindings.h`.

## How to run the tests

```
export RUSTFLAGS="-C default-linker-libraries"
```
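As a rough illustration of how the two directions above fit together (this sketch is not part of the diff; the helper names `add_via_c` and `rust_hello` are made up, `TestFunc` is the test helper declared in `bindgen_deps.h`, and the generated signature is assumed rather than taken from the generated file):

```rust
// Sketch only. Rust -> C: the bindgen output is pulled in via include! and
// C symbols are called through it. C -> Rust: any #[no_mangle] extern "C"
// item is what cbindgen exports into rust_bindings.h.
use std::os::raw::c_int;

pub mod bindings {
    // build.rs writes the bindgen output into $OUT_DIR/bindings.rs.
    include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
}

/// Rust -> C: call the C test helper through the generated bindings.
pub fn add_via_c(a: c_int, b: c_int) -> c_int {
    // Safety: TestFunc only reads its two integer arguments.
    unsafe { bindings::TestFunc(a, b) }
}

/// C -> Rust: cbindgen emits a prototype for this into rust_bindings.h.
#[no_mangle]
pub extern "C" fn rust_hello(a: u32) {
    println!("hello from Rust, a = {a}");
}
```

Linking a call like this requires the static libraries assembled by `build.sh` and the linker flags emitted by `build.rs` further down in this diff.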
30 libs/walproposer/bindgen_deps.h Normal file
@@ -0,0 +1,30 @@
/*
 * This header file is the input to bindgen. It includes all the
 * PostgreSQL headers that we need to auto-generate Rust structs
 * from. If you need to expose a new struct to Rust code, add the
 * header here, and whitelist the struct in the build.rs file.
 */
#include "c.h"
#include "walproposer.h"

#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

// Calc a sum of two numbers. Used to test Rust->C function calls.
int TestFunc(int a, int b);

// Run a client for simple simlib test.
void RunClientC(uint32_t serverId);

void WalProposerRust();

void WalProposerCleanup();

extern bool debug_enabled;

// Initialize global variables before calling any Postgres C code.
void MyContextInit();

XLogRecPtr MyInsertRecord();
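For readers unfamiliar with `bindgen`, the declarations above come out on the Rust side roughly as the `extern "C"` block below. This is an illustration only: the real declarations are generated into `$OUT_DIR/bindings.rs` by `build.rs` (shown next), and the exact types there are authoritative.

```rust
// Approximate shape of the bindgen output for bindgen_deps.h (assumed).
use std::os::raw::c_int;

extern "C" {
    // int TestFunc(int a, int b);
    pub fn TestFunc(a: c_int, b: c_int) -> c_int;
    // void RunClientC(uint32_t serverId);
    pub fn RunClientC(serverId: u32);
    pub fn WalProposerRust();
    pub fn WalProposerCleanup();
    pub fn MyContextInit();
    // XLogRecPtr is a 64-bit WAL position in PostgreSQL.
    pub fn MyInsertRecord() -> u64;
    // extern bool debug_enabled;
    pub static mut debug_enabled: bool;
}
```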
137 libs/walproposer/build.rs Normal file
@@ -0,0 +1,137 @@
use std::{env, path::PathBuf, process::Command};
use anyhow::{anyhow, Context};
use bindgen::CargoCallbacks;

extern crate bindgen;

fn main() -> anyhow::Result<()> {
    let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();

    cbindgen::Builder::new()
        .with_crate(crate_dir)
        .with_language(cbindgen::Language::C)
        .generate()
        .expect("Unable to generate bindings")
        .write_to_file("rust_bindings.h");

    // Tell cargo to invalidate the built crate whenever the wrapper changes
    println!("cargo:rerun-if-changed=bindgen_deps.h,test.c,../../pgxn/neon/walproposer.c,build.sh");
    println!("cargo:rustc-link-arg=-Wl,--start-group");
    println!("cargo:rustc-link-arg=-lsim");
    println!("cargo:rustc-link-arg=-lpgport_srv");
    println!("cargo:rustc-link-arg=-lpostgres");
    println!("cargo:rustc-link-arg=-lpgcommon_srv");
    println!("cargo:rustc-link-arg=-lssl");
    println!("cargo:rustc-link-arg=-lcrypto");
    println!("cargo:rustc-link-arg=-lz");
    println!("cargo:rustc-link-arg=-lpthread");
    println!("cargo:rustc-link-arg=-lrt");
    println!("cargo:rustc-link-arg=-ldl");
    println!("cargo:rustc-link-arg=-lm");
    println!("cargo:rustc-link-arg=-lwalproposer");
    println!("cargo:rustc-link-arg=-Wl,--end-group");
    println!("cargo:rustc-link-search=/home/admin/simulator/libs/walproposer");
    // disable fPIE
    println!("cargo:rustc-link-arg=-no-pie");

    // print output of build.sh
    let output = std::process::Command::new("./build.sh")
        .output()
        .expect("could not spawn `clang`");

    println!("stdout: {}", String::from_utf8(output.stdout).unwrap());
    println!("stderr: {}", String::from_utf8(output.stderr).unwrap());

    if !output.status.success() {
        // Panic if the command was not successful.
        panic!("could not compile object file");
    }

    // // Finding the location of C headers for the Postgres server:
    // // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/pg_install`
    // // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/pg_install/{PG_MAJORVERSION}/include/postgresql/server`
    let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
        postgres_install_dir.into()
    } else {
        PathBuf::from("pg_install")
    };

    let pg_version = "v15";
    let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
    if pg_install_dir_versioned.is_relative() {
        let cwd = env::current_dir().context("Failed to get current_dir")?;
        pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned);
    }

    let pg_config_bin = pg_install_dir_versioned
        .join(pg_version)
        .join("bin")
        .join("pg_config");
    let inc_server_path: String = if pg_config_bin.exists() {
        let output = Command::new(pg_config_bin)
            .arg("--includedir-server")
            .output()
            .context("failed to execute `pg_config --includedir-server`")?;

        if !output.status.success() {
            panic!("`pg_config --includedir-server` failed")
        }

        String::from_utf8(output.stdout)
            .context("pg_config output is not UTF-8")?
            .trim_end()
            .into()
    } else {
        let server_path = pg_install_dir_versioned
            .join("include")
            .join("postgresql")
            .join("server")
            .into_os_string();
        server_path
            .into_string()
            .map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
    };

    let inc_pgxn_path = "/home/admin/simulator/pgxn/neon";

    // The bindgen::Builder is the main entry point
    // to bindgen, and lets you build up options for
    // the resulting bindings.
    let bindings = bindgen::Builder::default()
        // The input header we would like to generate
        // bindings for.
        .header("bindgen_deps.h")
        // Tell cargo to invalidate the built crate whenever any of the
        // included header files changed.
        .parse_callbacks(Box::new(CargoCallbacks))
        .allowlist_function("TestFunc")
        .allowlist_function("RunClientC")
        .allowlist_function("WalProposerRust")
        .allowlist_function("MyContextInit")
        .allowlist_function("WalProposerCleanup")
        .allowlist_function("MyInsertRecord")
        .allowlist_var("wal_acceptors_list")
        .allowlist_var("wal_acceptor_reconnect_timeout")
        .allowlist_var("wal_acceptor_connection_timeout")
        .allowlist_var("am_wal_proposer")
        .allowlist_var("neon_timeline_walproposer")
        .allowlist_var("neon_tenant_walproposer")
        .allowlist_var("syncSafekeepers")
        .allowlist_var("sim_redo_start_lsn")
        .allowlist_var("debug_enabled")
        .clang_arg(format!("-I{inc_server_path}"))
        .clang_arg(format!("-I{inc_pgxn_path}"))
        .clang_arg(format!("-DSIMLIB"))
        // Finish the builder and generate the bindings.
        .generate()
        // Unwrap the Result and panic on failure.
        .expect("Unable to generate bindings");

    // Write the bindings to the $OUT_DIR/bindings.rs file.
    let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs");
    bindings
        .write_to_file(out_path)
        .expect("Couldn't write bindings!");

    Ok(())
}
21 libs/walproposer/build.sh Executable file
@@ -0,0 +1,21 @@
#!/bin/bash
set -e

cd /home/admin/simulator/libs/walproposer

# TODO: rewrite to Makefile

make -C ../.. neon-pg-ext-walproposer
make -C ../../pg_install/build/v15/src/backend postgres-lib -s
cp ../../pg_install/build/v15/src/backend/libpostgres.a .
cp ../../pg_install/build/v15/src/common/libpgcommon_srv.a .
cp ../../pg_install/build/v15/src/port/libpgport_srv.a .

clang -g -c libpqwalproposer.c test.c -ferror-limit=1 -I ../../pg_install/v15/include/postgresql/server -I ../../pgxn/neon
rm -rf libsim.a
ar rcs libsim.a test.o libpqwalproposer.o

rm -rf libwalproposer.a

PGXN_DIR=../../pg_install/build/neon-v15/
ar rcs libwalproposer.a $PGXN_DIR/walproposer.o $PGXN_DIR/walproposer_utils.o $PGXN_DIR/neon.o
542 libs/walproposer/libpqwalproposer.c Normal file
@@ -0,0 +1,542 @@
|
||||
#include "postgres.h"
|
||||
#include "neon.h"
|
||||
#include "walproposer.h"
|
||||
#include "rust_bindings.h"
|
||||
#include "replication/message.h"
|
||||
#include "access/xlog_internal.h"
|
||||
|
||||
// defined in walproposer.h
|
||||
uint64 sim_redo_start_lsn;
|
||||
XLogRecPtr sim_latest_available_lsn;
|
||||
|
||||
/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
|
||||
struct WalProposerConn
|
||||
{
|
||||
int64_t tcp;
|
||||
};
|
||||
|
||||
/* Helper function */
|
||||
static bool
|
||||
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
|
||||
{
|
||||
// walprop_log(LOG, "not implemented");
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Exported function definitions */
|
||||
char *
|
||||
walprop_error_message(WalProposerConn *conn)
|
||||
{
|
||||
// walprop_log(LOG, "not implemented");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
WalProposerConnStatusType
|
||||
walprop_status(WalProposerConn *conn)
|
||||
{
|
||||
// walprop_log(LOG, "not implemented: walprop_status");
|
||||
return WP_CONNECTION_OK;
|
||||
}
|
||||
|
||||
WalProposerConn *
|
||||
walprop_connect_start(char *conninfo)
|
||||
{
|
||||
WalProposerConn *conn;
|
||||
|
||||
walprop_log(LOG, "walprop_connect_start: %s", conninfo);
|
||||
|
||||
const char *connstr_prefix = "host=node port=";
|
||||
Assert(strncmp(conninfo, connstr_prefix, strlen(connstr_prefix)) == 0);
|
||||
|
||||
int nodeId = atoi(conninfo + strlen(connstr_prefix));
|
||||
|
||||
conn = palloc(sizeof(WalProposerConn));
|
||||
conn->tcp = sim_open_tcp(nodeId);
|
||||
return conn;
|
||||
}
|
||||
|
||||
WalProposerConnectPollStatusType
|
||||
walprop_connect_poll(WalProposerConn *conn)
|
||||
{
|
||||
// walprop_log(LOG, "not implemented: walprop_connect_poll");
|
||||
return WP_CONN_POLLING_OK;
|
||||
}
|
||||
|
||||
bool
|
||||
walprop_send_query(WalProposerConn *conn, char *query)
|
||||
{
|
||||
// walprop_log(LOG, "not implemented: walprop_send_query");
|
||||
return true;
|
||||
}
|
||||
|
||||
WalProposerExecStatusType
|
||||
walprop_get_query_result(WalProposerConn *conn)
|
||||
{
|
||||
// walprop_log(LOG, "not implemented: walprop_get_query_result");
|
||||
return WP_EXEC_SUCCESS_COPYBOTH;
|
||||
}
|
||||
|
||||
pgsocket
|
||||
walprop_socket(WalProposerConn *conn)
|
||||
{
|
||||
return (pgsocket) conn->tcp;
|
||||
}
|
||||
|
||||
int
|
||||
walprop_flush(WalProposerConn *conn)
|
||||
{
|
||||
// walprop_log(LOG, "not implemented");
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
walprop_finish(WalProposerConn *conn)
|
||||
{
|
||||
// walprop_log(LOG, "walprop_finish not implemented");
|
||||
}
|
||||
|
||||
/*
|
||||
* Receive a message from the safekeeper.
|
||||
*
|
||||
* On success, the data is placed in *buf. It is valid until the next call
|
||||
* to this function.
|
||||
*/
|
||||
PGAsyncReadResult
|
||||
walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
{
|
||||
uintptr_t len;
|
||||
char *msg;
|
||||
Event event;
|
||||
|
||||
event = sim_epoll_peek(0);
|
||||
if (event.tcp != conn->tcp || event.tag != Message || event.any_message != Bytes)
|
||||
return PG_ASYNC_READ_TRY_AGAIN;
|
||||
|
||||
event = sim_epoll_rcv(0);
|
||||
|
||||
// walprop_log(LOG, "walprop_async_read, T: %d, tcp: %d, tag: %d", (int) event.tag, (int) event.tcp, (int) event.any_message);
|
||||
Assert(event.tcp == conn->tcp);
|
||||
Assert(event.tag == Message);
|
||||
Assert(event.any_message == Bytes);
|
||||
|
||||
msg = (char*) sim_msg_get_bytes(&len);
|
||||
*buf = msg;
|
||||
*amount = len;
|
||||
// walprop_log(LOG, "walprop_async_read: %d", (int) len);
|
||||
|
||||
return PG_ASYNC_READ_SUCCESS;
|
||||
}
|
||||
|
||||
PGAsyncWriteResult
|
||||
walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
|
||||
{
|
||||
// walprop_log(LOG, "walprop_async_write");
|
||||
sim_msg_set_bytes(buf, size);
|
||||
sim_tcp_send(conn->tcp);
|
||||
return PG_ASYNC_WRITE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is very similar to walprop_async_write. For more
|
||||
* information, refer to the comments there.
|
||||
*/
|
||||
bool
|
||||
walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
|
||||
{
|
||||
// walprop_log(LOG, "walprop_blocking_write");
|
||||
sim_msg_set_bytes(buf, size);
|
||||
sim_tcp_send(conn->tcp);
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
sim_start_replication(XLogRecPtr startptr)
|
||||
{
|
||||
walprop_log(LOG, "sim_start_replication: %X/%X", LSN_FORMAT_ARGS(startptr));
|
||||
sim_latest_available_lsn = startptr;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
XLogRecPtr endptr = sim_latest_available_lsn;
|
||||
|
||||
Assert(startptr <= endptr);
|
||||
if (endptr > startptr)
|
||||
{
|
||||
WalProposerBroadcast(startptr, endptr);
|
||||
startptr = endptr;
|
||||
}
|
||||
|
||||
WalProposerPoll();
|
||||
}
|
||||
}
|
||||
|
||||
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
|
||||
|
||||
static int UsableBytesInSegment =
|
||||
(DEFAULT_XLOG_SEG_SIZE / XLOG_BLCKSZ * UsableBytesInPage) -
|
||||
(SizeOfXLogLongPHD - SizeOfXLogShortPHD);
|
||||
|
||||
/*
|
||||
* Converts a "usable byte position" to XLogRecPtr. A usable byte position
|
||||
* is the position starting from the beginning of WAL, excluding all WAL
|
||||
* page headers.
|
||||
*/
|
||||
static XLogRecPtr
|
||||
XLogBytePosToRecPtr(uint64 bytepos)
|
||||
{
|
||||
uint64 fullsegs;
|
||||
uint64 fullpages;
|
||||
uint64 bytesleft;
|
||||
uint32 seg_offset;
|
||||
XLogRecPtr result;
|
||||
|
||||
fullsegs = bytepos / UsableBytesInSegment;
|
||||
bytesleft = bytepos % UsableBytesInSegment;
|
||||
|
||||
if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
|
||||
{
|
||||
/* fits on first page of segment */
|
||||
seg_offset = bytesleft + SizeOfXLogLongPHD;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* account for the first page on segment with long header */
|
||||
seg_offset = XLOG_BLCKSZ;
|
||||
bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
|
||||
|
||||
fullpages = bytesleft / UsableBytesInPage;
|
||||
bytesleft = bytesleft % UsableBytesInPage;
|
||||
|
||||
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
|
||||
}
|
||||
|
||||
XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert an XLogRecPtr to a "usable byte position".
|
||||
*/
|
||||
static uint64
|
||||
XLogRecPtrToBytePos(XLogRecPtr ptr)
|
||||
{
|
||||
uint64 fullsegs;
|
||||
uint32 fullpages;
|
||||
uint32 offset;
|
||||
uint64 result;
|
||||
|
||||
XLByteToSeg(ptr, fullsegs, wal_segment_size);
|
||||
|
||||
fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
|
||||
offset = ptr % XLOG_BLCKSZ;
|
||||
|
||||
if (fullpages == 0)
|
||||
{
|
||||
result = fullsegs * UsableBytesInSegment;
|
||||
if (offset > 0)
|
||||
{
|
||||
Assert(offset >= SizeOfXLogLongPHD);
|
||||
result += offset - SizeOfXLogLongPHD;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
result = fullsegs * UsableBytesInSegment +
|
||||
(XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
|
||||
(fullpages - 1) * UsableBytesInPage; /* full pages */
|
||||
if (offset > 0)
|
||||
{
|
||||
Assert(offset >= SizeOfXLogShortPHD);
|
||||
result += offset - SizeOfXLogShortPHD;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
#define max_rdatas 16
|
||||
|
||||
void InitMyInsert();
|
||||
static void MyBeginInsert();
|
||||
static void MyRegisterData(char *data, int len);
|
||||
static XLogRecPtr MyFinishInsert(RmgrId rmid, uint8 info, uint8 flags);
|
||||
static void MyCopyXLogRecordToWAL(int write_len, XLogRecData *rdata, XLogRecPtr StartPos, XLogRecPtr EndPos);
|
||||
|
||||
/*
|
||||
* An array of XLogRecData structs, to hold registered data.
|
||||
*/
|
||||
static XLogRecData rdatas[max_rdatas];
|
||||
static int num_rdatas; /* entries currently used */
|
||||
static uint32 mainrdata_len; /* total # of bytes in chain */
|
||||
static XLogRecData hdr_rdt;
|
||||
static char hdr_scratch[16000];
|
||||
static XLogRecPtr CurrBytePos;
|
||||
static XLogRecPtr PrevBytePos;
|
||||
|
||||
void InitMyInsert()
|
||||
{
|
||||
CurrBytePos = sim_redo_start_lsn;
|
||||
PrevBytePos = InvalidXLogRecPtr;
|
||||
sim_latest_available_lsn = sim_redo_start_lsn;
|
||||
}
|
||||
|
||||
static void MyBeginInsert()
|
||||
{
|
||||
num_rdatas = 0;
|
||||
mainrdata_len = 0;
|
||||
}
|
||||
|
||||
static void MyRegisterData(char *data, int len)
|
||||
{
|
||||
XLogRecData *rdata;
|
||||
|
||||
if (num_rdatas >= max_rdatas)
|
||||
walprop_log(ERROR, "too much WAL data");
|
||||
rdata = &rdatas[num_rdatas++];
|
||||
|
||||
rdata->data = data;
|
||||
rdata->len = len;
|
||||
rdata->next = NULL;
|
||||
|
||||
if (num_rdatas > 1) {
|
||||
rdatas[num_rdatas - 2].next = rdata;
|
||||
}
|
||||
|
||||
mainrdata_len += len;
|
||||
}
|
||||
|
||||
static XLogRecPtr
|
||||
MyFinishInsert(RmgrId rmid, uint8 info, uint8 flags)
|
||||
{
|
||||
XLogRecData *rdt;
|
||||
uint32 total_len = 0;
|
||||
int block_id;
|
||||
pg_crc32c rdata_crc;
|
||||
XLogRecord *rechdr;
|
||||
char *scratch = hdr_scratch;
|
||||
int size;
|
||||
XLogRecPtr StartPos;
|
||||
XLogRecPtr EndPos;
|
||||
uint64 startbytepos;
|
||||
uint64 endbytepos;
|
||||
|
||||
/*
|
||||
* Note: this function can be called multiple times for the same record.
|
||||
* All the modifications we do to the rdata chains below must handle that.
|
||||
*/
|
||||
|
||||
/* The record begins with the fixed-size header */
|
||||
rechdr = (XLogRecord *) scratch;
|
||||
scratch += SizeOfXLogRecord;
|
||||
|
||||
hdr_rdt.data = hdr_scratch;
|
||||
|
||||
if (num_rdatas > 0)
|
||||
{
|
||||
hdr_rdt.next = &rdatas[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
hdr_rdt.next = NULL;
|
||||
}
|
||||
|
||||
/* followed by main data, if any */
|
||||
if (mainrdata_len > 0)
|
||||
{
|
||||
if (mainrdata_len > 255)
|
||||
{
|
||||
*(scratch++) = (char) XLR_BLOCK_ID_DATA_LONG;
|
||||
memcpy(scratch, &mainrdata_len, sizeof(uint32));
|
||||
scratch += sizeof(uint32);
|
||||
}
|
||||
else
|
||||
{
|
||||
*(scratch++) = (char) XLR_BLOCK_ID_DATA_SHORT;
|
||||
*(scratch++) = (uint8) mainrdata_len;
|
||||
}
|
||||
total_len += mainrdata_len;
|
||||
}
|
||||
|
||||
hdr_rdt.len = (scratch - hdr_scratch);
|
||||
total_len += hdr_rdt.len;
|
||||
|
||||
/*
|
||||
* Calculate CRC of the data
|
||||
*
|
||||
* Note that the record header isn't added into the CRC initially since we
|
||||
* don't know the prev-link yet. Thus, the CRC will represent the CRC of
|
||||
* the whole record in the order: rdata, then backup blocks, then record
|
||||
* header.
|
||||
*/
|
||||
INIT_CRC32C(rdata_crc);
|
||||
COMP_CRC32C(rdata_crc, hdr_scratch + SizeOfXLogRecord, hdr_rdt.len - SizeOfXLogRecord);
|
||||
for (size_t i = 0; i < num_rdatas; i++)
|
||||
{
|
||||
rdt = &rdatas[i];
|
||||
COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
|
||||
}
|
||||
|
||||
/*
|
||||
* Fill in the fields in the record header. Prev-link is filled in later,
|
||||
* once we know where in the WAL the record will be inserted. The CRC does
|
||||
* not include the record header yet.
|
||||
*/
|
||||
rechdr->xl_xid = 0;
|
||||
rechdr->xl_tot_len = total_len;
|
||||
rechdr->xl_info = info;
|
||||
rechdr->xl_rmid = rmid;
|
||||
rechdr->xl_prev = InvalidXLogRecPtr;
|
||||
rechdr->xl_crc = rdata_crc;
|
||||
|
||||
size = MAXALIGN(rechdr->xl_tot_len);
|
||||
|
||||
/* All (non xlog-switch) records should contain data. */
|
||||
Assert(size > SizeOfXLogRecord);
|
||||
|
||||
startbytepos = XLogRecPtrToBytePos(CurrBytePos);
|
||||
endbytepos = startbytepos + size;
|
||||
|
||||
// Get the position.
|
||||
StartPos = XLogBytePosToRecPtr(startbytepos);
|
||||
EndPos = XLogBytePosToRecPtr(startbytepos + size);
|
||||
rechdr->xl_prev = PrevBytePos;
|
||||
|
||||
Assert(XLogRecPtrToBytePos(StartPos) == startbytepos);
|
||||
Assert(XLogRecPtrToBytePos(EndPos) == endbytepos);
|
||||
|
||||
// Update global pointers.
|
||||
CurrBytePos = EndPos;
|
||||
PrevBytePos = StartPos;
|
||||
|
||||
/*
|
||||
* Now that xl_prev has been filled in, calculate CRC of the record
|
||||
* header.
|
||||
*/
|
||||
rdata_crc = rechdr->xl_crc;
|
||||
COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
|
||||
FIN_CRC32C(rdata_crc);
|
||||
rechdr->xl_crc = rdata_crc;
|
||||
|
||||
// Now write it to disk.
|
||||
MyCopyXLogRecordToWAL(rechdr->xl_tot_len, &hdr_rdt, StartPos, EndPos);
|
||||
return EndPos;
|
||||
}
|
||||
|
||||
#define INSERT_FREESPACE(endptr) \
|
||||
(((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
|
||||
|
||||
static void
|
||||
MyCopyXLogRecordToWAL(int write_len, XLogRecData *rdata, XLogRecPtr StartPos, XLogRecPtr EndPos)
|
||||
{
|
||||
XLogRecPtr CurrPos;
|
||||
int written;
|
||||
int freespace;
|
||||
|
||||
// Write hdr_rdt and `num_rdatas` other datas.
|
||||
CurrPos = StartPos;
|
||||
freespace = INSERT_FREESPACE(CurrPos);
|
||||
written = 0;
|
||||
|
||||
Assert(freespace >= sizeof(uint32));
|
||||
|
||||
while (rdata != NULL)
|
||||
{
|
||||
char *rdata_data = rdata->data;
|
||||
int rdata_len = rdata->len;
|
||||
|
||||
while (rdata_len >= freespace)
|
||||
{
|
||||
char header_buf[SizeOfXLogLongPHD];
|
||||
XLogPageHeader NewPage = (XLogPageHeader) header_buf;
|
||||
|
||||
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
|
||||
XLogWalPropWrite(rdata_data, freespace, CurrPos);
|
||||
rdata_data += freespace;
|
||||
rdata_len -= freespace;
|
||||
written += freespace;
|
||||
CurrPos += freespace;
|
||||
|
||||
// Init new page
|
||||
MemSet(header_buf, 0, SizeOfXLogLongPHD);
|
||||
|
||||
/*
|
||||
* Fill the new page's header
|
||||
*/
|
||||
NewPage->xlp_magic = XLOG_PAGE_MAGIC;
|
||||
|
||||
/* NewPage->xlp_info = 0; */ /* done by memset */
|
||||
NewPage->xlp_tli = 1;
|
||||
NewPage->xlp_pageaddr = CurrPos;
|
||||
|
||||
/* NewPage->xlp_rem_len = 0; */ /* done by memset */
|
||||
NewPage->xlp_info |= XLP_BKP_REMOVABLE;
|
||||
|
||||
/*
|
||||
* If first page of an XLOG segment file, make it a long header.
|
||||
*/
|
||||
if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
|
||||
{
|
||||
XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
|
||||
|
||||
NewLongPage->xlp_sysid = 0;
|
||||
NewLongPage->xlp_seg_size = wal_segment_size;
|
||||
NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
|
||||
NewPage->xlp_info |= XLP_LONG_HEADER;
|
||||
}
|
||||
|
||||
NewPage->xlp_rem_len = write_len - written;
|
||||
if (NewPage->xlp_rem_len > 0) {
|
||||
NewPage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
|
||||
}
|
||||
|
||||
/* skip over the page header */
|
||||
if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
|
||||
{
|
||||
XLogWalPropWrite(header_buf, SizeOfXLogLongPHD, CurrPos);
|
||||
CurrPos += SizeOfXLogLongPHD;
|
||||
}
|
||||
else
|
||||
{
|
||||
XLogWalPropWrite(header_buf, SizeOfXLogShortPHD, CurrPos);
|
||||
CurrPos += SizeOfXLogShortPHD;
|
||||
}
|
||||
freespace = INSERT_FREESPACE(CurrPos);
|
||||
}
|
||||
|
||||
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
|
||||
XLogWalPropWrite(rdata_data, rdata_len, CurrPos);
|
||||
CurrPos += rdata_len;
|
||||
written += rdata_len;
|
||||
freespace -= rdata_len;
|
||||
|
||||
rdata = rdata->next;
|
||||
}
|
||||
|
||||
Assert(written == write_len);
|
||||
CurrPos = MAXALIGN64(CurrPos);
|
||||
Assert(CurrPos == EndPos);
|
||||
}
|
||||
|
||||
XLogRecPtr MyInsertRecord()
|
||||
{
|
||||
const char *prefix = "prefix";
|
||||
const char *message = "message";
|
||||
size_t size = 7;
|
||||
bool transactional = false;
|
||||
|
||||
xl_logical_message xlrec;
|
||||
|
||||
xlrec.dbId = 0;
|
||||
xlrec.transactional = transactional;
|
||||
/* trailing zero is critical; see logicalmsg_desc */
|
||||
xlrec.prefix_size = strlen(prefix) + 1;
|
||||
xlrec.message_size = size;
|
||||
|
||||
MyBeginInsert();
|
||||
MyRegisterData((char *) &xlrec, SizeOfLogicalMessage);
|
||||
MyRegisterData(unconstify(char *, prefix), xlrec.prefix_size);
|
||||
MyRegisterData(unconstify(char *, message), size);
|
||||
|
||||
return MyFinishInsert(RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLOG_INCLUDE_ORIGIN);
|
||||
}
|
||||
106 libs/walproposer/rust_bindings.h Normal file
@@ -0,0 +1,106 @@
|
||||
#include <stdarg.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/**
|
||||
* List of all possible AnyMessage.
|
||||
*/
|
||||
enum AnyMessageTag {
|
||||
None,
|
||||
InternalConnect,
|
||||
Just32,
|
||||
ReplCell,
|
||||
Bytes,
|
||||
LSN,
|
||||
};
|
||||
typedef uint8_t AnyMessageTag;
|
||||
|
||||
/**
|
||||
* List of all possible NodeEvent.
|
||||
*/
|
||||
enum EventTag {
|
||||
Timeout,
|
||||
Accept,
|
||||
Closed,
|
||||
Message,
|
||||
Internal,
|
||||
};
|
||||
typedef uint8_t EventTag;
|
||||
|
||||
/**
|
||||
* Event returned by epoll_recv.
|
||||
*/
|
||||
typedef struct Event {
|
||||
EventTag tag;
|
||||
int64_t tcp;
|
||||
AnyMessageTag any_message;
|
||||
} Event;
|
||||
|
||||
void rust_function(uint32_t a);
|
||||
|
||||
/**
|
||||
* C API for the node os.
|
||||
*/
|
||||
void sim_sleep(uint64_t ms);
|
||||
|
||||
uint64_t sim_random(uint64_t max);
|
||||
|
||||
uint32_t sim_id(void);
|
||||
|
||||
int64_t sim_open_tcp(uint32_t dst);
|
||||
|
||||
int64_t sim_open_tcp_nopoll(uint32_t dst);
|
||||
|
||||
/**
|
||||
* Send MESSAGE_BUF content to the given tcp.
|
||||
*/
|
||||
void sim_tcp_send(int64_t tcp);
|
||||
|
||||
/**
|
||||
* Receive a message from the given tcp. Can be used only with tcp opened with
|
||||
* `sim_open_tcp_nopoll`.
|
||||
*/
|
||||
struct Event sim_tcp_recv(int64_t tcp);
|
||||
|
||||
struct Event sim_epoll_rcv(int64_t timeout);
|
||||
|
||||
struct Event sim_epoll_peek(int64_t timeout);
|
||||
|
||||
int64_t sim_now(void);
|
||||
|
||||
void sim_exit(int32_t code, const uint8_t *msg);
|
||||
|
||||
void sim_set_result(int32_t code, const uint8_t *msg);
|
||||
|
||||
void sim_log_event(const int8_t *msg);
|
||||
|
||||
/**
|
||||
* Get tag of the current message.
|
||||
*/
|
||||
AnyMessageTag sim_msg_tag(void);
|
||||
|
||||
/**
|
||||
* Read AnyMessage::Just32 message.
|
||||
*/
|
||||
void sim_msg_get_just_u32(uint32_t *val);
|
||||
|
||||
/**
|
||||
* Read AnyMessage::LSN message.
|
||||
*/
|
||||
void sim_msg_get_lsn(uint64_t *val);
|
||||
|
||||
/**
|
||||
* Write AnyMessage::ReplCell message.
|
||||
*/
|
||||
void sim_msg_set_repl_cell(uint32_t value, uint32_t client_id, uint32_t seqno);
|
||||
|
||||
/**
|
||||
* Write AnyMessage::Bytes message.
|
||||
*/
|
||||
void sim_msg_set_bytes(const char *bytes, uintptr_t len);
|
||||
|
||||
/**
|
||||
* Read AnyMessage::Bytes message.
|
||||
*/
|
||||
const char *sim_msg_get_bytes(uintptr_t *len);
|
||||
36 libs/walproposer/src/lib.rs Normal file
@@ -0,0 +1,36 @@
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]

use safekeeper::simlib::node_os::NodeOs;
use tracing::info;

pub mod bindings {
    include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
}

#[no_mangle]
pub extern "C" fn rust_function(a: u32) {
    info!("Hello from Rust!");
    info!("a: {}", a);
}

pub mod sim;
pub mod sim_proto;

#[cfg(test)]
mod test;

#[cfg(test)]
pub mod simtest;

pub fn c_context() -> Option<Box<dyn Fn(NodeOs) + Send + Sync>> {
    Some(Box::new(|os: NodeOs| {
        sim::c_attach_node_os(os);
        unsafe { bindings::MyContextInit(); }
    }))
}

pub fn enable_debug() {
    unsafe { bindings::debug_enabled = true; }
}
240 libs/walproposer/src/sim.rs Normal file
@@ -0,0 +1,240 @@
|
||||
use log::debug;
|
||||
use safekeeper::simlib::{network::TCP, node_os::NodeOs, world::NodeEvent};
|
||||
use std::{
|
||||
cell::RefCell,
|
||||
collections::HashMap,
|
||||
ffi::{CStr, CString},
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::sim_proto::{anymessage_tag, AnyMessageTag, Event, EventTag, MESSAGE_BUF};
|
||||
|
||||
thread_local! {
|
||||
static CURRENT_NODE_OS: RefCell<Option<NodeOs>> = RefCell::new(None);
|
||||
static TCP_CACHE: RefCell<HashMap<i64, TCP>> = RefCell::new(HashMap::new());
|
||||
}
|
||||
|
||||
/// Get the current node os.
|
||||
fn os() -> NodeOs {
|
||||
CURRENT_NODE_OS.with(|cell| cell.borrow().clone().expect("no node os set"))
|
||||
}
|
||||
|
||||
fn tcp_save(tcp: TCP) -> i64 {
|
||||
TCP_CACHE.with(|cell| {
|
||||
let mut cache = cell.borrow_mut();
|
||||
let id = tcp.id();
|
||||
cache.insert(id, tcp);
|
||||
id
|
||||
})
|
||||
}
|
||||
|
||||
fn tcp_load(id: i64) -> TCP {
|
||||
TCP_CACHE.with(|cell| {
|
||||
let cache = cell.borrow();
|
||||
cache.get(&id).expect("unknown TCP id").clone()
|
||||
})
|
||||
}
|
||||
|
||||
/// Should be called before calling any of the C functions.
|
||||
pub(crate) fn c_attach_node_os(os: NodeOs) {
|
||||
CURRENT_NODE_OS.with(|cell| {
|
||||
*cell.borrow_mut() = Some(os);
|
||||
});
|
||||
TCP_CACHE.with(|cell| {
|
||||
*cell.borrow_mut() = HashMap::new();
|
||||
});
|
||||
}
|
||||
|
||||
/// C API for the node os.
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn sim_sleep(ms: u64) {
|
||||
os().sleep(ms);
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn sim_random(max: u64) -> u64 {
|
||||
os().random(max)
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn sim_id() -> u32 {
|
||||
os().id().into()
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn sim_open_tcp(dst: u32) -> i64 {
|
||||
tcp_save(os().open_tcp(dst.into()))
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn sim_open_tcp_nopoll(dst: u32) -> i64 {
|
||||
tcp_save(os().open_tcp_nopoll(dst.into()))
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
/// Send MESSAGE_BUF content to the given tcp.
|
||||
pub extern "C" fn sim_tcp_send(tcp: i64) {
|
||||
tcp_load(tcp).send(MESSAGE_BUF.with(|cell| cell.borrow().clone()));
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
/// Receive a message from the given tcp. Can be used only with tcp opened with
|
||||
/// `sim_open_tcp_nopoll`.
|
||||
pub extern "C" fn sim_tcp_recv(tcp: i64) -> Event {
|
||||
let event = tcp_load(tcp).recv();
|
||||
match event {
|
||||
NodeEvent::Accept(_) => unreachable!(),
|
||||
NodeEvent::Closed(_) => Event {
|
||||
tag: EventTag::Closed,
|
||||
tcp: 0,
|
||||
any_message: AnyMessageTag::None,
|
||||
},
|
||||
NodeEvent::Internal(_) => unreachable!(),
|
||||
NodeEvent::Message((message, _)) => {
|
||||
// store message in thread local storage, C code should use
|
||||
// sim_msg_* functions to access it.
|
||||
MESSAGE_BUF.with(|cell| {
|
||||
*cell.borrow_mut() = message.clone();
|
||||
});
|
||||
Event {
|
||||
tag: EventTag::Message,
|
||||
tcp: 0,
|
||||
any_message: anymessage_tag(&message),
|
||||
}
|
||||
}
|
||||
NodeEvent::WakeTimeout(_) => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn sim_epoll_rcv(timeout: i64) -> Event {
|
||||
let event = os().epoll_recv(timeout);
|
||||
let event = if let Some(event) = event {
|
||||
event
|
||||
} else {
|
||||
return Event {
|
||||
tag: EventTag::Timeout,
|
||||
tcp: 0,
|
||||
any_message: AnyMessageTag::None,
|
||||
};
|
||||
};
|
||||
|
||||
match event {
|
||||
NodeEvent::Accept(tcp) => Event {
|
||||
tag: EventTag::Accept,
|
||||
tcp: tcp_save(tcp),
|
||||
any_message: AnyMessageTag::None,
|
||||
},
|
||||
NodeEvent::Closed(tcp) => Event {
|
||||
tag: EventTag::Closed,
|
||||
tcp: tcp_save(tcp),
|
||||
any_message: AnyMessageTag::None,
|
||||
},
|
||||
NodeEvent::Message((message, tcp)) => {
|
||||
// store message in thread local storage, C code should use
|
||||
// sim_msg_* functions to access it.
|
||||
MESSAGE_BUF.with(|cell| {
|
||||
*cell.borrow_mut() = message.clone();
|
||||
});
|
||||
Event {
|
||||
tag: EventTag::Message,
|
||||
tcp: tcp_save(tcp),
|
||||
any_message: anymessage_tag(&message),
|
||||
}
|
||||
}
|
||||
NodeEvent::Internal(message) => {
|
||||
// store message in thread local storage, C code should use
|
||||
// sim_msg_* functions to access it.
|
||||
MESSAGE_BUF.with(|cell| {
|
||||
*cell.borrow_mut() = message.clone();
|
||||
});
|
||||
Event {
|
||||
tag: EventTag::Internal,
|
||||
tcp: 0,
|
||||
any_message: anymessage_tag(&message),
|
||||
}
|
||||
}
|
||||
NodeEvent::WakeTimeout(_) => {
|
||||
// can't happen
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn sim_epoll_peek(timeout: i64) -> Event {
|
||||
let event = os().epoll_peek(timeout);
|
||||
let event = if let Some(event) = event {
|
||||
event
|
||||
} else {
|
||||
return Event {
|
||||
tag: EventTag::Timeout,
|
||||
tcp: 0,
|
||||
any_message: AnyMessageTag::None,
|
||||
};
|
||||
};
|
||||
|
||||
match event {
|
||||
NodeEvent::Accept(tcp) => Event {
|
||||
tag: EventTag::Accept,
|
||||
tcp: tcp_save(tcp),
|
||||
any_message: AnyMessageTag::None,
|
||||
},
|
||||
NodeEvent::Closed(tcp) => Event {
|
||||
tag: EventTag::Closed,
|
||||
tcp: tcp_save(tcp),
|
||||
any_message: AnyMessageTag::None,
|
||||
},
|
||||
NodeEvent::Message((message, tcp)) => Event {
|
||||
tag: EventTag::Message,
|
||||
tcp: tcp_save(tcp),
|
||||
any_message: anymessage_tag(&message),
|
||||
},
|
||||
NodeEvent::Internal(message) => Event {
|
||||
tag: EventTag::Internal,
|
||||
tcp: 0,
|
||||
any_message: anymessage_tag(&message),
|
||||
},
|
||||
NodeEvent::WakeTimeout(_) => {
|
||||
// can't happen
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn sim_now() -> i64 {
|
||||
os().now() as i64
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn sim_exit(code: i32, msg: *const u8) {
|
||||
trace!("sim_exit({}, {:?})", code, msg);
|
||||
sim_set_result(code, msg);
|
||||
|
||||
// I tried to make use of pthread_exit, but it doesn't work.
|
||||
// https://github.com/rust-lang/unsafe-code-guidelines/issues/211
|
||||
// unsafe { libc::pthread_exit(std::ptr::null_mut()) };
|
||||
|
||||
// https://doc.rust-lang.org/nomicon/unwinding.html
|
||||
// Everyone on the internet saying this is UB, but it works for me,
|
||||
// so I'm going to use it for now.
|
||||
panic!("sim_exit() called from C code")
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn sim_set_result(code: i32, msg: *const u8) {
|
||||
let msg = unsafe { CStr::from_ptr(msg as *const i8) };
|
||||
let msg = msg.to_string_lossy().into_owned();
|
||||
debug!("sim_set_result({}, {:?})", code, msg);
|
||||
os().set_result(code, msg);
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn sim_log_event(msg: *const i8) {
|
||||
let msg = unsafe { CStr::from_ptr(msg) };
|
||||
let msg = msg.to_string_lossy().into_owned();
|
||||
debug!("sim_log_event({:?})", msg);
|
||||
os().log_event(msg);
|
||||
}
|
||||
114 libs/walproposer/src/sim_proto.rs Normal file
@@ -0,0 +1,114 @@
|
||||
use safekeeper::simlib::proto::{AnyMessage, ReplCell};
|
||||
use std::{cell::RefCell, ffi::c_char};
|
||||
|
||||
pub(crate) fn anymessage_tag(msg: &AnyMessage) -> AnyMessageTag {
|
||||
match msg {
|
||||
AnyMessage::None => AnyMessageTag::None,
|
||||
AnyMessage::InternalConnect => AnyMessageTag::InternalConnect,
|
||||
AnyMessage::Just32(_) => AnyMessageTag::Just32,
|
||||
AnyMessage::ReplCell(_) => AnyMessageTag::ReplCell,
|
||||
AnyMessage::Bytes(_) => AnyMessageTag::Bytes,
|
||||
AnyMessage::LSN(_) => AnyMessageTag::LSN,
|
||||
}
|
||||
}
|
||||
|
||||
thread_local! {
|
||||
pub static MESSAGE_BUF: RefCell<AnyMessage> = RefCell::new(AnyMessage::None);
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
/// Get tag of the current message.
|
||||
pub extern "C" fn sim_msg_tag() -> AnyMessageTag {
|
||||
MESSAGE_BUF.with(|cell| anymessage_tag(&*cell.borrow()))
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
/// Read AnyMessage::Just32 message.
|
||||
pub extern "C" fn sim_msg_get_just_u32(val: &mut u32) {
|
||||
MESSAGE_BUF.with(|cell| match &*cell.borrow() {
|
||||
AnyMessage::Just32(v) => {
|
||||
*val = *v;
|
||||
}
|
||||
_ => panic!("expected Just32 message"),
|
||||
});
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
/// Read AnyMessage::LSN message.
|
||||
pub extern "C" fn sim_msg_get_lsn(val: &mut u64) {
|
||||
MESSAGE_BUF.with(|cell| match &*cell.borrow() {
|
||||
AnyMessage::LSN(v) => {
|
||||
*val = *v;
|
||||
}
|
||||
_ => panic!("expected LSN message"),
|
||||
});
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
/// Write AnyMessage::ReplCell message.
|
||||
pub extern "C" fn sim_msg_set_repl_cell(value: u32, client_id: u32, seqno: u32) {
|
||||
MESSAGE_BUF.with(|cell| {
|
||||
*cell.borrow_mut() = AnyMessage::ReplCell(ReplCell {
|
||||
value,
|
||||
client_id,
|
||||
seqno,
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
/// Write AnyMessage::Bytes message.
|
||||
pub extern "C" fn sim_msg_set_bytes(bytes: *const c_char, len: usize) {
|
||||
MESSAGE_BUF.with(|cell| {
|
||||
// copy bytes to a Rust Vec
|
||||
let mut v: Vec<u8> = Vec::with_capacity(len);
|
||||
unsafe {
|
||||
v.set_len(len);
|
||||
std::ptr::copy_nonoverlapping(bytes as *const u8, v.as_mut_ptr(), len);
|
||||
}
|
||||
*cell.borrow_mut() = AnyMessage::Bytes(v.into());
|
||||
});
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
/// Read AnyMessage::Bytes message.
|
||||
pub extern "C" fn sim_msg_get_bytes(len: *mut usize) -> *const c_char {
|
||||
MESSAGE_BUF.with(|cell| match &*cell.borrow() {
|
||||
AnyMessage::Bytes(v) => {
|
||||
unsafe {
|
||||
*len = v.len();
|
||||
v.as_ptr() as *const i8
|
||||
}
|
||||
}
|
||||
_ => panic!("expected Bytes message"),
|
||||
})
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
/// Event returned by epoll_recv.
|
||||
pub struct Event {
|
||||
pub tag: EventTag,
|
||||
pub tcp: i64,
|
||||
pub any_message: AnyMessageTag,
|
||||
}
|
||||
|
||||
#[repr(u8)]
|
||||
/// List of all possible NodeEvent.
|
||||
pub enum EventTag {
|
||||
Timeout,
|
||||
Accept,
|
||||
Closed,
|
||||
Message,
|
||||
Internal,
|
||||
}
|
||||
|
||||
#[repr(u8)]
|
||||
/// List of all possible AnyMessage.
|
||||
pub enum AnyMessageTag {
|
||||
None,
|
||||
InternalConnect,
|
||||
Just32,
|
||||
ReplCell,
|
||||
Bytes,
|
||||
LSN,
|
||||
}
|
||||
88 libs/walproposer/src/simtest/disk.rs Normal file
@@ -0,0 +1,88 @@
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use safekeeper::safekeeper::SafeKeeperState;
|
||||
use safekeeper::simlib::sync::Mutex;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
pub struct Disk {
|
||||
pub timelines: Mutex<HashMap<TenantTimelineId, Arc<TimelineDisk>>>,
|
||||
}
|
||||
|
||||
impl Disk {
|
||||
pub fn new() -> Self {
|
||||
Disk {
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_state(&self, ttid: &TenantTimelineId, state: SafeKeeperState) -> Arc<TimelineDisk> {
|
||||
self.timelines
|
||||
.lock()
|
||||
.entry(ttid.clone())
|
||||
.and_modify(|e| {
|
||||
let mut mu = e.state.lock();
|
||||
*mu = state.clone();
|
||||
})
|
||||
.or_insert_with(|| {
|
||||
Arc::new(TimelineDisk {
|
||||
state: Mutex::new(state),
|
||||
wal: Mutex::new(BlockStorage::new()),
|
||||
})
|
||||
})
|
||||
.clone()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TimelineDisk {
|
||||
pub state: Mutex<SafeKeeperState>,
|
||||
pub wal: Mutex<BlockStorage>,
|
||||
}
|
||||
|
||||
const BLOCK_SIZE: usize = 8192;
|
||||
|
||||
pub struct BlockStorage {
|
||||
blocks: HashMap<u64, [u8; BLOCK_SIZE]>,
|
||||
}
|
||||
|
||||
impl BlockStorage {
|
||||
pub fn new() -> Self {
|
||||
BlockStorage {
|
||||
blocks: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn read(&self, pos: u64, buf: &mut [u8]) {
|
||||
let mut buf_offset = 0;
|
||||
let mut storage_pos = pos;
|
||||
while buf_offset < buf.len() {
|
||||
let block_id = storage_pos / BLOCK_SIZE as u64;
|
||||
let block = self.blocks.get(&block_id).unwrap_or(&[0; BLOCK_SIZE]);
|
||||
let block_offset = storage_pos % BLOCK_SIZE as u64;
|
||||
let block_len = BLOCK_SIZE as u64 - block_offset;
|
||||
let buf_len = buf.len() - buf_offset;
|
||||
let copy_len = std::cmp::min(block_len as usize, buf_len);
|
||||
buf[buf_offset..buf_offset + copy_len]
|
||||
.copy_from_slice(&block[block_offset as usize..block_offset as usize + copy_len]);
|
||||
buf_offset += copy_len;
|
||||
storage_pos += copy_len as u64;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write(&mut self, pos: u64, buf: &[u8]) {
|
||||
let mut buf_offset = 0;
|
||||
let mut storage_pos = pos;
|
||||
while buf_offset < buf.len() {
|
||||
let block_id = storage_pos / BLOCK_SIZE as u64;
|
||||
let block = self.blocks.entry(block_id).or_insert([0; BLOCK_SIZE]);
|
||||
let block_offset = storage_pos % BLOCK_SIZE as u64;
|
||||
let block_len = BLOCK_SIZE as u64 - block_offset;
|
||||
let buf_len = buf.len() - buf_offset;
|
||||
let copy_len = std::cmp::min(block_len as usize, buf_len);
|
||||
block[block_offset as usize..block_offset as usize + copy_len]
|
||||
.copy_from_slice(&buf[buf_offset..buf_offset + copy_len]);
|
||||
buf_offset += copy_len;
|
||||
storage_pos += copy_len as u64
|
||||
}
|
||||
}
|
||||
}
|
||||
61 libs/walproposer/src/simtest/log.rs Normal file
@@ -0,0 +1,61 @@
|
||||
use std::{sync::Arc, fmt};
|
||||
|
||||
use safekeeper::simlib::{world::World, sync::Mutex};
|
||||
use tracing_subscriber::fmt::{time::FormatTime, format::Writer};
|
||||
use utils::logging;
|
||||
|
||||
use crate::bindings;
|
||||
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SimClock {
|
||||
world_ptr: Arc<Mutex<Option<Arc<World>>>>,
|
||||
}
|
||||
|
||||
impl Default for SimClock {
|
||||
fn default() -> Self {
|
||||
SimClock {
|
||||
world_ptr: Arc::new(Mutex::new(None)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SimClock {
|
||||
pub fn set_world(&self, world: Arc<World>) {
|
||||
*self.world_ptr.lock() = Some(world);
|
||||
}
|
||||
}
|
||||
|
||||
impl FormatTime for SimClock {
|
||||
fn format_time(&self, w: &mut Writer<'_>) -> fmt::Result {
|
||||
let world = self.world_ptr.lock().clone();
|
||||
|
||||
if let Some(world) = world {
|
||||
let now = world.now();
|
||||
write!(w, "[{}]", now)
|
||||
} else {
|
||||
write!(w, "[?]")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn init_logger() -> SimClock {
|
||||
let debug_enabled = unsafe { bindings::debug_enabled };
|
||||
|
||||
let clock = SimClock::default();
|
||||
let base_logger = tracing_subscriber::fmt()
|
||||
.with_target(false)
|
||||
.with_timer(clock.clone())
|
||||
.with_ansi(true)
|
||||
.with_max_level(match debug_enabled {
|
||||
true => tracing::Level::DEBUG,
|
||||
false => tracing::Level::INFO,
|
||||
})
|
||||
.with_writer(std::io::stdout);
|
||||
base_logger.init();
|
||||
|
||||
// logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
std::panic::set_hook(Box::new(|_| {}));
|
||||
|
||||
clock
|
||||
}
|
||||
11 libs/walproposer/src/simtest/mod.rs Normal file
@@ -0,0 +1,11 @@
#[cfg(test)]
pub mod simple_client;

#[cfg(test)]
pub mod wp_sk;

pub mod disk;
pub mod safekeeper;
pub mod storage;
pub mod log;
pub mod util;
372 libs/walproposer/src/simtest/safekeeper.rs Normal file
@@ -0,0 +1,372 @@
|
||||
//! Safekeeper communication endpoint to WAL proposer (compute node).
|
||||
//! Gets messages from the network, passes them down to consensus module and
|
||||
//! sends replies back.
|
||||
|
||||
use std::{collections::HashMap, path::PathBuf, sync::Arc, time::Duration};
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use hyper::Uri;
|
||||
use log::info;
|
||||
use safekeeper::{
|
||||
safekeeper::{
|
||||
ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, ServerInfo, UNKNOWN_SERVER_VERSION,
|
||||
},
|
||||
simlib::{network::TCP, node_os::NodeOs, proto::AnyMessage, world::NodeEvent},
|
||||
timeline::TimelineError,
|
||||
SafeKeeperConf, wal_storage::Storage,
|
||||
};
|
||||
use tracing::{debug, info_span};
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::simtest::storage::DiskStateStorage;
|
||||
|
||||
use super::{
|
||||
disk::{Disk, TimelineDisk},
|
||||
storage::DiskWALStorage,
|
||||
};
|
||||
|
||||
struct ConnState {
|
||||
tcp: TCP,
|
||||
|
||||
greeting: bool,
|
||||
ttid: TenantTimelineId,
|
||||
flush_pending: bool,
|
||||
}
|
||||
|
||||
struct SharedState {
|
||||
sk: SafeKeeper<DiskStateStorage, DiskWALStorage>,
|
||||
disk: Arc<TimelineDisk>,
|
||||
}
|
||||
|
||||
struct GlobalMap {
|
||||
timelines: HashMap<TenantTimelineId, SharedState>,
|
||||
conf: SafeKeeperConf,
|
||||
disk: Arc<Disk>,
|
||||
}
|
||||
|
||||
impl GlobalMap {
    fn new(disk: Arc<Disk>, conf: SafeKeeperConf) -> Result<Self> {
        let mut timelines = HashMap::new();

        for (&ttid, disk) in disk.timelines.lock().iter() {
            debug!("loading timeline {}", ttid);
            let state = disk.state.lock().clone();

            if state.server.wal_seg_size == 0 {
                bail!(TimelineError::UninitializedWalSegSize(ttid));
            }

            if state.server.pg_version == UNKNOWN_SERVER_VERSION {
                bail!(TimelineError::UninitialinzedPgVersion(ttid));
            }

            if state.commit_lsn < state.local_start_lsn {
                bail!(
                    "commit_lsn {} is higher than local_start_lsn {}",
                    state.commit_lsn,
                    state.local_start_lsn
                );
            }

            let control_store = DiskStateStorage::new(disk.clone());
            let wal_store = DiskWALStorage::new(disk.clone(), &control_store)?;

            let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
            timelines.insert(
                ttid.clone(),
                SharedState {
                    sk,
                    disk: disk.clone(),
                },
            );
        }

        Ok(Self {
            timelines,
            conf,
            disk,
        })
    }

    fn create(&mut self, ttid: TenantTimelineId, server_info: ServerInfo) -> Result<()> {
        if self.timelines.contains_key(&ttid) {
            bail!("timeline {} already exists", ttid);
        }

        debug!("creating new timeline {}", ttid);

        let commit_lsn = Lsn::INVALID;
        let local_start_lsn = Lsn::INVALID;

        // TODO: load state from in-memory storage
        let state = SafeKeeperState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);

        if state.server.wal_seg_size == 0 {
            bail!(TimelineError::UninitializedWalSegSize(ttid));
        }

        if state.server.pg_version == UNKNOWN_SERVER_VERSION {
            bail!(TimelineError::UninitialinzedPgVersion(ttid));
        }

        if state.commit_lsn < state.local_start_lsn {
            bail!(
                "commit_lsn {} is higher than local_start_lsn {}",
                state.commit_lsn,
                state.local_start_lsn
            );
        }

        let disk_timeline = self.disk.put_state(&ttid, state);
        let control_store = DiskStateStorage::new(disk_timeline.clone());
        let wal_store = DiskWALStorage::new(disk_timeline.clone(), &control_store)?;

        let sk = SafeKeeper::new(control_store, wal_store, self.conf.my_id)?;

        self.timelines.insert(
            ttid.clone(),
            SharedState {
                sk,
                disk: disk_timeline,
            },
        );
        Ok(())
    }

    fn get(&mut self, ttid: &TenantTimelineId) -> &mut SharedState {
        self.timelines.get_mut(ttid).expect("timeline must exist")
    }

    fn has_tli(&self, ttid: &TenantTimelineId) -> bool {
        self.timelines.contains_key(ttid)
    }
}

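/// Entry point of a simulated safekeeper node: load the timelines already on
/// disk, then loop forever draining epoll events (accepted connections and
/// walproposer messages) and flushing every connection once per tick.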
pub fn run_server(os: NodeOs, disk: Arc<Disk>) -> Result<()> {
    let _enter = info_span!("safekeeper", id = os.id()).entered();
    debug!("started server");
    os.log_event("started;safekeeper".to_owned());
    let conf = SafeKeeperConf {
        workdir: PathBuf::from("."),
        my_id: NodeId(os.id() as u64),
        listen_pg_addr: String::new(),
        listen_http_addr: String::new(),
        no_sync: false,
        broker_endpoint: "/".parse::<Uri>().unwrap(),
        broker_keepalive_interval: Duration::from_secs(0),
        heartbeat_timeout: Duration::from_secs(0),
        remote_storage: None,
        max_offloader_lag_bytes: 0,
        backup_runtime_threads: None,
        wal_backup_enabled: false,
        auth: None,
    };

    let mut global = GlobalMap::new(disk, conf.clone())?;
    let mut conns: HashMap<i64, ConnState> = HashMap::new();

    for (&ttid, shared_state) in global.timelines.iter_mut() {
        let flush_lsn = shared_state.sk.wal_store.flush_lsn();
        let commit_lsn = shared_state.sk.state.commit_lsn;
        os.log_event(format!("tli_loaded;{};{}", flush_lsn.0, commit_lsn.0));
    }

    let epoll = os.epoll();
    loop {
        // waiting for the next message
        let mut next_event = Some(epoll.recv());

        loop {
            let event = match next_event {
                Some(event) => event,
                None => break,
            };

            match event {
                NodeEvent::Accept(tcp) => {
                    conns.insert(
                        tcp.id(),
                        ConnState {
                            tcp,
                            greeting: false,
                            ttid: TenantTimelineId::empty(),
                            flush_pending: false,
                        },
                    );
                }
                NodeEvent::Message((msg, tcp)) => {
                    let conn = conns.get_mut(&tcp.id());
                    if let Some(conn) = conn {
                        let res = conn.process_any(msg, &mut global);
                        if res.is_err() {
                            debug!("conn {:?} error: {:#}", tcp, res.unwrap_err());
                            conns.remove(&tcp.id());
                        }
                    } else {
                        debug!("conn {:?} was closed, dropping msg {:?}", tcp, msg);
                    }
                }
                NodeEvent::Internal(_) => {}
                NodeEvent::Closed(_) => {}
                NodeEvent::WakeTimeout(_) => {}
            }

            // TODO: make simulator support multiple events per tick
            next_event = epoll.try_recv();
        }

        conns.retain(|_, conn| {
            let res = conn.flush(&mut global);
            if res.is_err() {
                debug!("conn {:?} error: {:?}", conn.tcp, res);
            }
            res.is_ok()
        });
    }
}

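// Per-connection protocol, as handled below: the first message must be a
// Greeting (or a raw "START_REPLICATION ..." request); after that every
// AppendRequest is applied as NoFlushAppendRequest and only turned into a real
// FlushWAL by `flush()` once per simulation tick.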
impl ConnState {
    fn process_any(&mut self, any: AnyMessage, global: &mut GlobalMap) -> Result<()> {
        if let AnyMessage::Bytes(copy_data) = any {
            let repl_prefix = b"START_REPLICATION ";
            if !self.greeting && copy_data.starts_with(repl_prefix) {
                self.process_start_replication(copy_data.slice(repl_prefix.len()..), global)?;
                bail!("finished processing START_REPLICATION")
            }

            let msg = ProposerAcceptorMessage::parse(copy_data)?;
            debug!("got msg: {:?}", msg);
            return self.process(msg, global);
        } else {
            bail!("unexpected message, expected AnyMessage::Bytes");
        }
    }

    fn process_start_replication(
        &mut self,
        copy_data: Bytes,
        global: &mut GlobalMap,
    ) -> Result<()> {
        // format is "<tenant_id> <timeline_id> <start_lsn> <end_lsn>"
        let str = String::from_utf8(copy_data.to_vec())?;

        let mut parts = str.split(' ');
        let tenant_id = parts.next().unwrap().parse::<TenantId>()?;
        let timeline_id = parts.next().unwrap().parse::<TimelineId>()?;
        let start_lsn = parts.next().unwrap().parse::<u64>()?;
        let end_lsn = parts.next().unwrap().parse::<u64>()?;

        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
        let shared_state = global.get(&ttid);

        // read bytes from start_lsn to end_lsn
        let mut buf = vec![0; (end_lsn - start_lsn) as usize];
        shared_state.disk.wal.lock().read(start_lsn, &mut buf);

        // send bytes to the client
        self.tcp.send(AnyMessage::Bytes(Bytes::from(buf)));
        Ok(())
    }

    fn init_timeline(
        &mut self,
        ttid: TenantTimelineId,
        server_info: ServerInfo,
        global: &mut GlobalMap,
    ) -> Result<()> {
        self.ttid = ttid;
        if global.has_tli(&ttid) {
            return Ok(());
        }

        global.create(ttid, server_info)
    }

    fn process(&mut self, msg: ProposerAcceptorMessage, global: &mut GlobalMap) -> Result<()> {
        if !self.greeting {
            self.greeting = true;

            match msg {
                ProposerAcceptorMessage::Greeting(ref greeting) => {
                    debug!(
                        "start handshake with walproposer {:?}",
                        self.tcp,
                    );
                    let server_info = ServerInfo {
                        pg_version: greeting.pg_version,
                        system_id: greeting.system_id,
                        wal_seg_size: greeting.wal_seg_size,
                    };
                    let ttid = TenantTimelineId::new(greeting.tenant_id, greeting.timeline_id);
                    self.init_timeline(ttid, server_info, global)?
                }
                _ => {
                    bail!("unexpected message {msg:?} instead of greeting");
                }
            }
        }

        let tli = global.get(&self.ttid);

        match msg {
            ProposerAcceptorMessage::AppendRequest(append_request) => {
                self.flush_pending = true;
                self.process_sk_msg(
                    tli,
                    &ProposerAcceptorMessage::NoFlushAppendRequest(append_request),
                )?;
            }
            other => {
                self.process_sk_msg(tli, &other)?;
            }
        }

        Ok(())
    }

    /// Process FlushWAL if needed.
    // TODO: add extra flushes, to verify that extra flushes don't break anything
    fn flush(&mut self, global: &mut GlobalMap) -> Result<()> {
        if !self.flush_pending {
            return Ok(());
        }
        self.flush_pending = false;
        let shared_state = global.get(&self.ttid);
        self.process_sk_msg(shared_state, &ProposerAcceptorMessage::FlushWAL)
    }

    /// Make safekeeper process a message and send a reply to the TCP
    fn process_sk_msg(
        &mut self,
        shared_state: &mut SharedState,
        msg: &ProposerAcceptorMessage,
    ) -> Result<()> {
        let mut reply = shared_state.sk.process_msg(msg)?;
        if let Some(reply) = &mut reply {
            // // if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn
            // if let AcceptorProposerMessage::AppendResponse(ref mut resp) = reply {
            //     // TODO:
            // }

            let mut buf = BytesMut::with_capacity(128);
            reply.serialize(&mut buf)?;

            self.tcp.send(AnyMessage::Bytes(buf.into()));
        }
        Ok(())
    }
}

impl Drop for ConnState {
    fn drop(&mut self) {
        debug!("dropping conn: {:?}", self.tcp);
        if !std::thread::panicking() {
            self.tcp.close();
        }
        // TODO: clean up non-fsynced WAL
    }
}
38
libs/walproposer/src/simtest/simple_client.rs
Normal file
38
libs/walproposer/src/simtest/simple_client.rs
Normal file
@@ -0,0 +1,38 @@
use std::sync::Arc;

use safekeeper::{
    simlib::{
        network::{Delay, NetworkOptions},
        world::World,
    },
    simtest::{start_simulation, Options},
};

use crate::{bindings::RunClientC, c_context};

#[test]
fn run_rust_c_test() {
    let delay = Delay {
        min: 1,
        max: 5,
        fail_prob: 0.5,
    };

    let network = NetworkOptions {
        keepalive_timeout: Some(50),
        connect_delay: delay.clone(),
        send_delay: delay.clone(),
    };
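    // Note: delays and timeouts here are in simulated time units (the world
    // clock advanced by the simulator), and `fail_prob` is presumably the
    // chance that a given connect/send attempt is dropped by the simulated
    // network.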

    let u32_data: [u32; 5] = [1, 2, 3, 4, 5];

    let world = Arc::new(World::new(1337, Arc::new(network), c_context()));
    start_simulation(Options {
        world,
        time_limit: 1_000_000,
        client_fn: Box::new(move |_, server_id| unsafe {
            RunClientC(server_id);
        }),
        u32_data,
    });
}
234
libs/walproposer/src/simtest/storage.rs
Normal file
234
libs/walproposer/src/simtest/storage.rs
Normal file
@@ -0,0 +1,234 @@
use std::{ops::Deref, sync::Arc};

use anyhow::Result;
use bytes::{Buf, BytesMut};
use log::{debug, info};
use postgres_ffi::{waldecoder::WalStreamDecoder, XLogSegNo};
use safekeeper::{control_file, safekeeper::SafeKeeperState, wal_storage};
use utils::lsn::Lsn;

use super::disk::TimelineDisk;

pub struct DiskStateStorage {
    persisted_state: SafeKeeperState,
    disk: Arc<TimelineDisk>,
}

impl DiskStateStorage {
    pub fn new(disk: Arc<TimelineDisk>) -> Self {
        let guard = disk.state.lock();
        let state = guard.clone();
        drop(guard);
        DiskStateStorage {
            persisted_state: state,
            disk,
        }
    }
}

impl control_file::Storage for DiskStateStorage {
    fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
        self.persisted_state = s.clone();
        *self.disk.state.lock() = s.clone();
        Ok(())
    }
}

impl Deref for DiskStateStorage {
    type Target = SafeKeeperState;

    fn deref(&self) -> &Self::Target {
        &self.persisted_state
    }
}

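// DummyWalStore is an in-memory stand-in for WAL storage: it only remembers
// the last written LSN and never persists or re-reads any bytes.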
pub struct DummyWalStore {
    lsn: Lsn,
}

impl DummyWalStore {
    pub fn new() -> Self {
        DummyWalStore { lsn: Lsn::INVALID }
    }
}

impl wal_storage::Storage for DummyWalStore {
    fn flush_lsn(&self) -> Lsn {
        self.lsn
    }

    fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
        self.lsn = startpos + buf.len() as u64;
        Ok(())
    }

    fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
        self.lsn = end_pos;
        Ok(())
    }

    fn flush_wal(&mut self) -> Result<()> {
        Ok(())
    }

    fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
        Box::new(move |_segno_up_to: XLogSegNo| Ok(()))
    }

    fn get_metrics(&self) -> safekeeper::metrics::WalStorageMetrics {
        safekeeper::metrics::WalStorageMetrics::default()
    }
}

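// Invariant maintained by DiskWALStorage: flush_record_lsn <= write_record_lsn
// <= write_lsn. Bytes are first buffered (write_lsn), promoted once a complete
// record has been decoded (write_record_lsn), and only written to the
// simulated disk by flush_wal() (flush_record_lsn).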
pub struct DiskWALStorage {
    /// Written to disk, but possibly still in the cache and not fully persisted.
    /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
    write_lsn: Lsn,

    /// The LSN of the last WAL record written to disk. Still can be not fully flushed.
    write_record_lsn: Lsn,

    /// The LSN of the last WAL record flushed to disk.
    flush_record_lsn: Lsn,

    /// Decoder is required for detecting boundaries of WAL records.
    decoder: WalStreamDecoder,

    unflushed_bytes: BytesMut,

    disk: Arc<TimelineDisk>,
}

impl DiskWALStorage {
    pub fn new(disk: Arc<TimelineDisk>, state: &SafeKeeperState) -> Result<Self> {
        let write_lsn = if state.commit_lsn == Lsn(0) {
            Lsn(0)
        } else {
            Self::find_end_of_wal(disk.clone(), state.commit_lsn)?
        };

        let flush_lsn = write_lsn;
        Ok(DiskWALStorage {
            write_lsn,
            write_record_lsn: flush_lsn,
            flush_record_lsn: flush_lsn,
            decoder: WalStreamDecoder::new(flush_lsn, 15),
            unflushed_bytes: BytesMut::new(),
            disk,
        })
    }

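    // Scans the simulated WAL from `start_lsn` in 8 KiB chunks, feeding them
    // into a fresh WalStreamDecoder; the end of WAL is the end LSN of the last
    // record that still decodes, and the first decode error ends the scan.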
    fn find_end_of_wal(disk: Arc<TimelineDisk>, start_lsn: Lsn) -> Result<Lsn> {
        let mut buf = [0; 8192];
        let mut pos = start_lsn.0;
        let mut decoder = WalStreamDecoder::new(start_lsn, 15);
        let mut result = start_lsn;
        loop {
            disk.wal.lock().read(pos, &mut buf);
            pos += buf.len() as u64;
            decoder.feed_bytes(&buf);

            loop {
                match decoder.poll_decode() {
                    Ok(Some(record)) => result = record.0,
                    Err(e) => {
                        debug!(
                            "find_end_of_wal reached end at {:?}, decode error: {:?}",
                            result, e
                        );
                        return Ok(result);
                    }
                    Ok(None) => break, // need more data
                }
            }
        }
    }
}

impl wal_storage::Storage for DiskWALStorage {
    fn flush_lsn(&self) -> Lsn {
        self.flush_record_lsn
    }

    fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
        if self.write_lsn != startpos {
            panic!("write_wal called with wrong startpos");
        }

        self.unflushed_bytes.extend_from_slice(buf);
        self.write_lsn += buf.len() as u64;

        if self.decoder.available() != startpos {
            info!(
                "restart decoder from {} to {}",
                self.decoder.available(),
                startpos,
            );
            self.decoder = WalStreamDecoder::new(startpos, 15);
        }
        self.decoder.feed_bytes(buf);
        loop {
            match self.decoder.poll_decode()? {
                None => break, // no full record yet
                Some((lsn, _rec)) => {
                    self.write_record_lsn = lsn;
                }
            }
        }

        Ok(())
    }

    fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
        if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
            panic!(
                "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
                self.write_lsn, end_pos
            );
        }

        self.flush_wal()?;

        // write zeroes to disk from end_pos until self.write_lsn
        let buf = [0; 8192];
        let mut pos = end_pos.0;
        while pos < self.write_lsn.0 {
            self.disk.wal.lock().write(pos, &buf);
            pos += buf.len() as u64;
        }

        self.write_lsn = end_pos;
        self.write_record_lsn = end_pos;
        self.flush_record_lsn = end_pos;
        self.unflushed_bytes.clear();
        self.decoder = WalStreamDecoder::new(end_pos, 15);

        Ok(())
    }

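    // flush_wal only persists up to the last complete record (write_record_lsn);
    // the tail of a partially written record stays in `unflushed_bytes` until
    // the rest of that record arrives.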
    fn flush_wal(&mut self) -> Result<()> {
        if self.flush_record_lsn == self.write_record_lsn {
            // no need to do extra flush
            return Ok(());
        }

        let num_bytes = self.write_record_lsn.0 - self.flush_record_lsn.0;

        self.disk.wal.lock().write(
            self.flush_record_lsn.0,
            &self.unflushed_bytes[..num_bytes as usize],
        );
        self.unflushed_bytes.advance(num_bytes as usize);
        self.flush_record_lsn = self.write_record_lsn;

        Ok(())
    }

    fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
        Box::new(move |_segno_up_to: XLogSegNo| Ok(()))
    }

    fn get_metrics(&self) -> safekeeper::metrics::WalStorageMetrics {
        safekeeper::metrics::WalStorageMetrics::default()
    }
}
610
libs/walproposer/src/simtest/util.rs
Normal file
610
libs/walproposer/src/simtest/util.rs
Normal file
@@ -0,0 +1,610 @@
|
||||
use std::{ffi::CString, path::Path, str::FromStr, sync::Arc, collections::HashMap};
|
||||
|
||||
use rand::{Rng, SeedableRng};
|
||||
use safekeeper::simlib::{
|
||||
network::{Delay, NetworkOptions},
|
||||
proto::AnyMessage,
|
||||
time::EmptyEvent,
|
||||
world::World,
|
||||
world::{Node, NodeEvent, SEvent, NodeId},
|
||||
};
|
||||
use tracing::{debug, error, info, warn};
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
use crate::{
|
||||
bindings::{
|
||||
neon_tenant_walproposer, neon_timeline_walproposer, sim_redo_start_lsn, syncSafekeepers,
|
||||
wal_acceptor_connection_timeout, wal_acceptor_reconnect_timeout, wal_acceptors_list,
|
||||
MyInsertRecord, WalProposerCleanup, WalProposerRust,
|
||||
},
|
||||
c_context,
|
||||
simtest::{
|
||||
log::{init_logger, SimClock},
|
||||
safekeeper::run_server,
|
||||
},
|
||||
};
|
||||
|
||||
use super::disk::Disk;
|
||||
|
||||
pub struct SkNode {
|
||||
pub node: Arc<Node>,
|
||||
pub id: u32,
|
||||
pub disk: Arc<Disk>,
|
||||
}
|
||||
|
||||
impl SkNode {
|
||||
pub fn new(node: Arc<Node>) -> Self {
|
||||
let disk = Arc::new(Disk::new());
|
||||
let res = Self {
|
||||
id: node.id,
|
||||
node,
|
||||
disk,
|
||||
};
|
||||
res.launch();
|
||||
res
|
||||
}
|
||||
|
||||
pub fn launch(&self) {
|
||||
let id = self.id;
|
||||
let disk = self.disk.clone();
|
||||
// start the server thread
|
||||
self.node.launch(move |os| {
|
||||
let res = run_server(os, disk);
|
||||
debug!("server {} finished: {:?}", id, res);
|
||||
});
|
||||
}
|
||||
|
||||
pub fn restart(&self) {
|
||||
self.node.crash_stop();
|
||||
self.launch();
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TestConfig {
|
||||
pub network: NetworkOptions,
|
||||
pub timeout: u64,
|
||||
pub clock: Option<SimClock>,
|
||||
}
|
||||
|
||||
impl TestConfig {
|
||||
pub fn new(clock: Option<SimClock>) -> Self {
|
||||
Self {
|
||||
network: NetworkOptions {
|
||||
keepalive_timeout: Some(2000),
|
||||
connect_delay: Delay {
|
||||
min: 1,
|
||||
max: 5,
|
||||
fail_prob: 0.0,
|
||||
},
|
||||
send_delay: Delay {
|
||||
min: 1,
|
||||
max: 5,
|
||||
fail_prob: 0.0,
|
||||
},
|
||||
},
|
||||
timeout: 1_000 * 10,
|
||||
clock,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(&self, seed: u64) -> Test {
|
||||
let world = Arc::new(World::new(
|
||||
seed,
|
||||
Arc::new(self.network.clone()),
|
||||
c_context(),
|
||||
));
|
||||
world.register_world();
|
||||
|
||||
if let Some(clock) = &self.clock {
|
||||
clock.set_world(world.clone());
|
||||
}
|
||||
|
||||
let servers = [
|
||||
SkNode::new(world.new_node()),
|
||||
SkNode::new(world.new_node()),
|
||||
SkNode::new(world.new_node()),
|
||||
];
|
||||
|
||||
let server_ids = [servers[0].id, servers[1].id, servers[2].id];
|
||||
|
||||
let safekeepers_guc = server_ids.map(|id| format!("node:{}", id)).join(",");
|
||||
let ttid = TenantTimelineId::generate();
|
||||
|
||||
// wait init for all servers
|
||||
world.await_all();
|
||||
|
||||
// clean up pgdata directory
|
||||
self.init_pgdata();
|
||||
|
||||
Test {
|
||||
world,
|
||||
servers,
|
||||
safekeepers_guc,
|
||||
ttid,
|
||||
timeout: self.timeout,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn init_pgdata(&self) {
|
||||
let pgdata = Path::new("/home/admin/simulator/libs/walproposer/pgdata");
|
||||
if pgdata.exists() {
|
||||
std::fs::remove_dir_all(pgdata).unwrap();
|
||||
}
|
||||
std::fs::create_dir(pgdata).unwrap();
|
||||
|
||||
// create empty pg_wal and pg_notify subdirs
|
||||
std::fs::create_dir(pgdata.join("pg_wal")).unwrap();
|
||||
std::fs::create_dir(pgdata.join("pg_notify")).unwrap();
|
||||
|
||||
// write postgresql.conf
|
||||
let mut conf = std::fs::File::create(pgdata.join("postgresql.conf")).unwrap();
|
||||
let content = "
|
||||
wal_log_hints=off
|
||||
hot_standby=on
|
||||
fsync=off
|
||||
wal_level=replica
|
||||
restart_after_crash=off
|
||||
shared_preload_libraries=neon
|
||||
neon.pageserver_connstring=''
|
||||
neon.tenant_id=cc6e67313d57283bad411600fbf5c142
|
||||
neon.timeline_id=de6fa815c1e45aa61491c3d34c4eb33e
|
||||
synchronous_standby_names=walproposer
|
||||
neon.safekeepers='node:1,node:2,node:3'
|
||||
max_connections=100
|
||||
";
|
||||
|
||||
std::io::Write::write_all(&mut conf, content.as_bytes()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Test {
|
||||
pub world: Arc<World>,
|
||||
pub servers: [SkNode; 3],
|
||||
pub safekeepers_guc: String,
|
||||
pub ttid: TenantTimelineId,
|
||||
pub timeout: u64,
|
||||
}
|
||||
|
||||
impl Test {
|
||||
fn launch_sync(&self) -> Arc<Node> {
|
||||
let client_node = self.world.new_node();
|
||||
debug!("sync-safekeepers started at node {}", client_node.id);
|
||||
|
||||
// start the client thread
|
||||
let guc = self.safekeepers_guc.clone();
|
||||
let ttid = self.ttid.clone();
|
||||
client_node.launch(move |_| {
|
||||
let list = CString::new(guc).unwrap();
|
||||
|
||||
unsafe {
|
||||
WalProposerCleanup();
|
||||
|
||||
syncSafekeepers = true;
|
||||
wal_acceptors_list = list.into_raw();
|
||||
wal_acceptor_reconnect_timeout = 1000;
|
||||
wal_acceptor_connection_timeout = 5000;
|
||||
neon_tenant_walproposer =
|
||||
CString::new(ttid.tenant_id.to_string()).unwrap().into_raw();
|
||||
neon_timeline_walproposer = CString::new(ttid.timeline_id.to_string())
|
||||
.unwrap()
|
||||
.into_raw();
|
||||
WalProposerRust();
|
||||
}
|
||||
});
|
||||
|
||||
self.world.await_all();
|
||||
|
||||
client_node
|
||||
}
|
||||
|
||||
pub fn sync_safekeepers(&self) -> anyhow::Result<Lsn> {
|
||||
let client_node = self.launch_sync();
|
||||
|
||||
// poll until exit or timeout
|
||||
let time_limit = self.timeout;
|
||||
while self.world.step() && self.world.now() < time_limit && !client_node.is_finished() {}
|
||||
|
||||
if !client_node.is_finished() {
|
||||
anyhow::bail!("timeout or idle stuck");
|
||||
}
|
||||
|
||||
let res = client_node.result.lock().clone();
|
||||
if res.0 != 0 {
|
||||
anyhow::bail!("non-zero exitcode: {:?}", res);
|
||||
}
|
||||
let lsn = Lsn::from_str(&res.1)?;
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
pub fn launch_walproposer(&self, lsn: Lsn) -> WalProposer {
|
||||
let client_node = self.world.new_node();
|
||||
|
||||
let lsn = if lsn.0 == 0 {
|
||||
// usual LSN after basebackup
|
||||
Lsn(21623024)
|
||||
} else {
|
||||
lsn
|
||||
};
|
||||
|
||||
// start the client thread
|
||||
let guc = self.safekeepers_guc.clone();
|
||||
let ttid = self.ttid.clone();
|
||||
client_node.launch(move |_| {
|
||||
let list = CString::new(guc).unwrap();
|
||||
|
||||
unsafe {
|
||||
WalProposerCleanup();
|
||||
|
||||
sim_redo_start_lsn = lsn.0;
|
||||
syncSafekeepers = false;
|
||||
wal_acceptors_list = list.into_raw();
|
||||
wal_acceptor_reconnect_timeout = 1000;
|
||||
wal_acceptor_connection_timeout = 5000;
|
||||
neon_tenant_walproposer =
|
||||
CString::new(ttid.tenant_id.to_string()).unwrap().into_raw();
|
||||
neon_timeline_walproposer = CString::new(ttid.timeline_id.to_string())
|
||||
.unwrap()
|
||||
.into_raw();
|
||||
WalProposerRust();
|
||||
}
|
||||
});
|
||||
|
||||
self.world.await_all();
|
||||
|
||||
WalProposer {
|
||||
node: client_node,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn poll_for_duration(&self, duration: u64) {
|
||||
let time_limit = std::cmp::min(self.world.now() + duration, self.timeout);
|
||||
while self.world.step() && self.world.now() < time_limit {}
|
||||
}
|
||||
|
||||
pub fn run_schedule(&self, schedule: &Schedule) -> anyhow::Result<()> {
|
||||
{
|
||||
let empty_event = Box::new(EmptyEvent);
|
||||
|
||||
let now = self.world.now();
|
||||
for (time, _) in schedule {
|
||||
if *time < now {
|
||||
continue;
|
||||
}
|
||||
self.world.schedule(*time - now, empty_event.clone())
|
||||
}
|
||||
}
|
||||
|
||||
let mut wait_node = self.launch_sync();
|
||||
// fake walproposer
|
||||
let mut wp = WalProposer {
|
||||
node: wait_node.clone(),
|
||||
};
|
||||
let mut sync_in_progress = true;
|
||||
|
||||
let mut skipped_tx = 0;
|
||||
let mut started_tx = 0;
|
||||
|
||||
let mut schedule_ptr = 0;
|
||||
|
||||
loop {
|
||||
if sync_in_progress && wait_node.is_finished() {
|
||||
let res = wait_node.result.lock().clone();
|
||||
if res.0 != 0 {
|
||||
warn!("sync non-zero exitcode: {:?}", res);
|
||||
debug!("restarting walproposer");
|
||||
wait_node = self.launch_sync();
|
||||
continue;
|
||||
}
|
||||
let lsn = Lsn::from_str(&res.1)?;
|
||||
debug!("sync-safekeepers finished at LSN {}", lsn);
|
||||
wp = self.launch_walproposer(lsn);
|
||||
wait_node = wp.node.clone();
|
||||
debug!("walproposer started at node {}", wait_node.id);
|
||||
sync_in_progress = false;
|
||||
}
|
||||
|
||||
let now = self.world.now();
|
||||
while schedule_ptr < schedule.len() && schedule[schedule_ptr].0 <= now {
|
||||
if now != schedule[schedule_ptr].0 {
|
||||
warn!("skipped event {:?} at {}", schedule[schedule_ptr], now);
|
||||
}
|
||||
|
||||
let action = &schedule[schedule_ptr].1;
|
||||
match action {
|
||||
TestAction::WriteTx(size) => {
|
||||
if !sync_in_progress && !wait_node.is_finished() {
|
||||
started_tx += *size;
|
||||
wp.write_tx(*size);
|
||||
debug!("written {} transactions", size);
|
||||
} else {
|
||||
skipped_tx += size;
|
||||
debug!("skipped {} transactions", size);
|
||||
}
|
||||
}
|
||||
TestAction::RestartSafekeeper(id) => {
|
||||
debug!("restarting safekeeper {}", id);
|
||||
self.servers[*id as usize].restart();
|
||||
}
|
||||
TestAction::RestartWalProposer => {
|
||||
debug!("restarting walproposer");
|
||||
wait_node.crash_stop();
|
||||
sync_in_progress = true;
|
||||
wait_node = self.launch_sync();
|
||||
}
|
||||
}
|
||||
schedule_ptr += 1;
|
||||
}
|
||||
|
||||
if schedule_ptr == schedule.len() {
|
||||
break;
|
||||
}
|
||||
let next_event_time = schedule[schedule_ptr].0;
|
||||
|
||||
// poll until the next event
|
||||
if wait_node.is_finished() {
|
||||
while self.world.step() && self.world.now() < next_event_time {}
|
||||
} else {
|
||||
while self.world.step()
|
||||
&& self.world.now() < next_event_time
|
||||
&& !wait_node.is_finished()
|
||||
{}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("finished schedule");
|
||||
debug!("skipped_tx: {}", skipped_tx);
|
||||
debug!("started_tx: {}", started_tx);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WalProposer {
|
||||
pub node: Arc<Node>,
|
||||
}
|
||||
|
||||
impl WalProposer {
|
||||
pub fn write_tx(&mut self, cnt: usize) {
|
||||
self.node
|
||||
.network_chan()
|
||||
.send(NodeEvent::Internal(AnyMessage::Just32(cnt as u32)));
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
self.node.crash_stop();
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum TestAction {
|
||||
WriteTx(usize),
|
||||
RestartSafekeeper(usize),
|
||||
RestartWalProposer,
|
||||
}
|
||||
|
||||
pub type Schedule = Vec<(u64, TestAction)>;
|
||||
|
||||
pub fn generate_schedule(seed: u64) -> Schedule {
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
|
||||
let mut schedule = Vec::new();
|
||||
let mut time = 0;
|
||||
|
||||
let cnt = rng.gen_range(1..100);
|
||||
|
||||
for _ in 0..cnt {
|
||||
time += rng.gen_range(0..500);
|
||||
let action = match rng.gen_range(0..3) {
|
||||
0 => TestAction::WriteTx(rng.gen_range(1..10)),
|
||||
1 => TestAction::RestartSafekeeper(rng.gen_range(0..3)),
|
||||
2 => TestAction::RestartWalProposer,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
schedule.push((time, action));
|
||||
}
|
||||
|
||||
schedule
|
||||
}
|
||||
|
||||
pub fn generate_network_opts(seed: u64) -> NetworkOptions {
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
|
||||
|
||||
let timeout = rng.gen_range(100..2000);
|
||||
let max_delay = rng.gen_range(1..2*timeout);
|
||||
let min_delay = rng.gen_range(1..=max_delay);
|
||||
|
||||
let max_fail_prob = rng.gen_range(0.0..0.9);
|
||||
let connect_fail_prob = rng.gen_range(0.0..max_fail_prob);
|
||||
let send_fail_prob = rng.gen_range(0.0..connect_fail_prob);
|
||||
|
||||
NetworkOptions {
|
||||
keepalive_timeout: Some(timeout),
|
||||
connect_delay: Delay {
|
||||
min: min_delay,
|
||||
max: max_delay,
|
||||
fail_prob: connect_fail_prob,
|
||||
},
|
||||
send_delay: Delay {
|
||||
min: min_delay,
|
||||
max: max_delay,
|
||||
fail_prob: send_fail_prob,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug,Clone,PartialEq,Eq)]
|
||||
enum NodeKind {
|
||||
Unknown,
|
||||
Safekeeper,
|
||||
WalProposer,
|
||||
}
|
||||
|
||||
impl Default for NodeKind {
|
||||
fn default() -> Self {
|
||||
Self::Unknown
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct NodeInfo {
|
||||
kind: NodeKind,
|
||||
|
||||
// walproposer
|
||||
is_sync: bool,
|
||||
term: u64,
|
||||
epoch_lsn: u64,
|
||||
|
||||
// safekeeper
|
||||
commit_lsn: u64,
|
||||
flush_lsn: u64,
|
||||
}
|
||||
|
||||
impl NodeInfo {
|
||||
fn init_kind(&mut self, kind: NodeKind) {
|
||||
if self.kind == NodeKind::Unknown {
|
||||
self.kind = kind;
|
||||
} else {
|
||||
assert!(self.kind == kind);
|
||||
}
|
||||
}
|
||||
|
||||
fn started(&mut self, data: &str) {
|
||||
let mut parts = data.split(';');
|
||||
assert!(parts.next().unwrap() == "started");
|
||||
match parts.next().unwrap() {
|
||||
"safekeeper" => {
|
||||
self.init_kind(NodeKind::Safekeeper);
|
||||
}
|
||||
"walproposer" => {
|
||||
self.init_kind(NodeKind::WalProposer);
|
||||
let is_sync: u8 = parts.next().unwrap().parse().unwrap();
|
||||
self.is_sync = is_sync != 0;
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug,Default)]
|
||||
struct GlobalState {
|
||||
nodes: Vec<NodeInfo>,
|
||||
commit_lsn: u64,
|
||||
write_lsn: u64,
|
||||
max_write_lsn: u64,
|
||||
|
||||
written_wal: u64,
|
||||
written_records: u64,
|
||||
}
|
||||
|
||||
impl GlobalState {
|
||||
fn new() -> Self {
|
||||
Default::default()
|
||||
}
|
||||
|
||||
fn get(&mut self, id: u32) -> &mut NodeInfo {
|
||||
let id = id as usize;
|
||||
if id >= self.nodes.len() {
|
||||
self.nodes.resize(id + 1, NodeInfo::default());
|
||||
}
|
||||
&mut self.nodes[id]
|
||||
}
|
||||
}
|
||||
|
||||
pub fn validate_events(events: Vec<SEvent>) {
|
||||
const INITDB_LSN: u64 = 21623024;
|
||||
|
||||
let hook = std::panic::take_hook();
|
||||
scopeguard::defer_on_success! {
|
||||
std::panic::set_hook(hook);
|
||||
};
|
||||
|
||||
let mut state = GlobalState::new();
|
||||
state.max_write_lsn = INITDB_LSN;
|
||||
|
||||
for event in events {
|
||||
debug!("{:?}", event);
|
||||
|
||||
let node = state.get(event.node);
|
||||
if event.data.starts_with("started;") {
|
||||
node.started(&event.data);
|
||||
continue;
|
||||
}
|
||||
assert!(node.kind != NodeKind::Unknown);
|
||||
|
||||
// drop reference to unlock state
|
||||
let mut node = node.clone();
|
||||
|
||||
let mut parts = event.data.split(';');
|
||||
match node.kind {
|
||||
NodeKind::Safekeeper => {
|
||||
match parts.next().unwrap() {
|
||||
"tli_loaded" => {
|
||||
let flush_lsn: u64 = parts.next().unwrap().parse().unwrap();
|
||||
let commit_lsn: u64 = parts.next().unwrap().parse().unwrap();
|
||||
node.flush_lsn = flush_lsn;
|
||||
node.commit_lsn = commit_lsn;
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
NodeKind::WalProposer => {
|
||||
match parts.next().unwrap() {
|
||||
"prop_elected" => {
|
||||
let prop_lsn: u64 = parts.next().unwrap().parse().unwrap();
|
||||
let prop_term: u64 = parts.next().unwrap().parse().unwrap();
|
||||
let prev_lsn: u64 = parts.next().unwrap().parse().unwrap();
|
||||
let prev_term: u64 = parts.next().unwrap().parse().unwrap();
|
||||
|
||||
assert!(prop_lsn >= prev_lsn);
|
||||
assert!(prop_term >= prev_term);
|
||||
|
||||
assert!(prop_lsn >= state.commit_lsn);
|
||||
|
||||
if prop_lsn > state.write_lsn {
|
||||
assert!(prop_lsn <= state.max_write_lsn);
|
||||
debug!("moving write_lsn up from {} to {}", state.write_lsn, prop_lsn);
|
||||
state.write_lsn = prop_lsn;
|
||||
}
|
||||
if prop_lsn < state.write_lsn {
|
||||
debug!("moving write_lsn down from {} to {}", state.write_lsn, prop_lsn);
|
||||
state.write_lsn = prop_lsn;
|
||||
}
|
||||
|
||||
node.epoch_lsn = prop_lsn;
|
||||
node.term = prop_term;
|
||||
}
|
||||
"write_wal" => {
|
||||
assert!(!node.is_sync);
|
||||
let start_lsn: u64 = parts.next().unwrap().parse().unwrap();
|
||||
let end_lsn: u64 = parts.next().unwrap().parse().unwrap();
|
||||
let cnt: u64 = parts.next().unwrap().parse().unwrap();
|
||||
|
||||
let size = end_lsn - start_lsn;
|
||||
state.written_wal += size;
|
||||
state.written_records += cnt;
|
||||
|
||||
// TODO: If we allow writing WAL before winning the election
|
||||
|
||||
assert!(start_lsn >= state.commit_lsn);
|
||||
assert!(end_lsn >= start_lsn);
|
||||
assert!(start_lsn == state.write_lsn);
|
||||
state.write_lsn = end_lsn;
|
||||
|
||||
if end_lsn > state.max_write_lsn {
|
||||
state.max_write_lsn = end_lsn;
|
||||
}
|
||||
}
|
||||
"commit_lsn" => {
|
||||
let lsn: u64 = parts.next().unwrap().parse().unwrap();
|
||||
assert!(lsn >= state.commit_lsn);
|
||||
state.commit_lsn = lsn;
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
// update the node in the state struct
|
||||
*state.get(event.node) = node;
|
||||
}
|
||||
}
|
||||
265
libs/walproposer/src/simtest/wp_sk.rs
Normal file
265
libs/walproposer/src/simtest/wp_sk.rs
Normal file
@@ -0,0 +1,265 @@
|
||||
use std::{ffi::CString, path::Path, str::FromStr, sync::Arc};
|
||||
|
||||
use rand::Rng;
|
||||
use safekeeper::simlib::{
|
||||
network::{Delay, NetworkOptions},
|
||||
proto::AnyMessage,
|
||||
world::World,
|
||||
world::{Node, NodeEvent},
|
||||
};
|
||||
use tracing::{info, warn};
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
use crate::{
|
||||
bindings::{
|
||||
neon_tenant_walproposer, neon_timeline_walproposer, sim_redo_start_lsn, syncSafekeepers,
|
||||
wal_acceptor_connection_timeout, wal_acceptor_reconnect_timeout, wal_acceptors_list,
|
||||
MyInsertRecord, WalProposerCleanup, WalProposerRust,
|
||||
},
|
||||
c_context,
|
||||
simtest::{
|
||||
log::{init_logger, SimClock},
|
||||
safekeeper::run_server,
|
||||
util::{generate_schedule, TestConfig, generate_network_opts, validate_events},
|
||||
}, enable_debug,
|
||||
};
|
||||
|
||||
use super::{
|
||||
disk::Disk,
|
||||
util::{Schedule, TestAction},
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn sync_empty_safekeepers() {
|
||||
let clock = init_logger();
|
||||
let mut config = TestConfig::new(Some(clock));
|
||||
let test = config.start(1337);
|
||||
|
||||
let lsn = test.sync_safekeepers().unwrap();
|
||||
assert_eq!(lsn, Lsn(0));
|
||||
info!("Sucessfully synced empty safekeepers at 0/0");
|
||||
|
||||
let lsn = test.sync_safekeepers().unwrap();
|
||||
assert_eq!(lsn, Lsn(0));
|
||||
info!("Sucessfully synced (again) empty safekeepers at 0/0");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn run_walproposer_generate_wal() {
|
||||
let clock = init_logger();
|
||||
let mut config = TestConfig::new(Some(clock));
|
||||
// config.network.timeout = Some(250);
|
||||
let test = config.start(1337);
|
||||
|
||||
let lsn = test.sync_safekeepers().unwrap();
|
||||
assert_eq!(lsn, Lsn(0));
|
||||
info!("Sucessfully synced empty safekeepers at 0/0");
|
||||
|
||||
let mut wp = test.launch_walproposer(lsn);
|
||||
|
||||
test.poll_for_duration(30);
|
||||
|
||||
for i in 0..100 {
|
||||
wp.write_tx(1);
|
||||
test.poll_for_duration(5);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn crash_safekeeper() {
|
||||
let clock = init_logger();
|
||||
let mut config = TestConfig::new(Some(clock));
|
||||
// config.network.timeout = Some(250);
|
||||
let test = config.start(1337);
|
||||
|
||||
let lsn = test.sync_safekeepers().unwrap();
|
||||
assert_eq!(lsn, Lsn(0));
|
||||
info!("Sucessfully synced empty safekeepers at 0/0");
|
||||
|
||||
let mut wp = test.launch_walproposer(lsn);
|
||||
|
||||
test.poll_for_duration(30);
|
||||
|
||||
wp.write_tx(3);
|
||||
|
||||
test.servers[0].restart();
|
||||
|
||||
test.poll_for_duration(100);
|
||||
test.poll_for_duration(1000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_simple_restart() {
|
||||
let clock = init_logger();
|
||||
let mut config = TestConfig::new(Some(clock));
|
||||
// config.network.timeout = Some(250);
|
||||
let test = config.start(1337);
|
||||
|
||||
let lsn = test.sync_safekeepers().unwrap();
|
||||
assert_eq!(lsn, Lsn(0));
|
||||
info!("Sucessfully synced empty safekeepers at 0/0");
|
||||
|
||||
let mut wp = test.launch_walproposer(lsn);
|
||||
|
||||
test.poll_for_duration(30);
|
||||
|
||||
wp.write_tx(3);
|
||||
test.poll_for_duration(100);
|
||||
|
||||
wp.stop();
|
||||
drop(wp);
|
||||
|
||||
let lsn = test.sync_safekeepers().unwrap();
|
||||
info!("Sucessfully synced safekeepers at {}", lsn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_simple_schedule() -> anyhow::Result<()> {
|
||||
let clock = init_logger();
|
||||
let mut config = TestConfig::new(Some(clock));
|
||||
config.network.keepalive_timeout = Some(100);
|
||||
let test = config.start(1337);
|
||||
|
||||
let schedule: Schedule = vec![
|
||||
(0, TestAction::RestartWalProposer),
|
||||
(50, TestAction::WriteTx(5)),
|
||||
(100, TestAction::RestartSafekeeper(0)),
|
||||
(100, TestAction::WriteTx(5)),
|
||||
(110, TestAction::RestartSafekeeper(1)),
|
||||
(110, TestAction::WriteTx(5)),
|
||||
(120, TestAction::RestartSafekeeper(2)),
|
||||
(120, TestAction::WriteTx(5)),
|
||||
(201, TestAction::RestartWalProposer),
|
||||
(251, TestAction::RestartSafekeeper(0)),
|
||||
(251, TestAction::RestartSafekeeper(1)),
|
||||
(251, TestAction::RestartSafekeeper(2)),
|
||||
(251, TestAction::WriteTx(5)),
|
||||
(255, TestAction::WriteTx(5)),
|
||||
(1000, TestAction::WriteTx(5)),
|
||||
];
|
||||
|
||||
test.run_schedule(&schedule)?;
|
||||
info!("Test finished, stopping all threads");
|
||||
test.world.deallocate();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_many_tx() -> anyhow::Result<()> {
|
||||
enable_debug();
|
||||
let clock = init_logger();
|
||||
let mut config = TestConfig::new(Some(clock));
|
||||
let test = config.start(1337);
|
||||
|
||||
let mut schedule: Schedule = vec![];
|
||||
for i in 0..100 {
|
||||
schedule.push((i * 10, TestAction::WriteTx(10)));
|
||||
}
|
||||
|
||||
test.run_schedule(&schedule)?;
|
||||
info!("Test finished, stopping all threads");
|
||||
test.world.stop_all();
|
||||
|
||||
let events = test.world.take_events();
|
||||
info!("Events: {:?}", events);
|
||||
let last_commit_lsn = events
|
||||
.iter()
|
||||
.filter_map(|event| {
|
||||
if event.data.starts_with("commit_lsn;") {
|
||||
let lsn: u64 = event.data.split(';').nth(1).unwrap().parse().unwrap();
|
||||
return Some(lsn);
|
||||
}
|
||||
None
|
||||
})
|
||||
.last()
|
||||
.unwrap();
|
||||
|
||||
let initdb_lsn = 21623024;
|
||||
let diff = last_commit_lsn - initdb_lsn;
|
||||
info!("Last commit lsn: {}, diff: {}", last_commit_lsn, diff);
|
||||
assert!(diff > 1000 * 8);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_random_schedules() -> anyhow::Result<()> {
|
||||
let clock = init_logger();
|
||||
let mut config = TestConfig::new(Some(clock));
|
||||
config.network.keepalive_timeout = Some(100);
|
||||
|
||||
for i in 0..30000 {
|
||||
let seed: u64 = rand::thread_rng().gen();
|
||||
config.network = generate_network_opts(seed);
|
||||
|
||||
let test = config.start(seed);
|
||||
warn!("Running test with seed {}", seed);
|
||||
|
||||
let schedule = generate_schedule(seed);
|
||||
test.run_schedule(&schedule).unwrap();
|
||||
validate_events(test.world.take_events());
|
||||
test.world.deallocate();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_one_schedule() -> anyhow::Result<()> {
|
||||
enable_debug();
|
||||
let clock = init_logger();
|
||||
let mut config = TestConfig::new(Some(clock));
|
||||
config.network.keepalive_timeout = Some(100);
|
||||
|
||||
// let seed = 6762900106769428342;
|
||||
// let test = config.start(seed);
|
||||
// warn!("Running test with seed {}", seed);
|
||||
|
||||
// let schedule = generate_schedule(seed);
|
||||
// info!("schedule: {:?}", schedule);
|
||||
// test.run_schedule(&schedule)?;
|
||||
// test.world.deallocate();
|
||||
|
||||
let seed = 3649773280641776194;
|
||||
config.network = generate_network_opts(seed);
|
||||
info!("network: {:?}", config.network);
|
||||
let test = config.start(seed);
|
||||
warn!("Running test with seed {}", seed);
|
||||
|
||||
let schedule = generate_schedule(seed);
|
||||
info!("schedule: {:?}", schedule);
|
||||
test.run_schedule(&schedule).unwrap();
|
||||
validate_events(test.world.take_events());
|
||||
test.world.deallocate();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_res_dealloc() -> anyhow::Result<()> {
|
||||
// enable_debug();
|
||||
let clock = init_logger();
|
||||
let mut config = TestConfig::new(Some(clock));
|
||||
|
||||
// print pid
|
||||
let pid = unsafe { libc::getpid() };
|
||||
info!("pid: {}", pid);
|
||||
|
||||
let seed = 123456;
|
||||
config.network = generate_network_opts(seed);
|
||||
let test = config.start(seed);
|
||||
warn!("Running test with seed {}", seed);
|
||||
|
||||
let schedule = generate_schedule(seed);
|
||||
info!("schedule: {:?}", schedule);
|
||||
test.run_schedule(&schedule).unwrap();
|
||||
test.world.stop_all();
|
||||
|
||||
let world = test.world.clone();
|
||||
drop(test);
|
||||
info!("world strong count: {}", Arc::strong_count(&world));
|
||||
world.deallocate();
|
||||
info!("world strong count: {}", Arc::strong_count(&world));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
31
libs/walproposer/src/test.rs
Normal file
31
libs/walproposer/src/test.rs
Normal file
@@ -0,0 +1,31 @@
use tracing::info;

use crate::bindings::{TestFunc, MyContextInit};

#[test]
fn test_rust_c_calls() {
    let res = std::thread::spawn(|| {
        let res = unsafe {
            MyContextInit();
            TestFunc(1, 2)
        };
        res
    }).join().unwrap();
    info!("res: {}", res);
}

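// The C calls run on dedicated threads: MyContextInit() (see
// libs/walproposer/test.c) initializes process-wide Postgres state behind a
// mutex, and the repeated spawn below presumably checks that initializing
// again from another thread is harmless.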
#[test]
fn test_sim_bindings() {
    std::thread::spawn(|| {
        unsafe {
            MyContextInit();
            TestFunc(1, 2)
        }
    }).join().unwrap();
    std::thread::spawn(|| {
        unsafe {
            MyContextInit();
            TestFunc(1, 2)
        }
    }).join().unwrap();
}
100
libs/walproposer/test.c
Normal file
100
libs/walproposer/test.c
Normal file
@@ -0,0 +1,100 @@
#include "bindgen_deps.h"
#include "rust_bindings.h"
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include "postgres.h"
#include "utils/memutils.h"
#include "utils/guc.h"
#include "miscadmin.h"
#include "common/pg_prng.h"

// From src/backend/main/main.c
const char *progname = "fakepostgres";

int TestFunc(int a, int b) {
    printf("TestFunc: %d + %d = %d\n", a, b, a + b);
    rust_function(0);
    elog(LOG, "postgres elog test");
    printf("After rust_function\n");
    return a + b;
}

// This is a quick experiment with rewriting existing Rust code in C.
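// The loop below resends replication cell (delivered + 1) until the server
// echoes that value back as a Just32 ack; a Closed event reopens the TCP
// connection and retries, so delivery is effectively at-least-once.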
void RunClientC(uint32_t serverId) {
    uint32_t clientId = sim_id();

    elog(LOG, "started client");

    int data_len = 5;

    int delivered = 0;
    int tcp = sim_open_tcp(serverId);
    while (delivered < data_len) {
        sim_msg_set_repl_cell(delivered+1, clientId, delivered);
        sim_tcp_send(tcp);

        Event event = sim_epoll_rcv(-1);
        switch (event.tag)
        {
            case Closed:
                elog(LOG, "connection closed");
                tcp = sim_open_tcp(serverId);
                break;

            case Message:
                Assert(event.any_message == Just32);
                uint32_t val;
                sim_msg_get_just_u32(&val);
                if (val == delivered + 1) {
                    delivered += 1;
                }
                break;

            default:
                Assert(false);
        }
    }
}

bool debug_enabled = false;

bool initializedMemoryContext = false;
// pthread_mutex_init(&lock, NULL)?
pthread_mutex_t lock;

void MyContextInit() {
    // initializes global variables, TODO how to make them thread-local?
    pthread_mutex_lock(&lock);
    if (!initializedMemoryContext) {
        initializedMemoryContext = true;
        MemoryContextInit();
        pg_prng_seed(&pg_global_prng_state, 0);

        setenv("PGDATA", "/home/admin/simulator/libs/walproposer/pgdata", 1);

        /*
         * Set default values for command-line options.
         */
        InitializeGUCOptions();

        /* Acquire configuration parameters */
        if (!SelectConfigFiles(NULL, progname))
            exit(1);

        if (debug_enabled) {
            log_min_messages = LOG;
        } else {
            log_min_messages = FATAL;
        }
        Log_line_prefix = "[%p] ";

        InitializeMaxBackends();
        ChangeToDataDir();
        CreateSharedMemoryAndSemaphores();
        SetInstallXLogFileSegmentActive();
        // CreateAuxProcessResourceOwner();
        // StartupXLOG();
    }
    pthread_mutex_unlock(&lock);
}
@@ -13,14 +13,13 @@ OBJS = \
	walproposer.o \
	walproposer_utils.o

PG_CPPFLAGS = -I$(libpq_srcdir)
SHLIB_LINK_INTERNAL = $(libpq)
PG_CPPFLAGS = -I$(libpq_srcdir) -DSIMLIB
PG_LIBS = $(libpq)

EXTENSION = neon
DATA = neon--1.0.sql
PGFILEDESC = "neon - cloud storage for PostgreSQL"


PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
1
pgxn/neon/rust_bindings.h
Symbolic link
1
pgxn/neon/rust_bindings.h
Symbolic link
@@ -0,0 +1 @@
../../libs/walproposer/rust_bindings.h
@@ -71,7 +71,7 @@
|
||||
#include "walproposer.h"
|
||||
#include "walproposer_utils.h"
|
||||
|
||||
static bool syncSafekeepers = false;
|
||||
bool syncSafekeepers = false;
|
||||
|
||||
char *wal_acceptors_list;
|
||||
int wal_acceptor_reconnect_timeout;
|
||||
@@ -84,6 +84,11 @@ char *neon_safekeeper_token_walproposer = NULL;
|
||||
|
||||
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
|
||||
|
||||
#ifdef SIMLIB
|
||||
#include "rust_bindings.h"
|
||||
#define GetCurrentTimestamp() ((TimestampTz) sim_now())
|
||||
#endif
|
||||
|
||||
static int n_safekeepers = 0;
|
||||
static int quorum = 0;
|
||||
static Safekeeper safekeeper[MAX_SAFEKEEPERS];
|
||||
@@ -316,6 +321,84 @@ nwp_shmem_startup_hook(void)
|
||||
WalproposerShmemInit();
|
||||
}
|
||||
|
||||
void WalProposerCleanup()
|
||||
{
|
||||
for (int i = 0; i < n_safekeepers; i++)
|
||||
{
|
||||
if (safekeeper[i].xlogreader)
|
||||
{
|
||||
XLogReaderFree(safekeeper[i].xlogreader);
|
||||
safekeeper[i].xlogreader = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
n_safekeepers = 0;
|
||||
quorum = 0;
|
||||
lastSentCommitLsn = 0;
|
||||
availableLsn = 0;
|
||||
lastSentCommitLsn = 0;
|
||||
truncateLsn = 0;
|
||||
propTerm = 0;
|
||||
propTermHistory.entries = NULL;
|
||||
propTermHistory.n_entries = 0;
|
||||
propEpochStartLsn = 0;
|
||||
donorEpoch = 0;
|
||||
donor = 0;
|
||||
timelineStartLsn = 0;
|
||||
n_votes = 0;
|
||||
n_connected = 0;
|
||||
last_reconnect_attempt = 0;
|
||||
|
||||
walprop_shared = palloc(WalproposerShmemSize());
|
||||
if (walprop_shared != NULL)
|
||||
{
|
||||
memset(walprop_shared, 0, WalproposerShmemSize());
|
||||
SpinLockInit(&walprop_shared->mutex);
|
||||
pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
|
||||
}
|
||||
|
||||
XLogWalPropClose(0);
|
||||
}
|
||||
|
||||
void InitMyInsert();
|
||||
|
||||
void WalProposerRust()
|
||||
{
|
||||
struct stat stat_buf;
|
||||
|
||||
walprop_log(LOG, "WalProposerRust");
|
||||
|
||||
InitMyInsert();
|
||||
|
||||
sim_log("started;walproposer;%d", (int) syncSafekeepers);
|
||||
|
||||
#if PG_VERSION_NUM < 150000
|
||||
ThisTimeLineID = 1;
|
||||
#endif
|
||||
|
||||
ChangeToDataDir();
|
||||
|
||||
/* Create pg_wal directory, if it doesn't exist */
|
||||
if (stat(XLOGDIR, &stat_buf) != 0)
|
||||
{
|
||||
ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR)));
|
||||
if (MakePGDirectory(XLOGDIR) < 0)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not create directory \"%s\": %m",
|
||||
XLOGDIR)));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
WalProposerInit(0, 0);
|
||||
|
||||
BackgroundWorkerUnblockSignals();
|
||||
|
||||
WalProposerStart();
|
||||
}
|
||||
|
||||
/*
|
||||
* WAL proposer bgworker entry point.
|
||||
*/
|
||||
@@ -377,6 +460,68 @@ WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos)
|
||||
BroadcastAppendRequest();
|
||||
}
|
||||
|
||||
#ifdef SIMLIB
|
||||
XLogRecPtr MyInsertRecord();
|
||||
|
||||
int
|
||||
SimWaitEventSetWait(Safekeeper **sk, long timeout, WaitEvent *occurred_events)
|
||||
{
|
||||
Event event = sim_epoll_peek(timeout);
|
||||
if (event.tag == Closed) {
|
||||
sim_epoll_rcv(0);
|
||||
for (int i = 0; i < n_safekeepers; i++) {
|
||||
if (safekeeper[i].conn && ((int64_t) walprop_socket(safekeeper[i].conn)) == event.tcp) {
|
||||
walprop_log(LOG, "connection to %s:%s is closed", safekeeper[i].host, safekeeper[i].port);
|
||||
ResetConnection(&safekeeper[i]);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
} else if (event.tag == Message && event.any_message == Bytes) {
|
||||
// !!! code must read the message
|
||||
for (int i = 0; i < n_safekeepers; i++) {
|
||||
if (safekeeper[i].conn && ((int64_t) walprop_socket(safekeeper[i].conn)) == event.tcp) {
|
||||
*occurred_events = (WaitEvent) {
|
||||
.events = WL_SOCKET_READABLE,
|
||||
};
|
||||
*sk = &safekeeper[i];
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
walprop_log(FATAL, "unknown tcp connection");
|
||||
} else if (event.tag == Internal && event.any_message == Just32) {
|
||||
uint32_t tx_count;
|
||||
XLogRecPtr start_lsn = sim_latest_available_lsn;
|
||||
XLogRecPtr finish_lsn = sim_latest_available_lsn;
|
||||
|
||||
Assert(!syncSafekeepers);
|
||||
|
||||
sim_epoll_rcv(0);
|
||||
sim_msg_get_just_u32(&tx_count);
|
||||
|
||||
// don't write WAL before winning the election
|
||||
if (propEpochStartLsn != 0)
|
||||
{
|
||||
for (uint32_t i = 0; i < tx_count; i++)
|
||||
{
|
||||
finish_lsn = MyInsertRecord();
|
||||
}
|
||||
|
||||
sim_log("write_wal;%lu;%lu;%d", start_lsn, finish_lsn, (int) tx_count);
|
||||
sim_latest_available_lsn = finish_lsn;
|
||||
}
|
||||
|
||||
*occurred_events = (WaitEvent) {
|
||||
.events = WL_LATCH_SET,
|
||||
};
|
||||
return 1;
|
||||
} else if (event.tag == Timeout) {
|
||||
return 0;
|
||||
} else {
|
||||
Assert(false);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Advance the WAL proposer state machine, waiting each time for events to occur.
|
||||
* Will exit only when latch is set, i.e. new WAL should be pushed from walsender
|
||||
@@ -392,16 +537,25 @@ WalProposerPoll(void)
|
||||
WaitEvent event;
|
||||
TimestampTz now = GetCurrentTimestamp();
|
||||
|
||||
#ifndef SIMLIB
|
||||
rc = WaitEventSetWait(waitEvents, TimeToReconnect(now),
|
||||
&event, 1, WAIT_EVENT_WAL_SENDER_MAIN);
|
||||
sk = (Safekeeper *) event.user_data;
|
||||
#else
|
||||
rc = SimWaitEventSetWait(&sk, TimeToReconnect(now), &event);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* If the event contains something that one of our safekeeper states
|
||||
* was waiting for, we'll advance its state.
|
||||
*/
|
||||
if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)))
|
||||
{
|
||||
AdvancePollState(sk, event.events);
|
||||
#ifdef SIMLIB
|
||||
// TODO: assert that code consumed incoming message
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* If the timeout expired, attempt to reconnect to any safekeepers
|
||||
@@ -416,7 +570,9 @@ WalProposerPoll(void)
|
||||
*/
|
||||
if (rc != 0 && (event.events & WL_LATCH_SET))
|
||||
{
|
||||
#ifndef SIMLIB
|
||||
ResetLatch(MyLatch);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -431,6 +587,7 @@ WalProposerPoll(void)
|
||||
*/
|
||||
if (availableLsn != InvalidXLogRecPtr)
|
||||
{
|
||||
walprop_log(LOG, "no WAL generated during timeout, sending pool message");
|
||||
BroadcastAppendRequest();
|
||||
}
|
||||
|
||||
@@ -445,7 +602,7 @@ WalProposerPoll(void)
|
||||
if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
|
||||
wal_acceptor_connection_timeout))
|
||||
{
|
||||
elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms",
|
||||
walprop_log(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout);
|
||||
ShutdownConnection(sk);
|
||||
}
|
||||
@@ -486,16 +643,18 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
|
||||
char *sep;
|
||||
char *port;
|
||||
|
||||
#ifndef SIMLIB
|
||||
load_file("libpqwalreceiver", false);
|
||||
if (WalReceiverFunctions == NULL)
|
||||
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
|
||||
walprop_log(ERROR, "libpqwalreceiver didn't initialize correctly");
|
||||
#endif
|
||||
|
||||
for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep)
|
||||
{
|
||||
port = strchr(host, ':');
|
||||
if (port == NULL)
|
||||
{
|
||||
elog(FATAL, "port is not specified");
|
||||
walprop_log(FATAL, "port is not specified");
|
||||
}
|
||||
*port++ = '\0';
|
||||
sep = strchr(port, ',');
|
||||
@@ -503,8 +662,11 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
|
||||
*sep++ = '\0';
|
||||
if (n_safekeepers + 1 >= MAX_SAFEKEEPERS)
|
||||
{
|
||||
elog(FATAL, "Too many safekeepers");
|
||||
walprop_log(FATAL, "Too many safekeepers");
|
||||
}
|
||||
|
||||
memset(&safekeeper[n_safekeepers], 0, sizeof(Safekeeper));
|
||||
|
||||
safekeeper[n_safekeepers].host = host;
|
||||
safekeeper[n_safekeepers].port = port;
|
||||
safekeeper[n_safekeepers].state = SS_OFFLINE;
|
||||
@@ -526,13 +688,13 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
|
||||
}
|
||||
|
||||
if (written > MAXCONNINFO || written < 0)
|
||||
elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
|
||||
walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
|
||||
}
|
||||
|
||||
initStringInfo(&safekeeper[n_safekeepers].outbuf);
|
||||
safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
|
||||
if (safekeeper[n_safekeepers].xlogreader == NULL)
|
||||
elog(FATAL, "Failed to allocate xlog reader");
|
||||
walprop_log(FATAL, "Failed to allocate xlog reader");
|
||||
safekeeper[n_safekeepers].flushWrite = false;
|
||||
safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
|
||||
safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr;
|
||||
@@ -540,7 +702,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
|
||||
}
|
||||
if (n_safekeepers < 1)
|
||||
{
|
||||
elog(FATAL, "Safekeepers addresses are not specified");
|
||||
walprop_log(FATAL, "Safekeepers addresses are not specified");
|
||||
}
|
||||
quorum = n_safekeepers / 2 + 1;
|
||||
|
||||
@@ -551,15 +713,15 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
|
||||
pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId));
|
||||
greetRequest.systemId = systemId;
|
||||
if (!neon_timeline_walproposer)
|
||||
elog(FATAL, "neon.timeline_id is not provided");
|
||||
walprop_log(FATAL, "neon.timeline_id is not provided");
|
||||
if (*neon_timeline_walproposer != '\0' &&
|
||||
!HexDecodeString(greetRequest.timeline_id, neon_timeline_walproposer, 16))
|
||||
elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer);
|
||||
walprop_log(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer);
|
||||
if (!neon_tenant_walproposer)
|
||||
elog(FATAL, "neon.tenant_id is not provided");
|
||||
walprop_log(FATAL, "neon.tenant_id is not provided");
|
||||
if (*neon_tenant_walproposer != '\0' &&
|
||||
!HexDecodeString(greetRequest.tenant_id, neon_tenant_walproposer, 16))
|
||||
elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer);
|
||||
walprop_log(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer);
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
/* FIXME don't use hardcoded timeline id */
|
||||
@@ -592,12 +754,14 @@ WalProposerLoop(void)
|
||||
WalProposerPoll();
|
||||
}
|
||||
|
||||
#ifndef SIMLIB
|
||||
|
||||
/* Initializes the internal event set, provided that it is currently null */
|
||||
static void
|
||||
InitEventSet(void)
|
||||
{
|
||||
if (waitEvents)
|
||||
elog(FATAL, "double-initialization of event set");
|
||||
walprop_log(FATAL, "double-initialization of event set");
|
||||
|
||||
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers);
|
||||
AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
@@ -663,6 +827,26 @@ HackyRemoveWalProposerEvent(Safekeeper *to_remove)
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
static void
|
||||
InitEventSet(void)
|
||||
{
|
||||
walprop_log(DEBUG5, "InitEventSet");
|
||||
}
|
||||
|
||||
static void
|
||||
UpdateEventSet(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
walprop_log(DEBUG5, "UpdateEventSet");
|
||||
}
|
||||
|
||||
static void
|
||||
HackyRemoveWalProposerEvent(Safekeeper *to_remove)
|
||||
{
|
||||
walprop_log(DEBUG5, "HackyRemoveWalProposerEvent");
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */
|
||||
static void
|
||||
ShutdownConnection(Safekeeper *sk)
|
||||
@@ -707,7 +891,7 @@ ResetConnection(Safekeeper *sk)
|
||||
* PGconn structure"
|
||||
*/
|
||||
if (!sk->conn)
|
||||
elog(FATAL, "failed to allocate new PGconn object");
|
||||
walprop_log(FATAL, "failed to allocate new PGconn object");
|
||||
|
||||
/*
|
||||
* PQconnectStart won't actually start connecting until we run
|
||||
@@ -725,7 +909,7 @@ ResetConnection(Safekeeper *sk)
|
||||
*
|
||||
* https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
|
||||
*/
|
||||
elog(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
|
||||
walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
|
||||
sk->host, sk->port, walprop_error_message(sk->conn));
|
||||
|
||||
/*
|
||||
@@ -750,13 +934,18 @@ ResetConnection(Safekeeper *sk)
|
||||
* (see libpqrcv_connect, defined in
|
||||
* src/backend/replication/libpqwalreceiver/libpqwalreceiver.c)
|
||||
*/
|
||||
elog(LOG, "connecting with node %s:%s", sk->host, sk->port);
|
||||
walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port);
|
||||
|
||||
sk->state = SS_CONNECTING_WRITE;
|
||||
sk->latestMsgReceivedAt = GetCurrentTimestamp();
|
||||
|
||||
#ifndef SIMLIB
|
||||
sock = walprop_socket(sk->conn);
|
||||
sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk);
|
||||
#else
|
||||
HandleConnectionEvent(sk);
|
||||
RecvStartWALPushResult(sk);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -819,7 +1008,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
* ResetConnection
|
||||
*/
|
||||
case SS_OFFLINE:
|
||||
elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
|
||||
walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
|
||||
sk->host, sk->port);
|
||||
break; /* actually unreachable, but prevents
|
||||
* -Wimplicit-fallthrough */
|
||||
@@ -855,7 +1044,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
* requests.
|
||||
*/
|
||||
case SS_VOTING:
|
||||
elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state));
|
||||
ResetConnection(sk);
|
||||
return;
|
||||
@@ -884,7 +1073,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
* Idle state for waiting votes from quorum.
|
||||
*/
|
||||
case SS_IDLE:
|
||||
elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state));
|
||||
ResetConnection(sk);
|
||||
return;
|
||||
@@ -909,7 +1098,7 @@ HandleConnectionEvent(Safekeeper *sk)
|
||||
switch (result)
|
||||
{
|
||||
case WP_CONN_POLLING_OK:
|
||||
elog(LOG, "connected with node %s:%s", sk->host,
|
||||
walprop_log(LOG, "connected with node %s:%s", sk->host,
|
||||
sk->port);
|
||||
sk->latestMsgReceivedAt = GetCurrentTimestamp();
|
||||
/*
|
||||
@@ -932,7 +1121,7 @@ HandleConnectionEvent(Safekeeper *sk)
|
||||
break;
|
||||
|
||||
case WP_CONN_POLLING_FAILED:
|
||||
elog(WARNING, "failed to connect to node '%s:%s': %s",
|
||||
walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
|
||||
sk->host, sk->port, walprop_error_message(sk->conn));
|
||||
|
||||
/*
|
||||
@@ -945,12 +1134,14 @@ HandleConnectionEvent(Safekeeper *sk)
|
||||
return;
|
||||
}
|
||||
|
||||
#ifndef SIMLIB
|
||||
/*
|
||||
* Because PQconnectPoll can change the socket, we have to un-register the
|
||||
* old event and re-register an event on the new socket.
|
||||
*/
|
||||
HackyRemoveWalProposerEvent(sk);
|
||||
sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk);
|
||||
#endif
|
||||
|
||||
/* If we successfully connected, send START_WAL_PUSH query */
|
||||
if (result == WP_CONN_POLLING_OK)
|
||||
@@ -967,7 +1158,7 @@ SendStartWALPush(Safekeeper *sk)
|
||||
{
|
||||
if (!walprop_send_query(sk->conn, "START_WAL_PUSH"))
|
||||
{
|
||||
elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
|
||||
walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
|
||||
sk->host, sk->port, walprop_error_message(sk->conn));
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
@@ -1002,7 +1193,7 @@ RecvStartWALPushResult(Safekeeper *sk)
|
||||
break;
|
||||
|
||||
case WP_EXEC_FAILED:
|
||||
elog(WARNING, "Failed to send query to safekeeper %s:%s: %s",
|
||||
walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
|
||||
sk->host, sk->port, walprop_error_message(sk->conn));
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
@@ -1013,7 +1204,7 @@ RecvStartWALPushResult(Safekeeper *sk)
|
||||
* wrong"
|
||||
*/
|
||||
case WP_EXEC_UNEXPECTED_SUCCESS:
|
||||
elog(WARNING, "Received bad response from safekeeper %s:%s query execution",
|
||||
walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
|
||||
sk->host, sk->port);
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
@@ -1060,7 +1251,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
if (n_connected == quorum)
|
||||
{
|
||||
propTerm++;
|
||||
elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm);
|
||||
walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm);
|
||||
|
||||
voteRequest = (VoteRequest)
|
||||
{
|
||||
@@ -1073,7 +1264,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
else if (sk->greetResponse.term > propTerm)
|
||||
{
|
||||
/* Another compute with higher term is running. */
|
||||
elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
|
||||
walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
|
||||
sk->host, sk->port,
|
||||
sk->greetResponse.term, propTerm);
|
||||
}
|
||||
@@ -1113,7 +1304,7 @@ static void
|
||||
SendVoteRequest(Safekeeper *sk)
|
||||
{
|
||||
/* We have quorum for voting, send our vote request */
|
||||
elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term);
|
||||
walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term);
|
||||
/* On failure, logging & resetting is handled */
|
||||
if (!BlockingWrite(sk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT))
|
||||
return;
|
||||
@@ -1128,7 +1319,7 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->voteResponse))
|
||||
return;
|
||||
|
||||
elog(LOG,
|
||||
walprop_log(LOG,
|
||||
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
|
||||
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
|
||||
@@ -1143,7 +1334,7 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
if ((!sk->voteResponse.voteGiven) &&
|
||||
(sk->voteResponse.term > propTerm || n_votes < quorum))
|
||||
{
|
||||
elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
|
||||
walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
|
||||
sk->host, sk->port,
|
||||
sk->voteResponse.term, propTerm);
|
||||
}
|
||||
@@ -1188,17 +1379,24 @@ HandleElectedProposer(void)
|
||||
*/
|
||||
if (truncateLsn < propEpochStartLsn)
|
||||
{
|
||||
elog(LOG,
|
||||
walprop_log(LOG,
|
||||
"start recovery because truncateLsn=%X/%X is not "
|
||||
"equal to epochStartLsn=%X/%X",
|
||||
LSN_FORMAT_ARGS(truncateLsn),
|
||||
LSN_FORMAT_ARGS(propEpochStartLsn));
|
||||
/* Perform recovery */
|
||||
if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn))
|
||||
elog(FATAL, "Failed to recover state");
|
||||
walprop_log(FATAL, "Failed to recover state");
|
||||
}
|
||||
else if (syncSafekeepers)
|
||||
{
|
||||
#ifdef SIMLIB
|
||||
char lsn_str[8 + 1 + 8 + 1];
|
||||
|
||||
snprintf(lsn_str, sizeof(lsn_str), "%X/%X", LSN_FORMAT_ARGS(propEpochStartLsn));
|
||||
sim_exit(0, lsn_str);
|
||||
#endif
|
||||
|
||||
/* Sync is not needed: just exit */
|
||||
fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn));
|
||||
exit(0);
|
||||
@@ -1275,6 +1473,7 @@ static void
|
||||
DetermineEpochStartLsn(void)
|
||||
{
|
||||
TermHistory *dth;
|
||||
int n_ready = 0;
|
||||
|
||||
propEpochStartLsn = InvalidXLogRecPtr;
|
||||
donorEpoch = 0;
|
||||
@@ -1285,6 +1484,8 @@ DetermineEpochStartLsn(void)
|
||||
{
|
||||
if (safekeeper[i].state == SS_IDLE)
|
||||
{
|
||||
n_ready++;
|
||||
|
||||
if (GetEpoch(&safekeeper[i]) > donorEpoch ||
|
||||
(GetEpoch(&safekeeper[i]) == donorEpoch &&
|
||||
safekeeper[i].voteResponse.flushLsn > propEpochStartLsn))
|
||||
@@ -1301,7 +1502,7 @@ DetermineEpochStartLsn(void)
|
||||
if (timelineStartLsn != InvalidXLogRecPtr &&
|
||||
timelineStartLsn != safekeeper[i].voteResponse.timelineStartLsn)
|
||||
{
|
||||
elog(WARNING,
|
||||
walprop_log(WARNING,
|
||||
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
|
||||
LSN_FORMAT_ARGS(timelineStartLsn),
|
||||
LSN_FORMAT_ARGS(safekeeper[i].voteResponse.timelineStartLsn));
|
||||
@@ -1311,6 +1512,9 @@ DetermineEpochStartLsn(void)
|
||||
}
|
||||
}
|
||||
|
||||
if (n_ready < quorum)
|
||||
walprop_log(FATAL, "missing majority of votes, expected %d, got %d", n_votes, n_ready);
|
||||
|
||||
/*
|
||||
* If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing
|
||||
* was committed yet. Start streaming then from the basebackup LSN.
|
||||
@@ -1322,7 +1526,18 @@ DetermineEpochStartLsn(void)
|
||||
{
|
||||
timelineStartLsn = GetRedoStartLsn();
|
||||
}
|
||||
elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn));
|
||||
walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn));
|
||||
}
|
||||
|
||||
if (truncateLsn == InvalidXLogRecPtr && timelineStartLsn == propEpochStartLsn)
|
||||
{
|
||||
/*
|
||||
* If truncateLsn is 0 everywhere, we are bootstrapping -- nothing was
|
||||
* committed yet. But if timelineStartLsn is not 0, we already know
|
||||
* the first record location, so we can bump truncateLsn to it.
|
||||
*/
|
||||
truncateLsn = timelineStartLsn;
|
||||
walprop_log(LOG, "bumped truncateLsn to timelineStartLsn %X/%X", LSN_FORMAT_ARGS(truncateLsn));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1349,13 +1564,24 @@ DetermineEpochStartLsn(void)
|
||||
propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm;
|
||||
propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn;
|
||||
|
||||
elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
|
||||
walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
|
||||
quorum,
|
||||
propTerm,
|
||||
LSN_FORMAT_ARGS(propEpochStartLsn),
|
||||
safekeeper[donor].host, safekeeper[donor].port,
|
||||
LSN_FORMAT_ARGS(truncateLsn));
|
||||
|
||||
{
|
||||
XLogRecPtr prev_lsn = 0;
|
||||
term_t prev_term = 0;
|
||||
if (propTermHistory.n_entries > 1)
|
||||
{
|
||||
prev_lsn = propTermHistory.entries[propTermHistory.n_entries - 2].lsn;
|
||||
prev_term = propTermHistory.entries[propTermHistory.n_entries - 2].term;
|
||||
}
|
||||
sim_log("prop_elected;%lu;%lu;%lu;%lu", propEpochStartLsn, propTerm, prev_lsn, prev_term);
|
||||
}
|
||||
|
||||
/*
|
||||
* Ensure the basebackup we are running (at RedoStartLsn) matches LSN
|
||||
* since which we are going to write according to the consensus. If not,
|
||||
@@ -1379,7 +1605,7 @@ DetermineEpochStartLsn(void)
|
||||
if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
|
||||
walprop_shared->mineLastElectedTerm)))
|
||||
{
|
||||
elog(PANIC,
|
||||
walprop_log(PANIC,
|
||||
"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
|
||||
LSN_FORMAT_ARGS(propEpochStartLsn),
|
||||
LSN_FORMAT_ARGS(GetRedoStartLsn()));
|
||||
@@ -1389,6 +1615,60 @@ DetermineEpochStartLsn(void)
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SIMLIB
|
||||
static bool
|
||||
WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos)
|
||||
{
|
||||
int node_id;
|
||||
int64_t tcp_id;
|
||||
char startcmd[1024];
|
||||
int len;
|
||||
XLogRecPtr pos = startpos;
|
||||
|
||||
const char *connstr_prefix = "host=node port=";
|
||||
Assert(strncmp(safekeeper[donor].conninfo, connstr_prefix, strlen(connstr_prefix)) == 0);
|
||||
|
||||
node_id = atoi(safekeeper[donor].conninfo + strlen(connstr_prefix));
|
||||
tcp_id = sim_open_tcp_nopoll(node_id);
|
||||
|
||||
len = snprintf(
|
||||
startcmd,
|
||||
sizeof(startcmd),
|
||||
"START_REPLICATION %s %s %ld %ld",
|
||||
neon_tenant_walproposer,
|
||||
neon_timeline_walproposer,
|
||||
(int64_t) startpos,
|
||||
(int64_t) endpos
|
||||
);
|
||||
Assert(len > 0 && len < sizeof(startcmd));
|
||||
|
||||
sim_msg_set_bytes(startcmd, len);
|
||||
sim_tcp_send(tcp_id);
|
||||
|
||||
while (pos < endpos)
|
||||
{
|
||||
uintptr_t msg_len;
|
||||
char *msg;
|
||||
Event event = sim_tcp_recv(tcp_id);
|
||||
if (event.tag == Closed)
|
||||
break;
|
||||
Assert(event.tag == Message);
|
||||
walprop_log(LOG, "recovery received event %d", (int) event.any_message);
|
||||
Assert(event.any_message == Bytes);
|
||||
msg = (char*) sim_msg_get_bytes(&msg_len);
|
||||
|
||||
XLogWalPropWrite(msg, msg_len, pos);
|
||||
pos += msg_len;
|
||||
}
|
||||
|
||||
walprop_log(LOG, "recovery finished at %X/%X, from %X/%X to %X/%X",
|
||||
LSN_FORMAT_ARGS(pos),
|
||||
LSN_FORMAT_ARGS(startpos),
|
||||
LSN_FORMAT_ARGS(endpos));
|
||||
|
||||
return pos == endpos;
|
||||
}
|
||||
#else
|
||||
/*
|
||||
* Receive WAL from most advanced safekeeper
|
||||
*/
|
||||
@@ -1408,7 +1688,7 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
|
||||
err)));
|
||||
return false;
|
||||
}
|
||||
elog(LOG,
|
||||
walprop_log(LOG,
|
||||
"start recovery from %s:%s starting from %X/%08X till %X/%08X timeline "
|
||||
"%d",
|
||||
safekeeper[donor].host, safekeeper[donor].port, (uint32) (startpos >> 32),
|
||||
@@ -1474,6 +1754,7 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
|
||||
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Determine for sk the starting streaming point and send it message
|
||||
@@ -1542,7 +1823,7 @@ SendProposerElected(Safekeeper *sk)
|
||||
*/
|
||||
sk->startStreamingAt = truncateLsn;
|
||||
|
||||
elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
|
||||
walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
|
||||
sk->host, sk->port, LSN_FORMAT_ARGS(propTermHistory.entries[0].lsn),
|
||||
LSN_FORMAT_ARGS(sk->startStreamingAt));
|
||||
}
|
||||
@@ -1577,7 +1858,7 @@ SendProposerElected(Safekeeper *sk)
|
||||
msg.timelineStartLsn = timelineStartLsn;
|
||||
|
||||
lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0;
|
||||
elog(LOG,
|
||||
walprop_log(LOG,
|
||||
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
|
||||
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
|
||||
|
||||
@@ -1607,7 +1888,7 @@ WalProposerStartStreaming(XLogRecPtr startpos)
|
||||
{
|
||||
StartReplicationCmd cmd;
|
||||
|
||||
elog(LOG, "WAL proposer starts streaming at %X/%X",
|
||||
walprop_log(LOG, "WAL proposer starts streaming at %X/%X",
|
||||
LSN_FORMAT_ARGS(startpos));
|
||||
cmd.slotname = WAL_PROPOSER_SLOT_NAME;
|
||||
cmd.timeline = greetRequest.timeline;
|
||||
@@ -1809,7 +2090,7 @@ SendAppendRequests(Safekeeper *sk)
|
||||
return true;
|
||||
|
||||
case PG_ASYNC_WRITE_FAIL:
|
||||
elog(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
walprop_error_message(sk->conn));
|
||||
ShutdownConnection(sk);
|
||||
@@ -1858,7 +2139,7 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
if (sk->appendResponse.term > propTerm)
|
||||
{
|
||||
/* Another compute with higher term is running. */
|
||||
elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
|
||||
walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
|
||||
sk->host, sk->port,
|
||||
sk->appendResponse.term, propTerm);
|
||||
}
|
||||
@@ -1877,6 +2158,7 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
minQuorumLsn = GetAcknowledgedByQuorumWALPosition();
|
||||
if (minQuorumLsn > lastSentCommitLsn)
|
||||
{
|
||||
sim_log("commit_lsn;%lu", minQuorumLsn);
|
||||
BroadcastAppendRequest();
|
||||
lastSentCommitLsn = minQuorumLsn;
|
||||
}
|
||||
@@ -1904,7 +2186,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->currentClusterSize = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu",
|
||||
walprop_log(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu",
|
||||
rf->currentClusterSize);
|
||||
}
|
||||
else if (strcmp(key, "ps_writelsn") == 0)
|
||||
@@ -1912,7 +2194,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->ps_writelsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X",
|
||||
walprop_log(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->ps_writelsn));
|
||||
}
|
||||
else if (strcmp(key, "ps_flushlsn") == 0)
|
||||
@@ -1920,7 +2202,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->ps_flushlsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X",
|
||||
walprop_log(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->ps_flushlsn));
|
||||
}
|
||||
else if (strcmp(key, "ps_applylsn") == 0)
|
||||
@@ -1928,7 +2210,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->ps_applylsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X",
|
||||
walprop_log(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->ps_applylsn));
|
||||
}
|
||||
else if (strcmp(key, "ps_replytime") == 0)
|
||||
@@ -1941,7 +2223,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
|
||||
|
||||
/* Copy because timestamptz_to_str returns a static buffer */
|
||||
replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime));
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s",
|
||||
walprop_log(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s",
|
||||
rf->ps_replytime, replyTimeStr);
|
||||
|
||||
pfree(replyTimeStr);
|
||||
@@ -1956,7 +2238,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
|
||||
* Skip unknown keys to support backward compatible protocol
|
||||
* changes
|
||||
*/
|
||||
elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len);
|
||||
walprop_log(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len);
|
||||
pq_getmsgbytes(reply_message, len);
|
||||
};
|
||||
}
|
||||
@@ -2107,7 +2389,7 @@ GetLatestNeonFeedback(ReplicationFeedback * rf)
|
||||
rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn;
|
||||
rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime;
|
||||
|
||||
elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
|
||||
walprop_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
|
||||
" ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu",
|
||||
rf->currentClusterSize,
|
||||
LSN_FORMAT_ARGS(rf->ps_writelsn),
|
||||
@@ -2133,7 +2415,9 @@ HandleSafekeeperResponse(void)
|
||||
{
|
||||
/* Get ReplicationFeedback fields from the most advanced safekeeper */
|
||||
GetLatestNeonFeedback(&quorumFeedback.rf);
|
||||
#ifndef SIMLIB
|
||||
SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn)
|
||||
@@ -2142,6 +2426,7 @@ HandleSafekeeperResponse(void)
|
||||
if (minQuorumLsn > quorumFeedback.flushLsn)
|
||||
quorumFeedback.flushLsn = minQuorumLsn;
|
||||
|
||||
#ifndef SIMLIB
|
||||
/* advance the replication slot */
|
||||
if (!syncSafekeepers)
|
||||
ProcessStandbyReply(
|
||||
@@ -2156,18 +2441,31 @@ HandleSafekeeperResponse(void)
|
||||
*/
|
||||
quorumFeedback.rf.ps_flushlsn,
|
||||
GetCurrentTimestamp(), false);
|
||||
#endif
|
||||
|
||||
#ifdef SIMLIB
|
||||
if (!syncSafekeepers)
|
||||
{
|
||||
char lsn_str[8 + 1 + 8 + 1];
|
||||
|
||||
snprintf(lsn_str, sizeof(lsn_str), "%X/%X", LSN_FORMAT_ARGS(quorumFeedback.flushLsn));
|
||||
sim_set_result(1, lsn_str);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
CombineHotStanbyFeedbacks(&hsFeedback);
|
||||
if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0)
|
||||
{
|
||||
quorumFeedback.hs = hsFeedback;
|
||||
#ifndef SIMLIB
|
||||
if (!syncSafekeepers)
|
||||
ProcessStandbyHSFeedback(hsFeedback.ts,
|
||||
XidFromFullTransactionId(hsFeedback.xmin),
|
||||
EpochFromFullTransactionId(hsFeedback.xmin),
|
||||
XidFromFullTransactionId(hsFeedback.catalog_xmin),
|
||||
EpochFromFullTransactionId(hsFeedback.catalog_xmin));
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2227,6 +2525,13 @@ HandleSafekeeperResponse(void)
|
||||
}
|
||||
if (n_synced >= quorum)
|
||||
{
|
||||
#ifdef SIMLIB
|
||||
char lsn_str[8 + 1 + 8 + 1];
|
||||
|
||||
snprintf(lsn_str, sizeof(lsn_str), "%X/%X", LSN_FORMAT_ARGS(propEpochStartLsn));
|
||||
sim_exit(0, lsn_str);
|
||||
#endif
|
||||
|
||||
/* All safekeepers synced! */
|
||||
fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn));
|
||||
exit(0);
|
||||
@@ -2251,7 +2556,7 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
|
||||
return false;
|
||||
|
||||
case PG_ASYNC_READ_FAIL:
|
||||
elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
|
||||
walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state),
|
||||
walprop_error_message(sk->conn));
|
||||
ShutdownConnection(sk);
|
||||
@@ -2281,7 +2586,12 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg)
|
||||
if (!(AsyncRead(sk, &buf, &buf_size)))
|
||||
return false;
|
||||
|
||||
/* parse it */
|
||||
// for (int i = 0; i < buf_size; i++) {
|
||||
// fprintf(stderr, "%02x", buf[i]);
|
||||
// }
|
||||
// fprintf(stderr, "\n");
|
||||
|
||||
/* parse it */
|
||||
s.data = buf;
|
||||
s.len = buf_size;
|
||||
s.cursor = 0;
|
||||
@@ -2289,7 +2599,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg)
|
||||
tag = pq_getmsgint64_le(&s);
|
||||
if (tag != anymsg->tag)
|
||||
{
|
||||
elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
@@ -2364,7 +2674,7 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
|
||||
|
||||
if (!walprop_blocking_write(sk->conn, msg, msg_size))
|
||||
{
|
||||
elog(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
walprop_error_message(sk->conn));
|
||||
ShutdownConnection(sk);
|
||||
@@ -2409,7 +2719,7 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
|
||||
UpdateEventSet(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
|
||||
return false;
|
||||
case PG_ASYNC_WRITE_FAIL:
|
||||
elog(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
walprop_error_message(sk->conn));
|
||||
ShutdownConnection(sk);
|
||||
@@ -2446,7 +2756,7 @@ AsyncFlush(Safekeeper *sk)
|
||||
/* Nothing to do; try again when the socket's ready */
|
||||
return false;
|
||||
case -1:
|
||||
elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
|
||||
walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
walprop_error_message(sk->conn));
|
||||
ResetConnection(sk);
|
||||
@@ -2474,7 +2784,7 @@ backpressure_lag_impl(void)
|
||||
replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
|
||||
#define MB ((XLogRecPtr)1024 * 1024)
|
||||
|
||||
elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X",
|
||||
walprop_log(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X",
|
||||
LSN_FORMAT_ARGS(myFlushLsn),
|
||||
LSN_FORMAT_ARGS(writePtr),
|
||||
LSN_FORMAT_ARGS(flushPtr),
|
||||
@@ -2522,7 +2832,7 @@ backpressure_throttling_impl(void)
|
||||
/* Suspend writers until replicas catch up */
|
||||
set_ps_display("backpressure throttling");
|
||||
|
||||
elog(DEBUG2, "backpressure throttling: lag %lu", lag);
|
||||
walprop_log(DEBUG2, "backpressure throttling: lag %lu", lag);
|
||||
start = GetCurrentTimestamp();
|
||||
pg_usleep(BACK_PRESSURE_DELAY);
|
||||
stop = GetCurrentTimestamp();
|
||||
|
||||
@@ -10,6 +10,37 @@
|
||||
#include "utils/uuid.h"
|
||||
#include "replication/walreceiver.h"
|
||||
|
||||
#define WALPROPOSER_TAG "[WALPROPOSER] "
|
||||
|
||||
#ifdef SIMLIB
|
||||
#define walprop_log(tag, fmt, ...) do { \
|
||||
MyProcPid = sim_now(); \
|
||||
ereport((tag > WARNING ? WARNING : tag), \
|
||||
(errmsg(fmt, ##__VA_ARGS__), \
|
||||
errhidestmt(true), errhidecontext(true), internalerrposition(0))); \
|
||||
if (tag > WARNING) \
|
||||
sim_exit(tag, "walprop_log error macros"); \
|
||||
} while (0)
|
||||
|
||||
#define exit(code) sim_exit(code, "exit()")
|
||||
|
||||
#define sim_log(fmt, ...) do { \
|
||||
char buf[1024]; \
|
||||
snprintf(buf, sizeof(buf), fmt, ##__VA_ARGS__); \
|
||||
sim_log_event(buf); \
|
||||
} while (0)
|
||||
#else
|
||||
#define walprop_log(tag, fmt, ...) ereport(tag, \
|
||||
(errmsg(WALPROPOSER_TAG fmt, ##__VA_ARGS__), \
|
||||
errhidestmt(true), errhidecontext(true), internalerrposition(0)))
|
||||
#endif
|
||||
|
||||
#ifdef SIMLIB
|
||||
extern uint64 sim_redo_start_lsn;
|
||||
#define GetRedoStartLsn() sim_redo_start_lsn
|
||||
extern XLogRecPtr sim_latest_available_lsn;
|
||||
#endif
|
||||
|
||||
#define SK_MAGIC 0xCafeCeefu
|
||||
#define SK_PROTOCOL_VERSION 2
|
||||
|
||||
@@ -28,6 +59,8 @@
|
||||
*/
|
||||
#define WL_NO_EVENTS 0
|
||||
|
||||
extern bool syncSafekeepers;
|
||||
|
||||
extern char *wal_acceptors_list;
|
||||
extern int wal_acceptor_reconnect_timeout;
|
||||
extern int wal_acceptor_connection_timeout;
|
||||
@@ -374,8 +407,11 @@ typedef struct Safekeeper
|
||||
XLogRecPtr streamingAt; /* current streaming position */
|
||||
AppendRequestHeader appendRequest; /* request for sending to safekeeper */
|
||||
|
||||
#ifndef SIMLIB
|
||||
int eventPos; /* position in wait event set. Equal to -1 if*
|
||||
* no event */
|
||||
#endif
|
||||
|
||||
SafekeeperState state; /* safekeeper state machine state */
|
||||
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
|
||||
AcceptorGreeting greetResponse; /* acceptor greeting */
|
||||
|
||||
@@ -121,11 +121,11 @@ CompareLsn(const void *a, const void *b)
|
||||
*
|
||||
* The strings are intended to be used as a prefix to "state", e.g.:
|
||||
*
|
||||
* elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
|
||||
* walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
|
||||
*
|
||||
* If this sort of phrasing doesn't fit the message, instead use something like:
|
||||
*
|
||||
* elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
|
||||
* walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
|
||||
*/
|
||||
char *
|
||||
FormatSafekeeperState(SafekeeperState state)
|
||||
@@ -192,10 +192,10 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
|
||||
if (!events_ok_for_state)
|
||||
{
|
||||
/*
|
||||
* To give a descriptive message in the case of failure, we use elog
|
||||
* To give a descriptive message in the case of failure, we use walprop_log
|
||||
* and then an assertion that's guaranteed to fail.
|
||||
*/
|
||||
elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
|
||||
walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
|
||||
FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state));
|
||||
Assert(events_ok_for_state);
|
||||
}
|
||||
@@ -298,7 +298,7 @@ FormatEvents(uint32 events)
|
||||
|
||||
if (events & (~all_flags))
|
||||
{
|
||||
elog(WARNING, "Event formatting found unexpected component %d",
|
||||
walprop_log(WARNING, "Event formatting found unexpected component %d",
|
||||
events & (~all_flags));
|
||||
return_str[6] = '*';
|
||||
return_str[7] = '\0';
|
||||
@@ -486,9 +486,9 @@ XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr)
|
||||
void
|
||||
XLogWalPropClose(XLogRecPtr recptr)
|
||||
{
|
||||
Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size));
|
||||
// Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size));
|
||||
|
||||
if (close(walpropFile) != 0)
|
||||
if (walpropFile >= 0 && close(walpropFile) != 0)
|
||||
{
|
||||
char xlogfname[MAXFNAMELEN];
|
||||
|
||||
@@ -505,6 +505,8 @@ XLogWalPropClose(XLogRecPtr recptr)
|
||||
|
||||
/* START of cloned functions from walsender.c */
|
||||
|
||||
void sim_start_replication(XLogRecPtr startpoint);
|
||||
|
||||
/*
|
||||
* Handle START_REPLICATION command.
|
||||
*
|
||||
@@ -517,6 +519,11 @@ StartProposerReplication(StartReplicationCmd *cmd)
|
||||
XLogRecPtr FlushPtr;
|
||||
TimeLineID currTLI;
|
||||
|
||||
#ifdef SIMLIB
|
||||
sim_start_replication(cmd->startpoint);
|
||||
return;
|
||||
#endif
|
||||
|
||||
#if PG_VERSION_NUM < 150000
|
||||
if (ThisTimeLineID == 0)
|
||||
ereport(ERROR,
|
||||
@@ -1111,7 +1118,7 @@ XLogSendPhysical(void)
|
||||
|
||||
WalSndCaughtUp = true;
|
||||
|
||||
elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
|
||||
walprop_log(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
|
||||
LSN_FORMAT_ARGS(sendTimeLineValidUpto),
|
||||
LSN_FORMAT_ARGS(sentPtr));
|
||||
return;
|
||||
|
||||
@@ -42,8 +42,11 @@ remote_storage.workspace = true
|
||||
safekeeper_api.workspace = true
|
||||
storage_broker.workspace = true
|
||||
utils.workspace = true
|
||||
scopeguard.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
crossbeam = "0.8.2"
|
||||
rand.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile.workspace = true
|
||||
|
||||
@@ -19,6 +19,8 @@ pub mod receive_wal;
|
||||
pub mod remove_wal;
|
||||
pub mod safekeeper;
|
||||
pub mod send_wal;
|
||||
pub mod simlib;
|
||||
pub mod simtest;
|
||||
pub mod timeline;
|
||||
pub mod wal_backup;
|
||||
pub mod wal_service;
|
||||
@@ -75,8 +77,7 @@ impl SafeKeeperConf {
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
#[cfg(test)]
|
||||
fn dummy() -> Self {
|
||||
pub fn dummy() -> Self {
|
||||
SafeKeeperConf {
|
||||
workdir: PathBuf::from("./"),
|
||||
no_sync: false,
|
||||
|
||||
@@ -650,7 +650,7 @@ where
|
||||
self.state.persist(&state)?;
|
||||
}
|
||||
|
||||
info!(
|
||||
debug!(
|
||||
"processed greeting from walproposer {}, sending term {:?}",
|
||||
msg.proposer_id.map(|b| format!("{:X}", b)).join(""),
|
||||
self.state.acceptor_state.term
|
||||
@@ -695,7 +695,7 @@ where
|
||||
resp.term = self.state.acceptor_state.term;
|
||||
resp.vote_given = true as u64;
|
||||
}
|
||||
info!("processed VoteRequest for term {}: {:?}", msg.term, &resp);
|
||||
debug!("processed VoteRequest for term {}: {:?}", msg.term, &resp);
|
||||
Ok(Some(AcceptorProposerMessage::VoteResponse(resp)))
|
||||
}
|
||||
|
||||
@@ -714,7 +714,7 @@ where
|
||||
}
|
||||
|
||||
fn handle_elected(&mut self, msg: &ProposerElected) -> Result<Option<AcceptorProposerMessage>> {
|
||||
info!("received ProposerElected {:?}", msg);
|
||||
debug!("received ProposerElected {:?}", msg);
|
||||
if self.state.acceptor_state.term < msg.term {
|
||||
let mut state = self.state.clone();
|
||||
state.acceptor_state.term = msg.term;
|
||||
@@ -760,14 +760,14 @@ where
|
||||
if state.timeline_start_lsn == Lsn(0) {
|
||||
// Remember point where WAL begins globally.
|
||||
state.timeline_start_lsn = msg.timeline_start_lsn;
|
||||
info!(
|
||||
debug!(
|
||||
"setting timeline_start_lsn to {:?}",
|
||||
state.timeline_start_lsn
|
||||
);
|
||||
}
|
||||
if state.local_start_lsn == Lsn(0) {
|
||||
state.local_start_lsn = msg.start_streaming_at;
|
||||
info!("setting local_start_lsn to {:?}", state.local_start_lsn);
|
||||
debug!("setting local_start_lsn to {:?}", state.local_start_lsn);
|
||||
}
|
||||
// Initializing commit_lsn before acking first flushed record is
|
||||
// important to let find_end_of_wal skip the hole in the beginning
|
||||
@@ -789,7 +789,7 @@ where
|
||||
self.persist_control_file(state)?;
|
||||
}
|
||||
|
||||
info!("start receiving WAL since {:?}", msg.start_streaming_at);
|
||||
debug!("start receiving WAL since {:?}", msg.start_streaming_at);
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
78 safekeeper/src/simlib/chan.rs Normal file
@@ -0,0 +1,78 @@
use std::{collections::VecDeque, sync::Arc};

use super::sync::{Condvar, Mutex, Park};

/// FIFO channel with blocking send and receive. Can be cloned and shared between threads.
#[derive(Clone)]
pub struct Chan<T: Clone> {
    shared: Arc<ChanState<T>>,
}

struct ChanState<T> {
    queue: Mutex<VecDeque<T>>,
    condvar: Condvar,
}

impl<T: Clone> Chan<T> {
    pub fn new() -> Chan<T> {
        Chan {
            shared: Arc::new(ChanState {
                queue: Mutex::new(VecDeque::new()),
                condvar: Condvar::new(),
            }),
        }
    }

    /// Append a message to the end of the queue.
    /// Can be called from any thread.
    pub fn send(&self, t: T) {
        self.shared.queue.lock().push_back(t);
        self.shared.condvar.notify_one();
    }

    /// Get a message from the front of the queue, or block if the queue is empty.
    /// Can be called only from the node thread.
    pub fn recv(&self) -> T {
        // interrupt the receiver to prevent consuming everything at once
        Park::yield_thread();

        let mut queue = self.shared.queue.lock();
        loop {
            if let Some(t) = queue.pop_front() {
                return t;
            }
            self.shared.condvar.wait(&mut queue);
        }
    }

    /// Same as `recv`, but doesn't take the message from the queue.
    pub fn peek(&self) -> T {
        // interrupt the receiver to prevent consuming everything at once
        Park::yield_thread();

        let mut queue = self.shared.queue.lock();
        loop {
            if let Some(t) = queue.front().cloned() {
                return t;
            }
            self.shared.condvar.wait(&mut queue);
        }
    }

    /// Get a message from the front of the queue, or return `None` if the queue is empty.
    pub fn try_recv(&self) -> Option<T> {
        let mut queue = self.shared.queue.lock();
        queue.pop_front()
    }

    /// Clone a message from the front of the queue, or return `None` if the queue is empty.
    pub fn try_peek(&self) -> Option<T> {
        let queue = self.shared.queue.lock();
        queue.front().cloned()
    }

    pub fn clear(&self) {
        let mut queue = self.shared.queue.lock();
        queue.clear();
    }
}
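For orientation, a minimal usage sketch of the channel above (not part of the diff; the module path and the single-threaded call site are assumptions):

    // Illustrative sketch only, not part of the commit.
    use safekeeper::simlib::chan::Chan; // assumed module path

    fn chan_demo() {
        let chan: Chan<u32> = Chan::new();
        chan.send(1);
        chan.send(2);
        assert_eq!(chan.try_peek(), Some(1)); // peek does not consume
        assert_eq!(chan.try_recv(), Some(1));
        assert_eq!(chan.try_recv(), Some(2));
        assert_eq!(chan.try_recv(), None); // queue is now empty
        // chan.recv() would block here until another thread calls send();
        // inside the simulation it also yields to the scheduler first.
    }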
8 safekeeper/src/simlib/mod.rs Normal file
@@ -0,0 +1,8 @@
pub mod chan;
pub mod network;
pub mod node_os;
pub mod proto;
pub mod sync;
pub mod time;
pub mod wait_group;
pub mod world;
413 safekeeper/src/simlib/network.rs Normal file
@@ -0,0 +1,413 @@
use std::{
    collections::VecDeque,
    fmt::{self, Debug},
    ops::DerefMut,
    sync::Arc,
};

use rand::{rngs::StdRng, Rng};
use tracing::debug;

use super::{
    chan::Chan,
    proto::AnyMessage,
    sync::Mutex,
    time::NetworkEvent,
    world::{Node, NodeEvent, World},
};

#[derive(Clone, Debug)]
pub struct Delay {
    pub min: u64,
    pub max: u64,
    pub fail_prob: f64, // [0; 1]
}

impl Delay {
    /// No delay, no failures.
    pub fn empty() -> Delay {
        Delay {
            min: 0,
            max: 0,
            fail_prob: 0.0,
        }
    }

    /// Fixed delay.
    pub fn fixed(ms: u64) -> Delay {
        Delay {
            min: ms,
            max: ms,
            fail_prob: 0.0,
        }
    }

    /// Generate a random delay in range [min, max]. Return None if the
    /// message should be dropped.
    pub fn delay(&self, rng: &mut StdRng) -> Option<u64> {
        if rng.gen_bool(self.fail_prob) {
            return None;
        }
        Some(rng.gen_range(self.min..=self.max))
    }
}

#[derive(Clone, Debug)]
pub struct NetworkOptions {
    /// Connection will be automatically closed after this timeout.
    pub keepalive_timeout: Option<u64>,
    pub connect_delay: Delay,
    pub send_delay: Delay,
}
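A small sketch of how these knobs are meant to be combined (illustrative only; the concrete numbers are made up):

    // Illustrative sketch only, not part of the commit.
    use rand::{rngs::StdRng, SeedableRng};

    fn delay_demo() {
        let options = NetworkOptions {
            keepalive_timeout: Some(1_000),  // close idle connections after 1 s
            connect_delay: Delay::fixed(2),  // handshake always takes 2 ms
            send_delay: Delay { min: 1, max: 5, fail_prob: 0.01 },
        };
        let mut rng = StdRng::seed_from_u64(42);
        match options.send_delay.delay(&mut rng) {
            Some(ms) => println!("message arrives after {} ms", ms),
            None => println!("message is dropped"),
        }
    }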
|
||||
// 0 - from node(0) to node(1)
|
||||
// 1 - from node(1) to node(0)
|
||||
type MessageDirection = u8;
|
||||
|
||||
/// Virtual connection between two nodes.
|
||||
/// Node 0 is the creator of the connection (client),
|
||||
/// and node 1 is the acceptor (server).
|
||||
pub struct VirtualConnection {
|
||||
/// Connection id, used for logging and debugging and C API.
|
||||
pub connection_id: u64,
|
||||
pub world: Arc<World>,
|
||||
pub nodes: [Arc<Node>; 2],
|
||||
dst_sockets: [Chan<NodeEvent>; 2],
|
||||
state: Mutex<ConnectionState>,
|
||||
options: Arc<NetworkOptions>,
|
||||
}
|
||||
|
||||
struct ConnectionState {
|
||||
buffers: [NetworkBuffer; 2],
|
||||
rng: StdRng,
|
||||
}
|
||||
|
||||
impl VirtualConnection {
|
||||
pub fn new(
|
||||
id: u64,
|
||||
world: Arc<World>,
|
||||
src_sink: Chan<NodeEvent>,
|
||||
dst_sink: Chan<NodeEvent>,
|
||||
src: Arc<Node>,
|
||||
dst: Arc<Node>,
|
||||
options: Arc<NetworkOptions>,
|
||||
) -> Arc<Self> {
|
||||
let now = world.now();
|
||||
let rng = world.new_rng();
|
||||
|
||||
let conn = Arc::new(Self {
|
||||
connection_id: id,
|
||||
world,
|
||||
dst_sockets: [src_sink, dst_sink],
|
||||
nodes: [src, dst],
|
||||
state: Mutex::new(ConnectionState {
|
||||
buffers: [NetworkBuffer::new(None), NetworkBuffer::new(Some(now))],
|
||||
rng,
|
||||
}),
|
||||
options,
|
||||
});
|
||||
|
||||
conn.world.add_conn(conn.clone());
|
||||
|
||||
conn.schedule_timeout();
|
||||
conn.send_connect();
|
||||
|
||||
// TODO: add connection to the dst node
|
||||
// conn.dst_sockets[1].send(NodeEvent::Connection(conn.clone()));
|
||||
|
||||
conn
|
||||
}
|
||||
|
||||
/// Notify the future about the possible timeout.
|
||||
fn schedule_timeout(self: &Arc<Self>) {
|
||||
if let Some(timeout) = self.options.keepalive_timeout {
|
||||
self.world.schedule(timeout, self.as_event());
|
||||
}
|
||||
}
|
||||
|
||||
/// Transmit some of the messages from the buffer to the nodes.
|
||||
pub fn process(self: &Arc<Self>) {
|
||||
let now = self.world.now();
|
||||
|
||||
let mut state = self.state.lock();
|
||||
|
||||
for direction in 0..2 {
|
||||
self.process_direction(
|
||||
state.deref_mut(),
|
||||
now,
|
||||
direction as MessageDirection,
|
||||
&self.dst_sockets[direction ^ 1],
|
||||
);
|
||||
}
|
||||
|
||||
// Close the one side of the connection by timeout if the node
|
||||
// has not received any messages for a long time.
|
||||
if let Some(timeout) = self.options.keepalive_timeout {
|
||||
let mut to_close = [false, false];
|
||||
for direction in 0..2 {
|
||||
let node_idx = direction ^ 1;
|
||||
let node = &self.nodes[node_idx];
|
||||
|
||||
let buffer = &mut state.buffers[direction];
|
||||
if buffer.recv_closed {
|
||||
continue;
|
||||
}
|
||||
if let Some(last_recv) = buffer.last_recv {
|
||||
if now - last_recv >= timeout {
|
||||
debug!(
|
||||
"NET: connection {} timed out at node {}",
|
||||
self.connection_id, node.id
|
||||
);
|
||||
to_close[node_idx] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
drop(state);
|
||||
|
||||
for node_idx in 0..2 {
|
||||
if to_close[node_idx] {
|
||||
self.close(node_idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Process messages in the buffer in the given direction.
|
||||
fn process_direction(
|
||||
self: &Arc<Self>,
|
||||
state: &mut ConnectionState,
|
||||
now: u64,
|
||||
direction: MessageDirection,
|
||||
to_socket: &Chan<NodeEvent>,
|
||||
) {
|
||||
let buffer = &mut state.buffers[direction as usize];
|
||||
if buffer.recv_closed {
|
||||
assert!(buffer.buf.is_empty());
|
||||
}
|
||||
|
||||
while !buffer.buf.is_empty() && buffer.buf.front().unwrap().0 <= now {
|
||||
let msg = buffer.buf.pop_front().unwrap().1;
|
||||
let callback = TCP::new(self.clone(), direction ^ 1);
|
||||
|
||||
// debug!(
|
||||
// "NET: {:?} delivered, {}=>{}",
|
||||
// msg, from_node.id, to_node.id
|
||||
// );
|
||||
buffer.last_recv = Some(now);
|
||||
self.schedule_timeout();
|
||||
|
||||
if let AnyMessage::InternalConnect = msg {
|
||||
to_socket.send(NodeEvent::Accept(callback));
|
||||
} else {
|
||||
to_socket.send(NodeEvent::Message((msg, callback)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Send a message to the buffer.
|
||||
pub fn send(self: &Arc<Self>, direction: MessageDirection, msg: AnyMessage) {
|
||||
let now = self.world.now();
|
||||
let mut state = self.state.lock();
|
||||
|
||||
let (delay, close) = if let Some(ms) = self.options.send_delay.delay(&mut state.rng) {
|
||||
(ms, false)
|
||||
} else {
|
||||
(0, true)
|
||||
};
|
||||
|
||||
let buffer = &mut state.buffers[direction as usize];
|
||||
if buffer.send_closed {
|
||||
debug!(
|
||||
"NET: TCP #{} dropped message {:?} (broken pipe)",
|
||||
self.connection_id, msg
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
if close {
|
||||
debug!(
|
||||
"NET: TCP #{} dropped message {:?} (pipe just broke)",
|
||||
self.connection_id, msg
|
||||
);
|
||||
buffer.send_closed = true;
|
||||
return;
|
||||
}
|
||||
|
||||
if buffer.recv_closed {
|
||||
debug!(
|
||||
"NET: TCP #{} dropped message {:?} (recv closed)",
|
||||
self.connection_id, msg
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// Send a message into the future.
|
||||
buffer.buf.push_back((now + delay, msg));
|
||||
self.world.schedule(delay, self.as_event());
|
||||
}
|
||||
|
||||
/// Send the handshake (Accept) to the server.
|
||||
fn send_connect(self: &Arc<Self>) {
|
||||
let now = self.world.now();
|
||||
let mut state = self.state.lock();
|
||||
let delay = self.options.connect_delay.delay(&mut state.rng);
|
||||
let buffer = &mut state.buffers[0];
|
||||
assert!(buffer.buf.is_empty());
|
||||
assert!(!buffer.recv_closed);
|
||||
assert!(!buffer.send_closed);
|
||||
assert!(buffer.last_recv.is_none());
|
||||
|
||||
let delay = if let Some(ms) = delay {
|
||||
ms
|
||||
} else {
|
||||
debug!("NET: TCP #{} dropped connect", self.connection_id);
|
||||
buffer.send_closed = true;
|
||||
return;
|
||||
};
|
||||
|
||||
// Send a message into the future.
|
||||
buffer
|
||||
.buf
|
||||
.push_back((now + delay, AnyMessage::InternalConnect));
|
||||
self.world.schedule(delay, self.as_event());
|
||||
}
|
||||
|
||||
fn internal_recv(self: &Arc<Self>, node_idx: usize) -> NodeEvent {
|
||||
// Only src node can receive messages.
|
||||
assert!(node_idx == 0);
|
||||
return self.dst_sockets[node_idx].recv();
|
||||
}
|
||||
|
||||
/// Close the connection. Only one side of the connection will be closed,
|
||||
/// and no further messages will be delivered. The other side will not be notified.
|
||||
pub fn close(self: &Arc<Self>, node_idx: usize) {
|
||||
let node = &self.nodes[node_idx];
|
||||
|
||||
let mut state = self.state.lock();
|
||||
let recv_buffer = &mut state.buffers[1 ^ node_idx];
|
||||
if recv_buffer.recv_closed {
|
||||
debug!(
|
||||
"NET: TCP #{} closed twice at node {}",
|
||||
self.connection_id, node.id
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
debug!(
|
||||
"NET: TCP #{} closed at node {}",
|
||||
self.connection_id, node.id
|
||||
);
|
||||
recv_buffer.recv_closed = true;
|
||||
for msg in recv_buffer.buf.drain(..) {
|
||||
debug!(
|
||||
"NET: TCP #{} dropped message {:?} (closed)",
|
||||
self.connection_id, msg
|
||||
);
|
||||
}
|
||||
|
||||
let send_buffer = &mut state.buffers[node_idx];
|
||||
send_buffer.send_closed = true;
|
||||
drop(state);
|
||||
|
||||
// TODO: notify the other side?
|
||||
|
||||
self.dst_sockets[node_idx].send(NodeEvent::Closed(TCP::new(self.clone(), node_idx as u8)));
|
||||
}
|
||||
|
||||
/// Get an event suitable for scheduling.
|
||||
fn as_event(self: &Arc<Self>) -> Box<NetworkEvent> {
|
||||
Box::new(NetworkEvent(self.clone()))
|
||||
}
|
||||
|
||||
pub fn deallocate(&self) {
|
||||
self.dst_sockets[0].clear();
|
||||
self.dst_sockets[1].clear();
|
||||
}
|
||||
}
|
||||
|
||||
struct NetworkBuffer {
|
||||
/// Messages paired with time of delivery
|
||||
buf: VecDeque<(u64, AnyMessage)>,
|
||||
/// True if the connection is closed on the receiving side,
|
||||
/// i.e. no more messages from the buffer will be delivered.
|
||||
recv_closed: bool,
|
||||
/// True if the connection is closed on the sending side,
|
||||
/// i.e. no more messages will be added to the buffer.
|
||||
send_closed: bool,
|
||||
/// Last time a message was delivered from the buffer.
|
||||
/// If None, it means that the server is the receiver and
|
||||
/// it is not yet aware of this connection (i.e. has not
|
||||
/// received the Accept).
|
||||
last_recv: Option<u64>,
|
||||
}
|
||||
|
||||
impl NetworkBuffer {
|
||||
fn new(last_recv: Option<u64>) -> Self {
|
||||
Self {
|
||||
buf: VecDeque::new(),
|
||||
recv_closed: false,
|
||||
send_closed: false,
|
||||
last_recv,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Simplistic simulation of a bidirectional network stream without reordering (TCP).
|
||||
/// There are almost no errors, writes are always successful (but may end up in void).
|
||||
/// Reads are implemented as messages in a shared queue, refer to [`NodeOs::network_epoll`]
|
||||
/// for details.
|
||||
///
|
||||
/// TCP struct is just a one side of a connection. To create a connection, use [`NodeOs::open_tcp`].
|
||||
#[derive(Clone)]
|
||||
pub struct TCP {
|
||||
conn: Arc<VirtualConnection>,
|
||||
dir: MessageDirection,
|
||||
}
|
||||
|
||||
impl Debug for TCP {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"TCP #{} {{{}=>{}}}",
|
||||
self.conn.connection_id,
|
||||
self.conn.nodes[self.dir as usize].id,
|
||||
self.conn.nodes[1 - self.dir as usize].id
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl TCP {
|
||||
pub fn new(conn: Arc<VirtualConnection>, dir: MessageDirection) -> TCP {
|
||||
TCP { conn, dir }
|
||||
}
|
||||
|
||||
/// Send a message to the other side. It's guaranteed that it will not arrive
|
||||
/// before the arrival of all messages sent earlier.
|
||||
pub fn send(&self, msg: AnyMessage) {
|
||||
self.conn.send(self.dir, msg);
|
||||
}
|
||||
|
||||
/// Receive a message. Blocks until a message is available. Can be used only
|
||||
/// with sockets opened with [`NodeOs::open_tcp_nopoll`].
|
||||
pub fn recv(&self) -> NodeEvent {
|
||||
// TODO: handle closed connection
|
||||
self.conn.internal_recv(self.dir as usize)
|
||||
}
|
||||
|
||||
pub fn id(&self) -> i64 {
|
||||
let positive: i64 = (self.conn.connection_id + 1) as i64;
|
||||
if self.dir == 0 {
|
||||
positive
|
||||
} else {
|
||||
-positive
|
||||
}
|
||||
}
|
||||
|
||||
pub fn connection_id(&self) -> u64 {
|
||||
self.conn.connection_id
|
||||
}
|
||||
|
||||
pub fn close(&self) {
|
||||
self.conn.close(self.dir as usize);
|
||||
}
|
||||
}
|
||||
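As a rough picture of the API surface (a sketch only; the tcp handle is assumed to come from the node OS defined in node_os.rs below):

    // Illustrative sketch only, not part of the commit.
    fn tcp_demo(tcp: &TCP) {
        // Queued with a randomized delivery delay; may silently vanish if the
        // simulated link decides to drop it.
        tcp.send(AnyMessage::Just32(7));
        println!("sent over connection #{}", tcp.connection_id());
        tcp.close(); // closes only this side; no further messages are delivered
    }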
171 safekeeper/src/simlib/node_os.rs Normal file
@@ -0,0 +1,171 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use rand::Rng;
|
||||
|
||||
use super::{
|
||||
chan::Chan,
|
||||
network::TCP,
|
||||
time::SendMessageEvent,
|
||||
world::{Node, NodeEvent, NodeId, World},
|
||||
};
|
||||
|
||||
/// Abstraction with all functions (aka syscalls) available to the node.
|
||||
#[derive(Clone)]
|
||||
pub struct NodeOs {
|
||||
world: Arc<World>,
|
||||
internal: Arc<Node>,
|
||||
}
|
||||
|
||||
impl NodeOs {
|
||||
pub fn new(world: Arc<World>, internal: Arc<Node>) -> NodeOs {
|
||||
NodeOs { world, internal }
|
||||
}
|
||||
|
||||
/// Get the node id.
|
||||
pub fn id(&self) -> NodeId {
|
||||
self.internal.id
|
||||
}
|
||||
|
||||
pub fn now(&self) -> u64 {
|
||||
self.world.now()
|
||||
}
|
||||
|
||||
/// Returns a writable pipe. All incoming messages should be polled
|
||||
/// with [`network_epoll`]. Always successful.
|
||||
pub fn open_tcp(&self, dst: NodeId) -> TCP {
|
||||
self.world.open_tcp(&self.internal, dst)
|
||||
}
|
||||
|
||||
/// Returns a readable and writable pipe. All incoming messages should
|
||||
/// be read from [`TCP`] object.
|
||||
pub fn open_tcp_nopoll(&self, dst: NodeId) -> TCP {
|
||||
self.world.open_tcp_nopoll(&self.internal, dst)
|
||||
}
|
||||
|
||||
/// Returns a channel to receive timers and events from the network.
|
||||
pub fn epoll(&self) -> Chan<NodeEvent> {
|
||||
self.internal.network_chan()
|
||||
}
|
||||
|
||||
/// Returns next event from the epoll channel with timeout.
|
||||
/// Returns `None` if timeout is reached.
|
||||
/// -1 – wait forever.
|
||||
/// 0 - poll, return immediately.
|
||||
/// >0 - wait for timeout milliseconds.
|
||||
pub fn epoll_recv(&self, timeout: i64) -> Option<NodeEvent> {
|
||||
let epoll = self.epoll();
|
||||
|
||||
let ready_event = loop {
|
||||
let event = epoll.try_recv();
|
||||
if let Some(NodeEvent::WakeTimeout(_)) = event {
|
||||
continue;
|
||||
}
|
||||
break event;
|
||||
};
|
||||
|
||||
if let Some(event) = ready_event {
|
||||
// return event if it's ready
|
||||
return Some(event);
|
||||
}
|
||||
|
||||
if timeout == 0 {
|
||||
// poll, return immediately
|
||||
return None;
|
||||
}
|
||||
|
||||
// or wait for timeout
|
||||
|
||||
let rand_nonce = self.random(u64::MAX);
|
||||
if timeout > 0 {
|
||||
self.world.schedule(
|
||||
timeout as u64,
|
||||
SendMessageEvent::new(epoll.clone(), NodeEvent::WakeTimeout(rand_nonce)),
|
||||
);
|
||||
}
|
||||
|
||||
loop {
|
||||
match epoll.recv() {
|
||||
NodeEvent::WakeTimeout(nonce) if nonce == rand_nonce => {
|
||||
return None;
|
||||
}
|
||||
NodeEvent::WakeTimeout(_) => {}
|
||||
event => {
|
||||
return Some(event);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
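The three timeout modes described above, spelled out (illustrative only):

    // Illustrative sketch only, not part of the commit.
    fn epoll_demo(os: &NodeOs) {
        let _ = os.epoll_recv(0);   // poll: returns immediately, possibly None
        let _ = os.epoll_recv(50);  // wait up to 50 ms of simulated time
        let ev = os.epoll_recv(-1); // block until the next event arrives
        if let Some(NodeEvent::Message((msg, tcp))) = ev {
            println!("got {:?} on {:?}", msg, tcp);
        }
    }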
/// Same as epoll_recv, but does not remove the event from the queue.
|
||||
pub fn epoll_peek(&self, timeout: i64) -> Option<NodeEvent> {
|
||||
let epoll = self.epoll();
|
||||
|
||||
let ready_event = loop {
|
||||
let event = epoll.try_peek();
|
||||
if let Some(NodeEvent::WakeTimeout(_)) = event {
|
||||
assert!(epoll.try_recv().is_some());
|
||||
continue;
|
||||
}
|
||||
break event;
|
||||
};
|
||||
|
||||
if let Some(event) = ready_event {
|
||||
// return event if it's ready
|
||||
return Some(event);
|
||||
}
|
||||
|
||||
if timeout == 0 {
|
||||
// poll, return immediately
|
||||
return None;
|
||||
}
|
||||
|
||||
// or wait for timeout
|
||||
|
||||
let rand_nonce = self.random(u64::MAX);
|
||||
if timeout > 0 {
|
||||
self.world.schedule(
|
||||
timeout as u64,
|
||||
SendMessageEvent::new(epoll.clone(), NodeEvent::WakeTimeout(rand_nonce)),
|
||||
);
|
||||
}
|
||||
|
||||
loop {
|
||||
match epoll.peek() {
|
||||
NodeEvent::WakeTimeout(nonce) if nonce == rand_nonce => {
|
||||
assert!(epoll.try_recv().is_some());
|
||||
return None;
|
||||
}
|
||||
NodeEvent::WakeTimeout(_) => {
|
||||
assert!(epoll.try_recv().is_some());
|
||||
}
|
||||
event => {
|
||||
return Some(event);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Sleep for a given number of milliseconds.
|
||||
/// Currently matches the global virtual time, TODO may be good to
|
||||
/// introduce a separate clock for each node.
|
||||
pub fn sleep(&self, ms: u64) {
|
||||
let chan: Chan<()> = Chan::new();
|
||||
self.world
|
||||
.schedule(ms, SendMessageEvent::new(chan.clone(), ()));
|
||||
chan.recv();
|
||||
}
|
||||
|
||||
/// Generate a random number in range [0, max).
|
||||
pub fn random(&self, max: u64) -> u64 {
|
||||
self.internal.rng.lock().gen_range(0..max)
|
||||
}
|
||||
|
||||
/// Set the result for the current node.
|
||||
pub fn set_result(&self, code: i32, result: String) {
|
||||
*self.internal.result.lock() = (code, result);
|
||||
}
|
||||
|
||||
pub fn log_event(&self, data: String) {
|
||||
self.world.add_event(self.id(), data)
|
||||
}
|
||||
}
|
||||
38 safekeeper/src/simlib/proto.rs Normal file
@@ -0,0 +1,38 @@
use std::fmt::Debug;

use bytes::Bytes;
use utils::lsn::Lsn;

/// All possible flavours of messages.
/// Grouped by the receiver node.
#[derive(Clone)]
pub enum AnyMessage {
    /// Not used, empty placeholder.
    None,
    /// Used internally for notifying node about new incoming connection.
    InternalConnect,
    Just32(u32),
    ReplCell(ReplCell),
    Bytes(Bytes),
    LSN(u64),
}

impl Debug for AnyMessage {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            AnyMessage::None => write!(f, "None"),
            AnyMessage::InternalConnect => write!(f, "InternalConnect"),
            AnyMessage::Just32(v) => write!(f, "Just32({})", v),
            AnyMessage::ReplCell(v) => write!(f, "ReplCell({:?})", v),
            AnyMessage::Bytes(v) => write!(f, "Bytes({})", hex::encode(v)),
            AnyMessage::LSN(v) => write!(f, "LSN({})", Lsn(*v)),
        }
    }
}

#[derive(Clone, Debug)]
pub struct ReplCell {
    pub value: u32,
    pub client_id: u32,
    pub seqno: u32,
}
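The custom Debug impl keeps payloads readable in simulation logs, e.g. (a sketch):

    // Illustrative sketch only, not part of the commit.
    use bytes::Bytes;

    fn proto_demo() {
        let m = AnyMessage::Bytes(Bytes::from_static(b"\x01\x02"));
        assert_eq!(format!("{:?}", m), "Bytes(0102)"); // hex-encoded payload
        let r = AnyMessage::ReplCell(ReplCell { value: 7, client_id: 1, seqno: 3 });
        println!("{:?}", r);
    }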
273 safekeeper/src/simlib/sync.rs Normal file
@@ -0,0 +1,273 @@
|
||||
use std::{backtrace::Backtrace, sync::Arc};
|
||||
|
||||
use tracing::debug;
|
||||
|
||||
use super::world::{Node, NodeId, World};
|
||||
|
||||
pub type Mutex<T> = parking_lot::Mutex<T>;
|
||||
|
||||
/// More deterministic condvar. Determinism comes from the fact that
|
||||
/// at all times there is at most one running thread.
|
||||
pub struct Condvar {
|
||||
waiters: Mutex<CondvarState>,
|
||||
}
|
||||
|
||||
struct CondvarState {
|
||||
waiters: Vec<Arc<Park>>,
|
||||
}
|
||||
|
||||
impl Condvar {
|
||||
pub fn new() -> Condvar {
|
||||
Condvar {
|
||||
waiters: Mutex::new(CondvarState {
|
||||
waiters: Vec::new(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// Blocks the current thread until this condition variable receives a notification.
|
||||
pub fn wait<'a, T>(&self, guard: &mut parking_lot::MutexGuard<'a, T>) {
|
||||
let park = Park::new(false);
|
||||
|
||||
// add the waiter to the list
|
||||
self.waiters.lock().waiters.push(park.clone());
|
||||
|
||||
parking_lot::MutexGuard::unlocked(guard, || {
|
||||
// part the thread, it will be woken up by notify_one or notify_all
|
||||
park.park();
|
||||
});
|
||||
}
|
||||
|
||||
/// Wakes up all blocked threads on this condvar, can be called only from the node thread.
|
||||
pub fn notify_all(&self) {
|
||||
// TODO: check that it's waked up in random order and yield to the scheduler
|
||||
|
||||
let mut waiters = self.waiters.lock().waiters.drain(..).collect::<Vec<_>>();
|
||||
for waiter in waiters.drain(..) {
|
||||
// block (park) the current thread, wake the other thread
|
||||
waiter.wake();
|
||||
}
|
||||
}
|
||||
|
||||
/// Wakes up one blocked thread on this condvar. Usually can be called only from the node thread,
|
||||
/// because we have a global running threads counter and we transfer it from the current thread
|
||||
/// to the woken up thread. But we have a HACK here to allow calling it from the world thread.
|
||||
pub fn notify_one(&self) {
|
||||
// TODO: wake up random thread
|
||||
|
||||
let to_wake = self.waiters.lock().waiters.pop();
|
||||
|
||||
if Node::is_node_thread() {
|
||||
if let Some(waiter) = to_wake {
|
||||
// block (park) the current thread, wake the other thread
|
||||
waiter.wake();
|
||||
} else {
|
||||
// block (park) the current thread just in case
|
||||
Park::yield_thread()
|
||||
}
|
||||
} else {
|
||||
// HACK: custom notify_one implementation for the world thread
|
||||
if let Some(waiter) = to_wake {
|
||||
// block (park) the current thread, wake the other thread
|
||||
waiter.external_wake();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A tool to block (park) a current thread until it will be woken up.
|
||||
pub struct Park {
|
||||
lock: Mutex<ParkState>,
|
||||
cvar: parking_lot::Condvar,
|
||||
}
|
||||
|
||||
struct ParkState {
|
||||
/// False means that thread cannot continue without external signal,
|
||||
/// i.e. waiting for some event to happen.
|
||||
can_continue: bool,
|
||||
/// False means that thread is unconditionally parked and waiting for
|
||||
/// world simulation to wake it up. True means that the parking is
|
||||
/// finished and the thread can continue.
|
||||
finished: bool,
|
||||
/// True means that the thread should wake up and panic.
|
||||
panic: bool,
|
||||
node_id: Option<NodeId>,
|
||||
backtrace: Option<Backtrace>,
|
||||
}
|
||||
|
||||
impl Park {
|
||||
pub fn new(can_continue: bool) -> Arc<Park> {
|
||||
Arc::new(Park {
|
||||
lock: Mutex::new(ParkState {
|
||||
can_continue,
|
||||
finished: false,
|
||||
panic: false,
|
||||
node_id: None,
|
||||
backtrace: None,
|
||||
}),
|
||||
cvar: parking_lot::Condvar::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn init_state(state: &mut ParkState, node: &Arc<Node>) {
|
||||
state.node_id = Some(node.id);
|
||||
state.backtrace = Some(Backtrace::capture());
|
||||
}
|
||||
|
||||
/// Should be called once by the waiting thread. Blocks the thread until wake() is called,
|
||||
/// and until the thread is woken up by the world simulation.
|
||||
pub fn park(self: &Arc<Self>) {
|
||||
let node = Node::current();
|
||||
|
||||
// start blocking
|
||||
let mut state = self.lock.lock();
|
||||
Self::init_state(&mut state, &node);
|
||||
|
||||
if state.can_continue {
|
||||
// unconditional parking
|
||||
|
||||
parking_lot::MutexGuard::unlocked(&mut state, || {
|
||||
// first put to world parking, then decrease the running threads counter
|
||||
node.internal_parking_middle(self.clone());
|
||||
});
|
||||
} else {
|
||||
parking_lot::MutexGuard::unlocked(&mut state, || {
|
||||
// conditional parking, decrease the running threads counter without parking
|
||||
node.internal_parking_start(self.clone());
|
||||
});
|
||||
|
||||
// wait for condition
|
||||
while !state.can_continue {
|
||||
self.cvar.wait(&mut state);
|
||||
}
|
||||
|
||||
if state.panic {
|
||||
panic!("thread was crashed by the simulation");
|
||||
}
|
||||
|
||||
// condition is met, we are now running instead of the waker thread.
|
||||
// the next thing is to park the thread in the world, then decrease
|
||||
// the running threads counter
|
||||
node.internal_parking_middle(self.clone());
|
||||
}
|
||||
|
||||
self.park_wait_the_world(node, &mut state);
|
||||
}
|
||||
|
||||
fn park_wait_the_world(&self, node: Arc<Node>, state: &mut parking_lot::MutexGuard<ParkState>) {
|
||||
// condition is met, wait for world simulation to wake us up
|
||||
while !state.finished {
|
||||
self.cvar.wait(state);
|
||||
}
|
||||
|
||||
if state.panic {
|
||||
panic!("node {} was crashed by the simulation", node.id);
|
||||
}
|
||||
|
||||
// We are the only running thread now, we just need to update the state,
|
||||
// and continue the execution.
|
||||
node.internal_parking_end();
|
||||
}
|
||||
|
||||
/// Hacky way to register parking before the thread is actually blocked.
|
||||
fn park_ahead_now() -> Arc<Park> {
|
||||
let park = Park::new(true);
|
||||
let node = Node::current();
|
||||
Self::init_state(&mut park.lock.lock(), &node);
|
||||
node.internal_parking_ahead(park.clone());
|
||||
park
|
||||
}
|
||||
|
||||
/// Will wake up the thread that is currently conditionally parked. Can be called only
|
||||
/// from the node thread, because it will block the caller thread. What it will do:
|
||||
/// 1. Park the thread that called wake() in the world
|
||||
/// 2. Wake up the waiting thread (it will also park in the world)
|
||||
/// 3. Block the thread that called wake()
|
||||
pub fn wake(&self) {
|
||||
// parking the thread that called wake()
|
||||
let self_park = Park::park_ahead_now();
|
||||
|
||||
let mut state = self.lock.lock();
|
||||
if state.can_continue {
|
||||
debug!(
|
||||
"WARN wake() called on a thread that is already waked, node {:?}",
|
||||
state.node_id
|
||||
);
|
||||
} else {
|
||||
state.can_continue = true;
|
||||
// and here we park the waiting thread
|
||||
self.cvar.notify_all();
|
||||
}
|
||||
drop(state);
|
||||
|
||||
// and here we block the thread that called wake() by defer
|
||||
let node = Node::current();
|
||||
let mut state = self_park.lock.lock();
|
||||
self_park.park_wait_the_world(node, &mut state);
|
||||
}
|
||||
|
||||
/// Will wake up the thread that is currently conditionally parked. Can be called only
|
||||
/// from the world threads. What it will do:
|
||||
/// 1. Increase the running threads counter
|
||||
/// 2. Wake up the waiting thread (it will park itself in the world)
|
||||
pub fn external_wake(&self) {
|
||||
let world = World::current();
|
||||
|
||||
let mut state = self.lock.lock();
|
||||
if state.can_continue {
|
||||
debug!(
|
||||
"WARN external_wake() called on a thread that is already waked, node {:?}",
|
||||
state.node_id
|
||||
);
|
||||
return;
|
||||
}
|
||||
world.internal_parking_wake();
|
||||
state.can_continue = true;
|
||||
// and here we park the waiting thread
|
||||
self.cvar.notify_all();
|
||||
drop(state);
|
||||
}
|
||||
|
||||
/// Will wake up the thread that is currently unconditionally parked.
|
||||
pub fn internal_world_wake(&self) {
|
||||
let mut state = self.lock.lock();
|
||||
if state.finished {
|
||||
debug!(
|
||||
"WARN internal_world_wake() called on a thread that is already waked, node {:?}",
|
||||
state.node_id
|
||||
);
|
||||
return;
|
||||
}
|
||||
state.finished = true;
|
||||
self.cvar.notify_all();
|
||||
}
|
||||
|
||||
/// Will wake up thread to panic instantly.
|
||||
pub fn crash_panic(&self) {
|
||||
let mut state = self.lock.lock();
|
||||
state.can_continue = true;
|
||||
state.finished = true;
|
||||
state.panic = true;
|
||||
self.cvar.notify_all();
|
||||
drop(state);
|
||||
}
|
||||
|
||||
/// Print debug info about the parked thread.
|
||||
pub fn debug_print(&self) {
|
||||
// let state = self.lock.lock();
|
||||
// debug!("PARK: node {:?} wake1={} wake2={}", state.node_id, state.can_continue, state.finished);
|
||||
// debug!("DEBUG: node {:?} wake1={} wake2={}, trace={:?}", state.node_id, state.can_continue, state.finished, state.backtrace);
|
||||
}
|
||||
|
||||
/// It feels that this function can cause deadlocks.
|
||||
pub fn node_id(&self) -> Option<NodeId> {
|
||||
let state = self.lock.lock();
|
||||
state.node_id
|
||||
}
|
||||
|
||||
/// Yield the current thread to the world simulation.
|
||||
pub fn yield_thread() {
|
||||
let park = Park::new(true);
|
||||
park.park();
|
||||
}
|
||||
}
|
||||
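The deterministic Condvar above is used like a regular condvar, but only from inside node threads (Park::park() calls Node::current()). A rough usage sketch under that assumption, with a hypothetical shared queue:

    // Hypothetical: a Mutex/Condvar pair shared between two node threads.
    // The consumer waits until the producer pushes a value and calls notify_one().
    struct Shared {
        queue: Mutex<Vec<u32>>,
        cond: Condvar,
    }

    fn consume(shared: &Shared) -> u32 {
        let mut q = shared.queue.lock();
        while q.is_empty() {
            // releases the lock, parks the node thread, re-locks on wakeup
            shared.cond.wait(&mut q);
        }
        q.remove(0)
    }

    fn produce(shared: &Shared, v: u32) {
        shared.queue.lock().push(v);
        // transfers control to one waiter; the caller parks itself in the world
        shared.cond.notify_one();
    }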
155
safekeeper/src/simlib/time.rs
Normal file
@@ -0,0 +1,155 @@
use std::{cmp::Ordering, collections::BinaryHeap, fmt::Debug, sync::Arc};

use super::{chan::Chan, network::VirtualConnection};

pub struct Timing {
    /// Current world's time.
    current_time: u64,
    /// Pending timers.
    timers: BinaryHeap<Pending>,
    /// Global nonce.
    nonce: u32,
}

impl Timing {
    pub fn new() -> Timing {
        Timing {
            current_time: 0,
            timers: BinaryHeap::new(),
            nonce: 0,
        }
    }

    /// Return the current world's time.
    pub fn now(&self) -> u64 {
        self.current_time
    }

    /// Tick-tock the global clock. Return the event ready to be processed,
    /// or move the clock forward and then return the event.
    pub fn step(&mut self) -> Option<Pending> {
        if self.timers.is_empty() {
            // no future events
            return None;
        }

        if !self.is_event_ready() {
            let next_time = self.timers.peek().unwrap().time;
            self.current_time = next_time;
            assert!(self.is_event_ready());
        }

        self.timers.pop()
    }

    /// TODO: write docs
    pub fn schedule_future(&mut self, ms: u64, event: Box<dyn Event + Send + Sync>) {
        self.nonce += 1;
        let nonce = self.nonce;
        self.timers.push(Pending {
            time: self.current_time + ms,
            nonce,
            event,
        })
    }

    /// Return true if there is a ready event.
    fn is_event_ready(&self) -> bool {
        self.timers
            .peek()
            .map_or(false, |x| x.time <= self.current_time)
    }

    pub fn clear(&mut self) {
        self.timers.clear();
    }
}

pub struct Pending {
    pub time: u64,
    pub nonce: u32,
    pub event: Box<dyn Event + Send + Sync>,
}

impl Pending {
    pub fn process(&self) {
        self.event.process();
    }
}

// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
// to get that.
impl PartialOrd for Pending {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        (other.time, other.nonce).partial_cmp(&(self.time, self.nonce))
    }
}

impl Ord for Pending {
    fn cmp(&self, other: &Self) -> Ordering {
        (other.time, other.nonce).cmp(&(self.time, self.nonce))
    }
}

impl PartialEq for Pending {
    fn eq(&self, other: &Self) -> bool {
        (other.time, other.nonce) == (self.time, self.nonce)
    }
}

impl Eq for Pending {}

pub trait Event: Debug {
    fn process(&self);
}

pub struct SendMessageEvent<T: Debug + Clone> {
    chan: Chan<T>,
    msg: T,
}

impl<T: Debug + Clone> SendMessageEvent<T> {
    pub fn new(chan: Chan<T>, msg: T) -> Box<SendMessageEvent<T>> {
        Box::new(SendMessageEvent { chan, msg })
    }
}

impl<T: Debug + Clone> Event for SendMessageEvent<T> {
    fn process(&self) {
        self.chan.send(self.msg.clone());
    }
}

impl<T: Debug + Clone> Debug for SendMessageEvent<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // TODO: add more context about receiver channel
        f.debug_struct("SendMessageEvent")
            .field("msg", &self.msg)
            .finish()
    }
}

pub struct NetworkEvent(pub Arc<VirtualConnection>);

impl Event for NetworkEvent {
    fn process(&self) {
        self.0.process();
    }
}

impl Debug for NetworkEvent {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Network")
            .field("conn", &self.0.connection_id)
            .field("node[0]", &self.0.nodes[0].id)
            .field("node[1]", &self.0.nodes[1].id)
            .finish()
    }
}

#[derive(Copy, Clone, Debug)]
pub struct EmptyEvent;

impl Event for EmptyEvent {
    fn process(&self) {}
}
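A self-contained check (illustrative, not part of the diff) that the reversed `Ord` above really makes `BinaryHeap<Pending>` pop timers earliest-first, with the nonce preserving insertion order for equal deadlines. It only assumes the Pending and EmptyEvent definitions from time.rs.

    use std::collections::BinaryHeap;

    let mut heap: BinaryHeap<Pending> = BinaryHeap::new();
    for (time, nonce) in [(30u64, 1u32), (10, 2), (10, 3), (20, 4)] {
        heap.push(Pending { time, nonce, event: Box::new(EmptyEvent) });
    }
    let order: Vec<(u64, u32)> = std::iter::from_fn(|| heap.pop())
        .map(|p| (p.time, p.nonce))
        .collect();
    // Earliest deadline first; equal deadlines come out in nonce (insertion) order.
    assert_eq!(order, vec![(10, 2), (10, 3), (20, 4), (30, 1)]);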
54
safekeeper/src/simlib/wait_group.rs
Normal file
@@ -0,0 +1,54 @@
use std::sync::{Arc, Condvar, Mutex};

/// This is a custom waitgroup for internal use; it shouldn't be used by user code.
#[derive(Clone)]
pub struct WaitGroup {
    inner: Arc<Inner>,
}

/// Inner state of a `WaitGroup`.
struct Inner {
    // using the std condvar
    cvar: Condvar,
    count: Mutex<i32>,
}

impl Default for WaitGroup {
    fn default() -> Self {
        Self {
            inner: Arc::new(Inner {
                cvar: Condvar::new(),
                count: Mutex::new(0),
            }),
        }
    }
}

impl WaitGroup {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn wait(&self) {
        if *self.inner.count.lock().unwrap() <= 0 {
            return;
        }

        let mut count = self.inner.count.lock().unwrap();
        while *count > 0 {
            count = self.inner.cvar.wait(count).unwrap();
        }
    }

    pub fn add(&self, delta: i32) {
        let mut count = self.inner.count.lock().unwrap();
        *count += delta;
        if *count <= 0 {
            self.inner.cvar.notify_all();
        }
    }

    pub fn done(&self) {
        self.add(-1);
    }
}
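Typical waitgroup usage, shown here with plain std threads just to illustrate the counter semantics (inside the simulator the counter tracks runnable node threads rather than ad-hoc work items):

    // Sketch: track two worker threads and block until both call done().
    let wg = WaitGroup::new();
    let mut handles = Vec::new();
    for i in 0..2 {
        wg.add(1);
        let wg = wg.clone();
        handles.push(std::thread::spawn(move || {
            // ... do some work for worker `i` ...
            let _ = i;
            wg.done(); // decrements the counter; notifies waiters when it reaches zero
        }));
    }
    wg.wait(); // returns once the counter is back to zero
    for h in handles {
        h.join().unwrap();
    }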
536
safekeeper/src/simlib/world.rs
Normal file
@@ -0,0 +1,536 @@
use rand::{rngs::StdRng, Rng, SeedableRng};
use std::{
    cell::RefCell,
    ops::DerefMut,
    panic::AssertUnwindSafe,
    sync::{atomic::AtomicU64, Arc},
};
use tracing::{debug, info, trace};

use super::{
    chan::Chan,
    network::{NetworkOptions, VirtualConnection, TCP},
    node_os::NodeOs,
    proto::AnyMessage,
    sync::{Mutex, Park},
    time::{Event, Timing},
    wait_group::WaitGroup,
};

pub type NodeId = u32;

/// Full world simulation state, shared between all nodes.
pub struct World {
    nodes: Mutex<Vec<Arc<Node>>>,

    /// List of parked threads, to be woken up by the world simulation.
    unconditional_parking: Mutex<Vec<Arc<Park>>>,

    /// Counter for running threads. Generally should not be more than 1 if you want
    /// to get a deterministic simulation. 0 means that all threads are parked or finished.
    wait_group: WaitGroup,

    /// Random number generator.
    rng: Mutex<StdRng>,

    /// Timers and stuff.
    timing: Mutex<Timing>,

    /// Network connection counter.
    connection_counter: AtomicU64,

    /// Network options.
    network_options: Arc<NetworkOptions>,

    /// Optional function to initialize nodes right after thread creation.
    nodes_init: Option<Box<dyn Fn(NodeOs) + Send + Sync>>,

    /// Internal event log.
    events: Mutex<Vec<SEvent>>,

    /// Connections.
    connections: Mutex<Vec<Arc<VirtualConnection>>>,
}

impl World {
    pub fn new(
        seed: u64,
        network_options: Arc<NetworkOptions>,
        nodes_init: Option<Box<dyn Fn(NodeOs) + Send + Sync>>,
    ) -> World {
        World {
            nodes: Mutex::new(Vec::new()),
            unconditional_parking: Mutex::new(Vec::new()),
            wait_group: WaitGroup::new(),
            rng: Mutex::new(StdRng::seed_from_u64(seed)),
            timing: Mutex::new(Timing::new()),
            connection_counter: AtomicU64::new(0),
            network_options,
            nodes_init,
            events: Mutex::new(Vec::new()),
            connections: Mutex::new(Vec::new()),
        }
    }

    /// Create a new random number generator.
    pub fn new_rng(&self) -> StdRng {
        let mut rng = self.rng.lock();
        StdRng::from_rng(rng.deref_mut()).unwrap()
    }

    /// Create a new node.
    pub fn new_node(self: &Arc<Self>) -> Arc<Node> {
        let mut nodes = self.nodes.lock();
        let id = nodes.len() as NodeId;
        let node = Arc::new(Node::new(id, self.clone(), self.new_rng()));
        nodes.push(node.clone());
        node
    }

    /// Register the world for the current thread. This is required before calling
    /// step().
    pub fn register_world(self: &Arc<Self>) {
        CURRENT_WORLD.with(|world| {
            *world.borrow_mut() = Some(self.clone());
        });
    }

    /// Get an internal node state by id.
    pub fn get_node(&self, id: NodeId) -> Option<Arc<Node>> {
        let nodes = self.nodes.lock();
        let num = id as usize;
        if num < nodes.len() {
            Some(nodes[num].clone())
        } else {
            None
        }
    }

    pub fn stop_all(&self) {
        let nodes = self.nodes.lock().clone();
        for node in nodes {
            node.crash_stop();
        }
    }

    /// Returns a writable end of a TCP connection, to send src->dst messages.
    pub fn open_tcp(self: &Arc<World>, src: &Arc<Node>, dst: NodeId) -> TCP {
        // TODO: replace unwrap() with a /dev/null socket.
        let dst = self.get_node(dst).unwrap();

        let id = self
            .connection_counter
            .fetch_add(1, std::sync::atomic::Ordering::SeqCst);

        let conn = VirtualConnection::new(
            id,
            self.clone(),
            src.network_chan(),
            dst.network_chan(),
            src.clone(),
            dst,
            self.network_options.clone(),
        );

        // MessageDirection(0) is src->dst
        TCP::new(conn, 0)
    }

    pub fn open_tcp_nopoll(self: &Arc<World>, src: &Arc<Node>, dst: NodeId) -> TCP {
        // TODO: replace unwrap() with a /dev/null socket.
        let dst = self.get_node(dst).unwrap();

        let id = self
            .connection_counter
            .fetch_add(1, std::sync::atomic::Ordering::SeqCst);

        let conn = VirtualConnection::new(
            id,
            self.clone(),
            Chan::new(), // creating a new channel to read from
            dst.network_chan(),
            src.clone(),
            dst,
            self.network_options.clone(),
        );

        // MessageDirection(0) is src->dst
        TCP::new(conn, 0)
    }

    /// Blocks the current thread until all nodes park or finish.
    pub fn await_all(&self) {
        self.wait_group.wait();
    }

    /// Take a random unconditionally parked thread and return it.
    fn thread_to_unpark(&self) -> Option<Arc<Park>> {
        let mut parking = self.unconditional_parking.lock();
        if parking.is_empty() {
            // nothing to do, all threads have finished
            return None;
        }

        let chosen_one = self.rng.lock().gen_range(0..parking.len());
        let park = parking.swap_remove(chosen_one);
        drop(parking);
        Some(park)
    }

    pub fn step(&self) -> bool {
        self.await_all();

        // First try to wake up an unconditionally parked thread.
        let to_resume = self.thread_to_unpark();
        if let Some(park) = to_resume {
            // debug!("Waking up park at node {:?}", park.node_id());

            // Wake up the chosen thread. To do that:
            // 1. Increment the counter of running threads.
            // 2. Send a signal to continue the thread.
            self.wait_group.add(1);
            park.internal_world_wake();

            // to have a clean state after each step, wait for all threads to finish
            self.await_all();
            return true;
        }

        // Otherwise, all threads are probably waiting for some event.
        // We'll try to advance virtual time to the next available event.
        //
        // This way all code running in the simulation is considered to be
        // instant in terms of "virtual time", and time is advanced only
        // when code is waiting for external events.
        let time_event = self.timing.lock().step();
        if let Some(event) = time_event {
            // debug!("Processing event: {:?}", event.event);
            event.process();

            // to have a clean state after each step, wait for all threads to finish
            self.await_all();
            return true;
        }

        false
    }

    /// Print the full world state to stdout.
    pub fn debug_print_state(&self) {
        debug!(
            "World state, nodes.len()={:?}, parking.len()={:?}",
            self.nodes.lock().len(),
            self.unconditional_parking.lock().len()
        );
        for node in self.nodes.lock().iter() {
            debug!("node id={:?} status={:?}", node.id, node.status.lock());
        }
        for park in self.unconditional_parking.lock().iter() {
            park.debug_print();
        }
    }

    /// Schedule an event to be processed after `ms` milliseconds of global time.
    pub fn schedule(&self, ms: u64, e: Box<dyn Event + Send + Sync>) {
        let mut timing = self.timing.lock();
        timing.schedule_future(ms, e);
    }

    /// Get the current time.
    pub fn now(&self) -> u64 {
        let timing = self.timing.lock();
        timing.now()
    }

    /// Get the current world, panics if called from outside of a world thread.
    pub fn current() -> Arc<World> {
        CURRENT_WORLD.with(|world| {
            world
                .borrow()
                .as_ref()
                .expect("World::current() called from outside of a world thread")
                .clone()
        })
    }

    pub fn internal_parking_wake(&self) {
        // waking a node with a condition, increase the running threads counter
        self.wait_group.add(1);
    }

    fn find_parked_node(&self, node: &Node) -> Option<Arc<Park>> {
        let mut parking = self.unconditional_parking.lock();
        let mut found: Option<usize> = None;
        for (i, park) in parking.iter().enumerate() {
            if park.node_id() == Some(node.id) {
                if found.is_some() {
                    panic!("found more than one parked thread for node {}", node.id);
                }
                found = Some(i);
            }
        }
        Some(parking.swap_remove(found?))
    }

    pub fn add_event(&self, node: NodeId, data: String) {
        let time = self.now();
        self.events.lock().push(SEvent { time, node, data });
    }

    pub fn take_events(&self) -> Vec<SEvent> {
        let mut events = self.events.lock();
        let mut res = Vec::new();
        std::mem::swap(&mut res, &mut events);
        res
    }

    pub fn add_conn(&self, conn: Arc<VirtualConnection>) {
        self.connections.lock().push(conn);
    }

    pub fn deallocate(&self) {
        self.stop_all();

        self.timing.lock().clear();
        self.unconditional_parking.lock().clear();

        let mut connections = Vec::new();
        std::mem::swap(&mut connections, &mut self.connections.lock());
        for conn in connections {
            conn.deallocate();
            trace!("conn strong count: {}", Arc::strong_count(&conn));
        }

        let mut nodes = Vec::new();
        std::mem::swap(&mut nodes, &mut self.nodes.lock());

        let mut weak_ptrs = Vec::new();
        for node in nodes {
            node.deallocate();
            weak_ptrs.push(Arc::downgrade(&node));
        }

        for weak_ptr in weak_ptrs {
            let node = weak_ptr.upgrade();
            if node.is_none() {
                trace!("node is already deallocated");
                continue;
            }
            let node = node.unwrap();
            debug!("node strong count: {}", Arc::strong_count(&node));
        }

        self.events.lock().clear();
    }
}

thread_local! {
    pub static CURRENT_NODE: RefCell<Option<Arc<Node>>> = RefCell::new(None);
    pub static CURRENT_WORLD: RefCell<Option<Arc<World>>> = RefCell::new(None);
}

/// Internal node state.
pub struct Node {
    pub id: NodeId,
    network: Mutex<Chan<NodeEvent>>,
    status: Mutex<NodeStatus>,
    waiting_park: Mutex<Arc<Park>>,
    world: Arc<World>,
    join_handle: Mutex<Option<std::thread::JoinHandle<()>>>,
    pub rng: Mutex<StdRng>,
    /// Every node can set a result string, which can be read by the test.
    pub result: Mutex<(i32, String)>,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum NodeStatus {
    NotStarted,
    Running,
    Waiting,
    Parked,
    Finished,
    Failed,
}

impl Node {
    pub fn new(id: NodeId, world: Arc<World>, rng: StdRng) -> Node {
        Node {
            id,
            network: Mutex::new(Chan::new()),
            status: Mutex::new(NodeStatus::NotStarted),
            waiting_park: Mutex::new(Park::new(false)),
            world,
            join_handle: Mutex::new(None),
            rng: Mutex::new(rng),
            result: Mutex::new((-1, String::new())),
        }
    }

    /// Set a code to run in this node thread.
    pub fn launch(self: &Arc<Self>, f: impl FnOnce(NodeOs) + Send + 'static) {
        let node = self.clone();
        let world = self.world.clone();
        world.wait_group.add(1);
        let join_handle = std::thread::spawn(move || {
            CURRENT_NODE.with(|current_node| {
                *current_node.borrow_mut() = Some(node.clone());
            });

            let wg = world.wait_group.clone();
            scopeguard::defer! {
                wg.done();
            }

            let mut status = node.status.lock();
            if *status != NodeStatus::NotStarted && *status != NodeStatus::Finished {
                // clearly a caller bug, should never happen
                panic!("node {} is already running", node.id);
            }
            *status = NodeStatus::Running;
            drop(status);

            let res = std::panic::catch_unwind(AssertUnwindSafe(|| {
                // park the current thread, [`launch`] will wait until it's parked
                Park::yield_thread();

                if let Some(nodes_init) = world.nodes_init.as_ref() {
                    nodes_init(NodeOs::new(world.clone(), node.clone()));
                }

                f(NodeOs::new(world, node.clone()));
            }));
            match res {
                Ok(_) => {
                    debug!("Node {} finished successfully", node.id);
                }
                Err(e) => {
                    debug!("Node {} finished with panic: {:?}", node.id, e);
                }
            }

            let mut status = node.status.lock();
            *status = NodeStatus::Finished;
        });
        *self.join_handle.lock() = Some(join_handle);

        // we need to wait for the thread to park, to ensure that threads
        // are parked in a deterministic order
        self.world.wait_group.wait();
    }

    /// Returns a channel to receive events from the network.
    pub fn network_chan(&self) -> Chan<NodeEvent> {
        self.network.lock().clone()
    }

    pub fn internal_parking_start(&self, park: Arc<Park>) {
        // The node started parking (waiting for a condition), and the current thread
        // is the only one running, so we need to do:
        // 1. Change the node status to Waiting
        // 2. Decrease the running threads counter
        // 3. Block the current thread until it's woken up (outside this function)
        *self.status.lock() = NodeStatus::Waiting;
        *self.waiting_park.lock() = park;
        self.world.wait_group.done();
    }

    pub fn internal_parking_middle(&self, park: Arc<Park>) {
        // [`park`] entered the unconditional_parking state, and the current thread
        // is the only one running, so we need to do:
        // 1. Change the node status to Parked
        // 2. Park in the world list
        // 3. Decrease the running threads counter
        // 4. Block the current thread until it's woken up (outside this function)
        *self.status.lock() = NodeStatus::Parked;
        self.world.unconditional_parking.lock().push(park);
        self.world.wait_group.done();
    }

    pub fn internal_parking_ahead(&self, park: Arc<Park>) {
        // [`park`] entered the unconditional_parking state, and the current thread
        // wants to transfer control to another thread, so we need to do:
        // 1. Change the node status to Parked
        // 2. Park in the world list
        // 3. Notify the other thread to continue
        // 4. Block the current thread until it's woken up (outside this function)
        *self.status.lock() = NodeStatus::Parked;
        self.world.unconditional_parking.lock().push(park);
    }

    pub fn internal_parking_end(&self) {
        // the node finished parking, now it's running again
        *self.status.lock() = NodeStatus::Running;
    }

    /// Get the current node, panics if called from outside of a node thread.
    pub fn current() -> Arc<Node> {
        CURRENT_NODE.with(|current_node| current_node.borrow().clone().unwrap())
    }

    pub fn is_node_thread() -> bool {
        CURRENT_NODE.with(|current_node| current_node.borrow().is_some())
    }

    pub fn is_finished(&self) -> bool {
        let status = self.status.lock();
        *status == NodeStatus::Finished
    }

    pub fn crash_stop(self: &Arc<Self>) {
        self.world.await_all();

        let status = self.status.lock().clone();
        match status {
            NodeStatus::NotStarted | NodeStatus::Finished | NodeStatus::Failed => return,
            NodeStatus::Running => {
                panic!("crash unexpected node state: Running")
            }
            NodeStatus::Waiting | NodeStatus::Parked => {}
        }

        debug!("Node {} is crashing, status={:?}", self.id, status);

        let park = self.world.find_parked_node(self);

        let park = if park.is_some() {
            assert!(status == NodeStatus::Parked);
            park.unwrap()
        } else {
            assert!(status == NodeStatus::Waiting);
            self.waiting_park.lock().clone()
        };

        park.debug_print();
        // self.world.debug_print_state();

        // unplug the old network socket, and create a new one
        *self.network.lock() = Chan::new();

        self.world.wait_group.add(1);
        park.crash_panic();
        // self.world.debug_print_state();
        self.world.wait_group.wait();
    }

    pub fn deallocate(&self) {
        self.network.lock().clear();
    }
}

/// Network events and timers.
#[derive(Clone, Debug)]
pub enum NodeEvent {
    Accept(TCP),
    Closed(TCP),
    Message((AnyMessage, TCP)),
    Internal(AnyMessage),
    WakeTimeout(u64),
    // TODO: close?
}

#[derive(Debug)]
pub struct SEvent {
    pub time: u64,
    pub node: NodeId,
    pub data: String,
}
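Putting the pieces together, the intended driver loop looks roughly like this. This is a sketch, not part of the diff: the node bodies are placeholders, and the NetworkOptions/Delay fields follow the test in simtest/mod.rs further below.

    // Build a seeded world, launch two nodes, and drive it deterministically.
    let network = Arc::new(NetworkOptions {
        keepalive_timeout: Some(50),
        connect_delay: Delay { min: 1, max: 60, fail_prob: 0.0 },
        send_delay: Delay { min: 1, max: 60, fail_prob: 0.0 },
    });
    let world = Arc::new(World::new(1337, network, None));
    world.register_world();

    let a = world.new_node();
    let b = world.new_node();
    let b_id = b.id;
    a.launch(move |os| {
        // client logic would go here, e.g. let mut sock = os.open_tcp(b_id);
        let _ = (os, b_id);
    });
    b.launch(|os| {
        // server logic would go here, e.g. a loop over os.epoll().recv()
        let _ = os;
    });

    world.await_all();
    // Each step either resumes one randomly chosen parked thread or advances
    // virtual time to the next timer; it returns false when nothing is left to do.
    while world.step() && world.now() < 1_000_000 {}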
48
safekeeper/src/simtest/client.rs
Normal file
@@ -0,0 +1,48 @@
use tracing::info;

use crate::simlib::{
    node_os::NodeOs,
    proto::{AnyMessage, ReplCell},
    world::{NodeEvent, NodeId},
};

/// Copy all data from array to the remote node.
pub fn run_client(os: NodeOs, data: &[ReplCell], dst: NodeId) {
    info!("started client");

    let epoll = os.epoll();
    let mut delivered = 0;

    let mut sock = os.open_tcp(dst);

    while delivered < data.len() {
        let num = &data[delivered];
        info!("sending data: {:?}", num.clone());
        sock.send(AnyMessage::ReplCell(num.clone()));

        // loop {
        let event = epoll.recv();
        match event {
            NodeEvent::Message((AnyMessage::Just32(flush_pos), _)) => {
                if flush_pos == 1 + delivered as u32 {
                    delivered += 1;
                }
            }
            NodeEvent::Closed(_) => {
                info!("connection closed, reestablishing");
                sock = os.open_tcp(dst);
            }
            _ => {}
        }
        // }
    }

    let sock = os.open_tcp(dst);
    for num in data {
        info!("sending data: {:?}", num.clone());
        sock.send(AnyMessage::ReplCell(num.clone()));
    }

    info!("sent all data and finished client");
}
63
safekeeper/src/simtest/disk.rs
Normal file
@@ -0,0 +1,63 @@
use std::sync::Arc;

use anyhow::Result;

use crate::simlib::sync::{Mutex, Park};

pub trait Storage<T> {
    fn flush_pos(&self) -> u32;
    fn flush(&mut self) -> Result<()>;
    fn write(&mut self, t: T);
}

#[derive(Clone)]
pub struct SharedStorage<T> {
    pub state: Arc<Mutex<InMemoryStorage<T>>>,
}

impl<T> SharedStorage<T> {
    pub fn new() -> Self {
        Self {
            state: Arc::new(Mutex::new(InMemoryStorage::new())),
        }
    }
}

impl<T> Storage<T> for SharedStorage<T> {
    fn flush_pos(&self) -> u32 {
        self.state.lock().flush_pos
    }

    fn flush(&mut self) -> Result<()> {
        Park::yield_thread();
        self.state.lock().flush()
    }

    fn write(&mut self, t: T) {
        Park::yield_thread();
        self.state.lock().write(t);
    }
}

pub struct InMemoryStorage<T> {
    pub data: Vec<T>,
    pub flush_pos: u32,
}

impl<T> InMemoryStorage<T> {
    pub fn new() -> Self {
        Self {
            data: Vec::new(),
            flush_pos: 0,
        }
    }

    pub fn flush(&mut self) -> Result<()> {
        self.flush_pos = self.data.len() as u32;
        Ok(())
    }

    pub fn write(&mut self, t: T) {
        self.data.push(t);
    }
}
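The split between write() and flush() is what the server's acknowledgement protocol relies on: only flushed entries count. A standalone illustration using the plain InMemoryStorage (SharedStorage behaves the same, but additionally yields to the scheduler and so must run inside a node thread):

    // Sketch: writes become durable (visible in flush_pos) only after flush().
    let mut storage: InMemoryStorage<u32> = InMemoryStorage::new();
    storage.write(10);
    storage.write(20);
    assert_eq!(storage.flush_pos, 0);     // nothing acknowledged yet
    storage.flush().unwrap();
    assert_eq!(storage.flush_pos, 2);     // both entries are now "on disk"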
109
safekeeper/src/simtest/mod.rs
Normal file
@@ -0,0 +1,109 @@
mod client;
mod disk;
mod server;

use std::sync::Arc;

use crate::{
    simlib::{node_os::NodeOs, proto::ReplCell, world::World},
    simtest::{disk::SharedStorage, server::run_server},
};

#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use crate::simlib::{
        network::{Delay, NetworkOptions},
        world::World,
    };

    use super::{client::run_client, start_simulation, u32_to_cells, Options};

    #[test]
    fn run_pure_rust_test() {
        let delay = Delay {
            min: 1,
            max: 60,
            fail_prob: 0.4,
        };

        let network = NetworkOptions {
            keepalive_timeout: Some(50),
            connect_delay: delay.clone(),
            send_delay: delay.clone(),
        };

        for seed in 0..20 {
            let u32_data: [u32; 5] = [1, 2, 3, 4, 5];
            let data = u32_to_cells(&u32_data, 1);
            let world = Arc::new(World::new(seed, Arc::new(network.clone()), None));

            start_simulation(Options {
                world,
                time_limit: 1_000_000,
                client_fn: Box::new(move |os, server_id| {
                    run_client(os, &data, server_id)
                }),
                u32_data,
            });
        }
    }
}

pub struct Options {
    pub world: Arc<World>,
    pub time_limit: u64,
    pub u32_data: [u32; 5],
    pub client_fn: Box<dyn FnOnce(NodeOs, u32) + Send + 'static>,
}

pub fn start_simulation(options: Options) {
    let world = options.world;
    world.register_world();

    let client_node = world.new_node();
    let server_node = world.new_node();
    let server_id = server_node.id;

    // start the client thread
    client_node.launch(move |os| {
        let client_fn = options.client_fn;
        client_fn(os, server_id);
    });

    // start the server thread
    let shared_storage = SharedStorage::new();
    let server_storage = shared_storage.clone();
    server_node.launch(move |os| run_server(os, Box::new(server_storage)));

    world.await_all();

    while world.step() && world.now() < options.time_limit {}

    let disk_data = shared_storage.state.lock().data.clone();
    assert!(verify_data(&disk_data, &options.u32_data[..]));
}

pub fn u32_to_cells(data: &[u32], client_id: u32) -> Vec<ReplCell> {
    let mut res = Vec::new();
    for i in 0..data.len() {
        res.push(ReplCell {
            client_id,
            seqno: i as u32,
            value: data[i],
        });
    }
    res
}

fn verify_data(disk_data: &[u32], data: &[u32]) -> bool {
    if disk_data.len() != data.len() {
        return false;
    }
    for i in 0..data.len() {
        if disk_data[i] != data[i] {
            return false;
        }
    }
    true
}
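To reuse the harness with a different workload, only client_fn changes. Something along these lines (illustrative only; the seed, delays, and the payload are made up for the example):

    // Sketch: drive the same server with a custom client under a fixed seed.
    let network = Arc::new(NetworkOptions {
        keepalive_timeout: Some(50),
        connect_delay: Delay { min: 1, max: 10, fail_prob: 0.1 },
        send_delay: Delay { min: 1, max: 10, fail_prob: 0.1 },
    });
    let u32_data: [u32; 5] = [5, 4, 3, 2, 1];
    let my_cells = u32_to_cells(&u32_data, 1);

    start_simulation(Options {
        world: Arc::new(World::new(42, network, None)),
        time_limit: 1_000_000,
        client_fn: Box::new(move |os, server_id| run_client(os, &my_cells, server_id)),
        u32_data,
    });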
53
safekeeper/src/simtest/server.rs
Normal file
@@ -0,0 +1,53 @@
use tracing::info;

use crate::simlib::{node_os::NodeOs, proto::AnyMessage, world::NodeEvent};

use super::disk::Storage;

// pub struct DiskLog {
//     pub map: HashMap<String, u32>,
// }

// impl DiskLog {
//     pub fn new() -> Self {
//         Self {
//             map: HashMap::new(),
//         }
//     }

//     pub fn get(&self, key: &str) -> u32 {
//         self.map.get(key).copied().unwrap_or(0)
//     }

//     pub fn set(&mut self, key: &str, value: u32) {
//         self.map.insert(key.to_string(), value);
//     }
// }

pub fn run_server(os: NodeOs, mut storage: Box<dyn Storage<u32>>) {
    info!("started server");

    let epoll = os.epoll();
    loop {
        let event = epoll.recv();
        info!("got event: {:?}", event);
        match event {
            NodeEvent::Message((msg, tcp)) => match msg {
                AnyMessage::ReplCell(cell) => {
                    if cell.seqno != storage.flush_pos() {
                        info!("got out of order data: {:?}", cell);
                        continue;
                    }
                    storage.write(cell.value);
                    storage.flush().unwrap();
                    tcp.send(AnyMessage::Just32(storage.flush_pos()));
                }
                _ => {}
            },
            NodeEvent::Accept(tcp) => {
                tcp.send(AnyMessage::Just32(storage.flush_pos()));
            }
            _ => {}
        }
    }
}
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: 919851e781...298cdce9ab