Mirror of https://github.com/neondatabase/neon.git
Synced 2026-05-17 13:10:38 +00:00

Compare commits: release-pr...sk-tla-mem (68 commits)
Commits (SHA1):
f5e59cf51b 691c290280 db343caf5d 6b7af160bf a9662b6f64 7a580627fc 176fe2cade c846389812
cb1ebedc9f 08b19dd6aa ced1903267 9584317564 4c7bdfa70a 59cb648457 8c880d088b c940f196ce
d0b4b3e64a 617a8711be da5b71b5c1 6b62b7633b 07872b310c 90aa12c3d8 02dc3b2ba2 9ab6a89b5c
79137382e7 e87d0813d1 ec7c8814f4 31c3eb7628 a2c67361b0 0d057d1374 2917e49391 91357b05e8
765adaf16c 50a23d5a14 1c30e6a61a 42a9ef3645 83b8e5c117 5912932de8 9aa29712d3 443a6fdfdb
664569ecdb a9ced3573a f7b9fc1c81 979f925949 13fd695e3f 2d0c22d77d 9cde4ab0a7 c3eecf6763
6fa9b0cd8c 10bc1903e1 261d065e6f b6154b03f4 8880134171 de7e4a34ca ac689ab014 23eabb9919
2af791ba83 e12628fe93 7880c246f1 04938d9d55 19f7d40c1d 38563de7dd 93939f123f 49b599c113
8cde37bc0b f70611c8df 21282aa113 d06bf4b0fe
.github/workflows/benchmarking.yml (vendored) — 12 lines changed
@@ -545,12 +545,12 @@ jobs:
           arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')

           cd /home/nonroot
-          wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.0-1.pgdg110+1_${arch}.deb"
-          wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb"
-          wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110+2_${arch}.deb"
-          dpkg -x libpq5_17.0-1.pgdg110+1_${arch}.deb pg
-          dpkg -x postgresql-16_16.4-1.pgdg110+2_${arch}.deb pg
-          dpkg -x postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb pg
+          wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.1-1.pgdg110+1_${arch}.deb"
+          wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.5-1.pgdg110+1_${arch}.deb"
+          wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.5-1.pgdg110+1_${arch}.deb"
+          dpkg -x libpq5_17.1-1.pgdg110+1_${arch}.deb pg
+          dpkg -x postgresql-16_16.5-1.pgdg110+1_${arch}.deb pg
+          dpkg -x postgresql-client-16_16.5-1.pgdg110+1_${arch}.deb pg

           mkdir -p /tmp/neon/pg_install/v16/bin
           ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
Cargo.lock (generated) — 24 lines changed
@@ -4009,7 +4009,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -4022,7 +4022,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
 dependencies = [
  "base64 0.20.0",
  "byteorder",
@@ -4041,7 +4041,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -5663,9 +5663,9 @@ dependencies = [

 [[package]]
 name = "smallvec"
-version = "1.13.1"
+version = "1.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7"
+checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"

 [[package]]
 name = "smol_str"
@@ -6074,9 +6074,9 @@ dependencies = [

 [[package]]
 name = "tikv-jemalloc-ctl"
-version = "0.5.4"
+version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "619bfed27d807b54f7f776b9430d4f8060e66ee138a28632ca898584d462c31c"
+checksum = "f21f216790c8df74ce3ab25b534e0718da5a1916719771d3fec23315c99e468b"
 dependencies = [
  "libc",
  "paste",
@@ -6085,9 +6085,9 @@ dependencies = [

 [[package]]
 name = "tikv-jemalloc-sys"
-version = "0.5.4+5.3.0-patched"
+version = "0.6.0+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1"
+checksum = "cd3c60906412afa9c2b5b5a48ca6a5abe5736aec9eb48ad05037a677e52e4e2d"
 dependencies = [
  "cc",
  "libc",
@@ -6095,9 +6095,9 @@ dependencies = [

 [[package]]
 name = "tikv-jemallocator"
-version = "0.5.4"
+version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca"
+checksum = "4cec5ff18518d81584f477e9bfdf957f5bb0979b0bac3af4ca30b5b3ae2d2865"
 dependencies = [
  "libc",
  "tikv-jemalloc-sys",
@@ -6227,7 +6227,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
 dependencies = [
  "async-trait",
  "byteorder",
Cargo.toml — 25 lines changed
@@ -168,8 +168,8 @@ sync_wrapper = "0.1.2"
 tar = "0.4"
 test-context = "0.3"
 thiserror = "1.0"
-tikv-jemallocator = "0.5"
-tikv-jemalloc-ctl = "0.5"
+tikv-jemallocator = { version = "0.6", features = ["stats"] }
+tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
 tokio = { version = "1.17", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
@@ -203,21 +203,10 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed

-# We want to use the 'neon' branch for these, but there's currently one
-# incompatible change on the branch. See:
-#
-# - PR #8076 which contained changes that depended on the new changes in
-#   the rust-postgres crate, and
-# - PR #8654 which reverted those changes and made the code in proxy incompatible
-#   with the tip of the 'neon' branch again.
-#
-# When those proxy changes are re-applied (see PR #8747), we can switch using
-# the tip of the 'neon' branch again.
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }

 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
@@ -255,7 +244,7 @@ tonic-build = "0.12"
 [patch.crates-io]

 # Needed to get `tokio-postgres-rustls` to depend on our fork.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }

 ################# Binary contents sections

@@ -6,6 +6,7 @@
 import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
 import 'sql_exporter/compute_current_lsn.libsonnet',
 import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
+import 'sql_exporter/compute_max_connections.libsonnet',
 import 'sql_exporter/compute_receive_lsn.libsonnet',
 import 'sql_exporter/compute_subscriptions_count.libsonnet',
 import 'sql_exporter/connection_counts.libsonnet',
compute/etc/sql_exporter/compute_max_connections.libsonnet (new file) — 10 lines
@@ -0,0 +1,10 @@
+{
+  metric_name: 'compute_max_connections',
+  type: 'gauge',
+  help: 'Max connections allowed for Postgres',
+  key_labels: null,
+  values: [
+    'max_connections',
+  ],
+  query: importstr 'sql_exporter/compute_max_connections.sql',
+}
compute/etc/sql_exporter/compute_max_connections.sql (new file) — 1 line
@@ -0,0 +1 @@
+SELECT current_setting('max_connections') as max_connections;
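The new metric simply surfaces the max_connections GUC as a gauge. For a sense of what the exporter's query returns, here is a minimal, hypothetical check using the tokio-postgres fork pinned above (the connection string and error handling are illustrative, not taken from the repo):

    use tokio_postgres::NoTls;

    #[tokio::main]
    async fn main() -> Result<(), tokio_postgres::Error> {
        // Illustrative connection string; adjust host/user for your setup.
        let (client, connection) =
            tokio_postgres::connect("host=localhost user=postgres", NoTls).await?;
        // The connection object drives the socket and must be polled.
        tokio::spawn(connection);

        // Same query as the new compute_max_connections.sql file.
        let row = client
            .query_one("SELECT current_setting('max_connections') AS max_connections", &[])
            .await?;
        // current_setting() returns text; the exporter exposes it as a gauge.
        let max_connections: &str = row.get("max_connections");
        println!("max_connections = {max_connections}");
        Ok(())
    }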
@@ -147,7 +147,7 @@ index 542c2e098c..0062d3024f 100644
  ALTER TABLE ptnowner1 OWNER TO regress_ptnowner;
  ALTER TABLE ptnowner OWNER TO regress_ptnowner;
 diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
-index 97bbe53b64..eac3d42a79 100644
+index 3f9a8f539c..0a51b52940 100644
 --- a/src/test/regress/expected/collate.icu.utf8.out
 +++ b/src/test/regress/expected/collate.icu.utf8.out
 @@ -1016,7 +1016,7 @@ select * from collate_test1 where b ilike 'ABC';
@@ -309,7 +309,7 @@ index b48365ec98..a6ef910055 100644
  -- the wrong partition. This test is *not* guaranteed to trigger that bug, but
  -- does so when shared_buffers is small enough. To test if we encountered the
 diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out
-index faf1a4d1b0..a44c97db52 100644
+index 9a74820ee8..22400a5551 100644
 --- a/src/test/regress/expected/copy2.out
 +++ b/src/test/regress/expected/copy2.out
 @@ -553,8 +553,8 @@ select * from check_con_tbl;
@@ -573,7 +573,7 @@ index 93302a07ef..1a73f083ac 100644
  -- that does not match with what's expected.
  -- This checks all the object types that include schema qualifications.
 diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out
-index f3f8c7b5a2..3e3e54ff4c 100644
+index f551624afb..57f1e432d4 100644
 --- a/src/test/regress/expected/create_view.out
 +++ b/src/test/regress/expected/create_view.out
 @@ -18,7 +18,8 @@ CREATE TABLE real_city (
@@ -700,12 +700,12 @@ index 6ed50fdcfa..caa00a345d 100644
  COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
  CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator;
 diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out
-index 12e523c737..8872e23935 100644
+index 6b8c2f2414..8e13b7fa46 100644
 --- a/src/test/regress/expected/foreign_key.out
 +++ b/src/test/regress/expected/foreign_key.out
 @@ -1968,7 +1968,7 @@ ALTER TABLE fk_partitioned_fk ATTACH PARTITION fk_partitioned_fk_2
      FOR VALUES IN (1600);
  -- leave these tables around intentionally
 @@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
  ERROR:  cannot ALTER TABLE "fk_partitioned_pk_61" because it is being used by active queries in this session
  DROP TABLE fk_partitioned_pk_6, fk_partitioned_fk_6;
  -- test the case when the referenced table is owned by a different user
 -create role regress_other_partitioned_fk_owner;
 +create role regress_other_partitioned_fk_owner PASSWORD NEON_PASSWORD_PLACEHOLDER;
@@ -713,7 +713,7 @@ index 12e523c737..8872e23935 100644
  set role regress_other_partitioned_fk_owner;
  create table other_partitioned_fk(a int, b int) partition by list (a);
 diff --git a/src/test/regress/expected/generated.out b/src/test/regress/expected/generated.out
-index 0f623f7119..b48588a54e 100644
+index 5881420388..4ae21aa43c 100644
 --- a/src/test/regress/expected/generated.out
 +++ b/src/test/regress/expected/generated.out
 @@ -534,7 +534,7 @@ CREATE TABLE gtest10a (a int PRIMARY KEY, b int GENERATED ALWAYS AS (a * 2) STOR
@@ -762,7 +762,7 @@ index a2036a1597..805d73b9d2 100644
  -- fields, leading to long bucket chains and lots of table expansion.
  -- this is therefore a stress test of the bucket overflow code (unlike
 diff --git a/src/test/regress/expected/identity.out b/src/test/regress/expected/identity.out
-index cc7772349f..98a08eb48d 100644
+index 1b74958de9..078187b542 100644
 --- a/src/test/regress/expected/identity.out
 +++ b/src/test/regress/expected/identity.out
 @@ -520,7 +520,7 @@ ALTER TABLE itest7 ALTER COLUMN a SET GENERATED BY DEFAULT;
@@ -775,10 +775,10 @@ index cc7772349f..98a08eb48d 100644
  GRANT SELECT, INSERT ON itest8 TO regress_identity_user1;
  SET ROLE regress_identity_user1;
 diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out
-index 4943429e9b..0257f22b15 100644
+index 8f831c95c3..ec681b52af 100644
 --- a/src/test/regress/expected/inherit.out
 +++ b/src/test/regress/expected/inherit.out
-@@ -2606,7 +2606,7 @@ create index on permtest_parent (left(c, 3));
+@@ -2636,7 +2636,7 @@ create index on permtest_parent (left(c, 3));
  insert into permtest_parent
    select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i;
  analyze permtest_parent;
@@ -1133,7 +1133,7 @@ index 8475231735..1afae5395f 100644
  SELECT rolname, rolpassword
      FROM pg_authid
 diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
-index fbb0489a4f..2905194e2c 100644
+index 5b9dba7b32..cc408dad42 100644
 --- a/src/test/regress/expected/privileges.out
 +++ b/src/test/regress/expected/privileges.out
 @@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3
@@ -1185,7 +1185,7 @@ index fbb0489a4f..2905194e2c 100644
  GRANT pg_read_all_data TO regress_priv_user6;
  GRANT pg_write_all_data TO regress_priv_user7;
  GRANT pg_read_all_settings TO regress_priv_user8 WITH ADMIN OPTION;
-@@ -145,8 +145,8 @@ REVOKE pg_read_all_settings FROM regress_priv_user8;
+@@ -212,8 +212,8 @@ REVOKE pg_read_all_settings FROM regress_priv_user8;
  DROP USER regress_priv_user10;
  DROP USER regress_priv_user9;
  DROP USER regress_priv_user8;
@@ -1196,7 +1196,7 @@ index fbb0489a4f..2905194e2c 100644
  ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
  GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1;
  SET SESSION AUTHORIZATION regress_priv_user1;
-@@ -172,12 +172,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
+@@ -239,12 +239,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
  ERROR:  permission denied to grant privileges as role "regress_priv_role"
  DETAIL:  The grantor must have the ADMIN option on role "regress_priv_role".
  GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE;
@@ -1213,7 +1213,7 @@ index fbb0489a4f..2905194e2c 100644
  DROP ROLE regress_priv_role;
  SET SESSION AUTHORIZATION regress_priv_user1;
  SELECT session_user, current_user;
-@@ -1709,7 +1713,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+@@ -1776,7 +1780,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP

  -- security-restricted operations
  \c -
@@ -1222,7 +1222,7 @@ index fbb0489a4f..2905194e2c 100644
  -- Check that index expressions and predicates are run as the table's owner
  -- A dummy index function checking current_user
  CREATE FUNCTION sro_ifun(int) RETURNS int AS $$
-@@ -2601,8 +2605,8 @@ drop cascades to function testns.priv_testagg(integer)
+@@ -2668,8 +2672,8 @@ drop cascades to function testns.priv_testagg(integer)
  drop cascades to function testns.priv_testproc(integer)
  -- Change owner of the schema & and rename of new schema owner
  \c -
@@ -1233,7 +1233,7 @@ index fbb0489a4f..2905194e2c 100644
  SET SESSION ROLE regress_schemauser1;
  CREATE SCHEMA testns;
  SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid;
-@@ -2725,7 +2729,7 @@ DROP USER regress_priv_user7;
+@@ -2792,7 +2796,7 @@ DROP USER regress_priv_user7;
  DROP USER regress_priv_user8; -- does not exist
  ERROR:  role "regress_priv_user8" does not exist
  -- permissions with LOCK TABLE
@@ -1242,7 +1242,7 @@ index fbb0489a4f..2905194e2c 100644
  CREATE TABLE lock_table (a int);
  -- LOCK TABLE and SELECT permission
  GRANT SELECT ON lock_table TO regress_locktable_user;
-@@ -2807,7 +2811,7 @@ DROP USER regress_locktable_user;
+@@ -2874,7 +2878,7 @@ DROP USER regress_locktable_user;
  -- pg_backend_memory_contexts.
  -- switch to superuser
  \c -
@@ -1251,7 +1251,7 @@ index fbb0489a4f..2905194e2c 100644
  SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
   has_table_privilege
  ---------------------
-@@ -2851,10 +2855,10 @@ RESET ROLE;
+@@ -2918,10 +2922,10 @@ RESET ROLE;
  -- clean up
  DROP ROLE regress_readallstats;
  -- test role grantor machinery
@@ -1266,7 +1266,7 @@ index fbb0489a4f..2905194e2c 100644
  GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
  GRANT regress_group_direct_manager TO regress_group_indirect_manager;
  SET SESSION AUTHORIZATION regress_group_direct_manager;
-@@ -2883,9 +2887,9 @@ DROP ROLE regress_group_direct_manager;
+@@ -2950,9 +2954,9 @@ DROP ROLE regress_group_direct_manager;
  DROP ROLE regress_group_indirect_manager;
  DROP ROLE regress_group_member;
  -- test SET and INHERIT options with object ownership changes
@@ -1813,7 +1813,7 @@ index 5e6969b173..2c4d52237f 100644

  -- clean up roles
 diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out
-index 97ca9bf72c..b2a7a6f710 100644
+index 218c0c2863..f7af0cfb12 100644
 --- a/src/test/regress/expected/rowsecurity.out
 +++ b/src/test/regress/expected/rowsecurity.out
 @@ -14,13 +14,13 @@ DROP ROLE IF EXISTS regress_rls_group2;
@@ -1917,6 +1917,19 @@ index b79fe9a1c0..e29fab88ab 100644
  ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user
    REVOKE INSERT ON TABLES FROM regress_selinto_user;
  GRANT ALL ON SCHEMA selinto_schema TO public;
+diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out
+index afc6ab08c2..dfcd891af3 100644
+--- a/src/test/regress/expected/select_parallel.out
++++ b/src/test/regress/expected/select_parallel.out
+@@ -1220,7 +1220,7 @@ SELECT 1 FROM tenk1_vw_sec
+
+ rollback;
+ -- test that function option SET ROLE works in parallel workers.
+-create role regress_parallel_worker;
++create role regress_parallel_worker PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ create function set_and_report_role() returns text as
+ $$ select current_setting('role') $$ language sql parallel safe
+ set role = regress_parallel_worker;
 diff --git a/src/test/regress/expected/select_views.out b/src/test/regress/expected/select_views.out
 index 1aeed8452b..7d9427d070 100644
 --- a/src/test/regress/expected/select_views.out
@@ -2369,7 +2382,7 @@ index 6cb9c926c0..5e689e4062 100644
  ALTER TABLE ptnowner1 OWNER TO regress_ptnowner;
  ALTER TABLE ptnowner OWNER TO regress_ptnowner;
 diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
-index 3db9e25913..c66d5aa2c2 100644
+index 8aa902d5ab..24bb823b86 100644
 --- a/src/test/regress/sql/collate.icu.utf8.sql
 +++ b/src/test/regress/sql/collate.icu.utf8.sql
 @@ -353,7 +353,7 @@ reset enable_seqscan;
@@ -2532,7 +2545,7 @@ index 43d2e906dd..6c993d70f0 100644
  -- An earlier bug (see commit b1ecb9b3fcf) could end up using a buffer from
  -- the wrong partition. This test is *not* guaranteed to trigger that bug, but
 diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql
-index d759635068..d58e50dcc5 100644
+index cf3828c16e..cf3ca38175 100644
 --- a/src/test/regress/sql/copy2.sql
 +++ b/src/test/regress/sql/copy2.sql
 @@ -365,8 +365,8 @@ copy check_con_tbl from stdin;
@@ -2774,7 +2787,7 @@ index 1b7064247a..be5b662ce1 100644
  -- Cases where schema creation fails as objects are qualified with a schema
  -- that does not match with what's expected.
 diff --git a/src/test/regress/sql/create_view.sql b/src/test/regress/sql/create_view.sql
-index 3a78be1b0c..617d2dc8d6 100644
+index ae6841308b..47bc792e30 100644
 --- a/src/test/regress/sql/create_view.sql
 +++ b/src/test/regress/sql/create_view.sql
 @@ -23,7 +23,8 @@ CREATE TABLE real_city (
@@ -2901,11 +2914,11 @@ index aa147b14a9..370e0dd570 100644
  CREATE FOREIGN DATA WRAPPER dummy;
  COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
 diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql
-index 22e177f89b..7138d5e1d4 100644
+index 45c7a534cb..32dd26b8cd 100644
 --- a/src/test/regress/sql/foreign_key.sql
 +++ b/src/test/regress/sql/foreign_key.sql
 @@ -1418,7 +1418,7 @@ ALTER TABLE fk_partitioned_fk ATTACH PARTITION fk_partitioned_fk_2
  -- leave these tables around intentionally
 @@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
  DROP TABLE fk_partitioned_pk_6, fk_partitioned_fk_6;

  -- test the case when the referenced table is owned by a different user
 -create role regress_other_partitioned_fk_owner;
@@ -2963,7 +2976,7 @@ index 527024f710..de49c0b85f 100644
  -- the data in this file has a lot of duplicates in the index key
  -- fields, leading to long bucket chains and lots of table expansion.
 diff --git a/src/test/regress/sql/identity.sql b/src/test/regress/sql/identity.sql
-index 91d2e443b4..241c93f373 100644
+index 7537258a75..9041e35e34 100644
 --- a/src/test/regress/sql/identity.sql
 +++ b/src/test/regress/sql/identity.sql
 @@ -287,7 +287,7 @@ ALTER TABLE itest7 ALTER COLUMN a RESTART;
@@ -2976,10 +2989,10 @@ index 91d2e443b4..241c93f373 100644
  GRANT SELECT, INSERT ON itest8 TO regress_identity_user1;
  SET ROLE regress_identity_user1;
 diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql
-index fe699c54d5..bdd5993f45 100644
+index b5b554a125..109889ad24 100644
 --- a/src/test/regress/sql/inherit.sql
 +++ b/src/test/regress/sql/inherit.sql
-@@ -950,7 +950,7 @@ create index on permtest_parent (left(c, 3));
+@@ -958,7 +958,7 @@ create index on permtest_parent (left(c, 3));
  insert into permtest_parent
    select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i;
  analyze permtest_parent;
@@ -3218,7 +3231,7 @@ index 53e86b0b6c..f07cf1ec54 100644
  CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';

 diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
-index 3f68cafcd1..004b26831d 100644
+index 249df17a58..b258e7f26a 100644
 --- a/src/test/regress/sql/privileges.sql
 +++ b/src/test/regress/sql/privileges.sql
 @@ -24,18 +24,18 @@ RESET client_min_messages;
@@ -3269,7 +3282,7 @@ index 3f68cafcd1..004b26831d 100644

  GRANT pg_read_all_data TO regress_priv_user6;
  GRANT pg_write_all_data TO regress_priv_user7;
-@@ -130,8 +130,8 @@ DROP USER regress_priv_user10;
+@@ -163,8 +163,8 @@ DROP USER regress_priv_user10;
  DROP USER regress_priv_user9;
  DROP USER regress_priv_user8;

@@ -3280,7 +3293,7 @@ index 3f68cafcd1..004b26831d 100644

  ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;

-@@ -1124,7 +1124,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+@@ -1157,7 +1157,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP

  -- security-restricted operations
  \c -
@@ -3289,7 +3302,7 @@ index 3f68cafcd1..004b26831d 100644

  -- Check that index expressions and predicates are run as the table's owner

-@@ -1620,8 +1620,8 @@ DROP SCHEMA testns CASCADE;
+@@ -1653,8 +1653,8 @@ DROP SCHEMA testns CASCADE;
  -- Change owner of the schema & and rename of new schema owner
  \c -

@@ -3300,7 +3313,7 @@ index 3f68cafcd1..004b26831d 100644

  SET SESSION ROLE regress_schemauser1;
  CREATE SCHEMA testns;
-@@ -1715,7 +1715,7 @@ DROP USER regress_priv_user8; -- does not exist
+@@ -1748,7 +1748,7 @@ DROP USER regress_priv_user8; -- does not exist


  -- permissions with LOCK TABLE
@@ -3309,7 +3322,7 @@ index 3f68cafcd1..004b26831d 100644
  CREATE TABLE lock_table (a int);

  -- LOCK TABLE and SELECT permission
-@@ -1803,7 +1803,7 @@ DROP USER regress_locktable_user;
+@@ -1836,7 +1836,7 @@ DROP USER regress_locktable_user;
  -- switch to superuser
  \c -

@@ -3318,7 +3331,7 @@ index 3f68cafcd1..004b26831d 100644

  SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
  SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
-@@ -1823,10 +1823,10 @@ RESET ROLE;
+@@ -1856,10 +1856,10 @@ RESET ROLE;
  DROP ROLE regress_readallstats;

  -- test role grantor machinery
@@ -3333,7 +3346,7 @@ index 3f68cafcd1..004b26831d 100644

  GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
  GRANT regress_group_direct_manager TO regress_group_indirect_manager;
-@@ -1848,9 +1848,9 @@ DROP ROLE regress_group_indirect_manager;
+@@ -1881,9 +1881,9 @@ DROP ROLE regress_group_indirect_manager;
  DROP ROLE regress_group_member;

  -- test SET and INHERIT options with object ownership changes
@@ -3625,7 +3638,7 @@ index c961b2d730..0859b89c4f 100644
  -- clean up roles
  DROP ROLE regress_test_def_superuser;
 diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql
-index dec7340538..cdbc03a5cc 100644
+index d3bfd53e23..919ce1d0c6 100644
 --- a/src/test/regress/sql/rowsecurity.sql
 +++ b/src/test/regress/sql/rowsecurity.sql
 @@ -20,13 +20,13 @@ DROP SCHEMA IF EXISTS regress_rls_schema CASCADE;
@@ -3701,6 +3714,19 @@ index 689c448cc2..223ceb1d75 100644
  ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user
    REVOKE INSERT ON TABLES FROM regress_selinto_user;
  GRANT ALL ON SCHEMA selinto_schema TO public;
+diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql
+index 33d78e16dc..cb193c9b27 100644
+--- a/src/test/regress/sql/select_parallel.sql
++++ b/src/test/regress/sql/select_parallel.sql
+@@ -464,7 +464,7 @@ SELECT 1 FROM tenk1_vw_sec
+ rollback;
+
+ -- test that function option SET ROLE works in parallel workers.
+-create role regress_parallel_worker;
++create role regress_parallel_worker PASSWORD NEON_PASSWORD_PLACEHOLDER;
+
+ create function set_and_report_role() returns text as
+ $$ select current_setting('role') $$ language sql parallel safe
 diff --git a/src/test/regress/sql/select_views.sql b/src/test/regress/sql/select_views.sql
 index e742f13699..7bd0255df8 100644
 --- a/src/test/regress/sql/select_views.sql
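A recurring change throughout this patch is appending PASSWORD NEON_PASSWORD_PLACEHOLDER to create role statements, since Neon requires roles to carry passwords. The literal placeholder presumably gets rewritten before the tests run; a hypothetical sketch of that substitution (the function and quoting are assumptions, not the actual harness code):

    /// Hypothetical stand-in for the test harness's placeholder rewrite.
    fn substitute_password(sql: &str, password: &str) -> String {
        // Postgres expects a quoted string literal after PASSWORD.
        sql.replace("NEON_PASSWORD_PLACEHOLDER", &format!("'{password}'"))
    }

    fn main() {
        let line = "create role regress_parallel_worker PASSWORD NEON_PASSWORD_PLACEHOLDER;";
        assert_eq!(
            substitute_password(line, "secret"),
            "create role regress_parallel_worker PASSWORD 'secret';"
        );
    }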
@@ -37,6 +37,7 @@ allow = [
     "BSD-2-Clause",
     "BSD-3-Clause",
     "CC0-1.0",
+    "CDDL-1.0",
     "ISC",
     "MIT",
     "MPL-2.0",
@@ -24,7 +24,7 @@ pub struct Key {

 /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
 /// a struct of fields.
-#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)]
+#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
 pub struct CompactKey(i128);

 /// The storage key size.
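Adding Serialize/Deserialize to CompactKey means the compact i128 form can cross process boundaries directly. A minimal round-trip sketch (serde_json is used purely for illustration; the wire format the pageserver actually uses may differ):

    use serde::{Deserialize, Serialize};

    // Same shape as the CompactKey above: a newtype over i128.
    #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
    struct CompactKey(i128);

    fn main() -> Result<(), serde_json::Error> {
        let key = CompactKey(0x0123_4567_89ab_cdef_i128);
        // A serde newtype struct serializes as the bare inner number.
        let encoded = serde_json::to_string(&key)?;
        let decoded: CompactKey = serde_json::from_str(&encoded)?;
        assert_eq!(key, decoded);
        Ok(())
    }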
@@ -41,6 +41,11 @@ pub enum NeonWalRecord {
         file_path: String,
         content: Option<Bytes>,
     },
+    // Truncate visibility map page
+    TruncateVisibilityMap {
+        trunc_byte: usize,
+        trunc_offs: usize,
+    },

     /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it.
     #[cfg(feature = "testing")]
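The two new fields describe where the visibility map gets cut off within a page. A hypothetical sketch of how a heap block number could map to them, assuming 2 VM bits per heap block as in Postgres' visibilitymap.c (this is illustrative, not the pageserver's redo code):

    // Hypothetical illustration: locating the truncation point in a VM page.
    // Assumes 2 visibility bits per heap block (4 heap blocks per VM byte).
    const VM_BITS_PER_HEAPBLOCK: usize = 2;
    const VM_HEAPBLOCKS_PER_BYTE: usize = 8 / VM_BITS_PER_HEAPBLOCK;

    fn vm_truncation_point(heap_blk_within_page: usize) -> (usize, usize) {
        // First byte of the VM page that contains bits to clear.
        let trunc_byte = heap_blk_within_page / VM_HEAPBLOCKS_PER_BYTE;
        // Offset of the first cleared block's bits within that byte.
        let trunc_offs = heap_blk_within_page % VM_HEAPBLOCKS_PER_BYTE;
        (trunc_byte, trunc_offs)
    }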
@@ -24,7 +24,7 @@ use postgres_ffi::Oid;
 // FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
 // Then we could replace the custom Ord and PartialOrd implementations below with
 // deriving them. This will require changes in walredoproc.c.
-#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
 pub struct RelTag {
     pub forknum: u8,
     pub spcnode: Oid,
@@ -243,8 +243,11 @@ const FSM_LEAF_NODES_PER_PAGE: usize = FSM_NODES_PER_PAGE - FSM_NON_LEAF_NODES_P
 pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32;

 /* From visibilitymap.c */
-pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
-    (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+pub const VM_MAPSIZE: usize = BLCKSZ as usize - MAXALIGN_SIZE_OF_PAGE_HEADER_DATA;
+pub const VM_BITS_PER_HEAPBLOCK: usize = 2;
+pub const VM_HEAPBLOCKS_PER_BYTE: usize = 8 / VM_BITS_PER_HEAPBLOCK;
+pub const VM_HEAPBLOCKS_PER_PAGE: usize = VM_MAPSIZE * VM_HEAPBLOCKS_PER_BYTE;

 /* From origin.c */
 pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;
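A quick sanity check of the arithmetic for the standard 8 KiB page; the 24-byte maxaligned page header is an assumption based on stock Postgres builds:

    // Illustrative arithmetic only, assuming BLCKSZ = 8192 and a 24-byte
    // maxaligned page header, as in default Postgres configurations.
    const BLCKSZ: usize = 8192;
    const MAXALIGN_SIZE_OF_PAGE_HEADER_DATA: usize = 24;

    const VM_MAPSIZE: usize = BLCKSZ - MAXALIGN_SIZE_OF_PAGE_HEADER_DATA; // 8168 map bytes
    const VM_BITS_PER_HEAPBLOCK: usize = 2; // all-visible + all-frozen bits
    const VM_HEAPBLOCKS_PER_BYTE: usize = 8 / VM_BITS_PER_HEAPBLOCK; // 4
    const VM_HEAPBLOCKS_PER_PAGE: usize = VM_MAPSIZE * VM_HEAPBLOCKS_PER_BYTE; // 32672

    fn main() {
        assert_eq!(VM_HEAPBLOCKS_PER_PAGE, 32672);
        println!("one VM page covers {VM_HEAPBLOCKS_PER_PAGE} heap blocks");
    }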
@@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError;
 use utils::lsn::Lsn;

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlMultiXactCreate {
     pub mid: MultiXactId,
     /* new MultiXact's ID */
@@ -46,7 +46,7 @@ impl XlMultiXactCreate {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlMultiXactTruncate {
     pub oldest_multi_db: Oid,
     /* to-be-truncated range of multixact offsets */
@@ -72,7 +72,7 @@ impl XlMultiXactTruncate {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlRelmapUpdate {
     pub dbid: Oid, /* database ID, or 0 for shared map */
     pub tsid: Oid, /* database's tablespace, or pg_global */
@@ -90,7 +90,7 @@ impl XlRelmapUpdate {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlReploriginDrop {
     pub node_id: RepOriginId,
 }
@@ -104,7 +104,7 @@ impl XlReploriginDrop {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlReploriginSet {
     pub remote_lsn: Lsn,
     pub node_id: RepOriginId,
@@ -120,7 +120,7 @@ impl XlReploriginSet {
 }

 #[repr(C)]
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub struct RelFileNode {
     pub spcnode: Oid, /* tablespace */
     pub dbnode: Oid, /* database */
@@ -911,7 +911,7 @@ impl XlSmgrCreate {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlSmgrTruncate {
     pub blkno: BlockNumber,
     pub rnode: RelFileNode,
@@ -984,7 +984,7 @@ impl XlDropDatabase {
 /// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
 /// struct for commits and aborts.
 ///
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlXactParsedRecord {
     pub xid: TransactionId,
     pub info: u8,
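These derives let the FFI record structs travel inside the serialized InterpretedWalRecord. A sketch of a compact binary round trip using the bincode crate; this is an assumption for illustration, since the repo imports its own serializer (utils::bin_ser) and the field types are simplified here:

    use serde::{Deserialize, Serialize};

    // Stand-in mirroring XlReploriginSet above; RepOriginId is a 16-bit id
    // in Postgres and Lsn is a u64 newtype in the real code.
    #[derive(Debug, PartialEq, Serialize, Deserialize)]
    struct XlReploriginSet {
        remote_lsn: u64,
        node_id: u16,
    }

    fn main() -> Result<(), Box<bincode::ErrorKind>> {
        let rec = XlReploriginSet { remote_lsn: 0x016B_9188, node_id: 3 };
        let bytes = bincode::serialize(&rec)?; // compact fixed-width encoding
        let back: XlReploriginSet = bincode::deserialize(&bytes)?;
        assert_eq!(rec, back);
        Ok(())
    }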
@@ -50,8 +50,8 @@ REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"|
 declare -i WAL_SIZE=$REDO_POS+114
 "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
 "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
-cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
+cp "$DATA_DIR"/pg_wal/000000010000000000000001 "$DATA_DIR"
 cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
 for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
-dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
-rm -f 000000010000000000000001
+dd if="$DATA_DIR"/000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
+rm -f "$DATA_DIR"/000000010000000000000001
@@ -14,8 +14,8 @@ REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"|
 declare -i WAL_SIZE=$REDO_POS+114
 "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
 "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
-cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
+cp "$DATA_DIR"/pg_wal/000000010000000000000001 "$DATA_DIR"
 cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
 for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
-dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
-rm -f 000000010000000000000001
+dd if="$DATA_DIR"/000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
+rm -f "$DATA_DIR"/000000010000000000000001
@@ -19,7 +19,7 @@ impl InterpretedWalRecord {
     pub fn from_bytes_filtered(
         buf: Bytes,
         shard: &ShardIdentity,
-        record_end_lsn: Lsn,
+        next_record_lsn: Lsn,
         pg_version: u32,
     ) -> anyhow::Result<InterpretedWalRecord> {
         let mut decoded = DecodedWALRecord::default();
@@ -32,18 +32,18 @@ impl InterpretedWalRecord {
             FlushUncommittedRecords::No
         };

-        let metadata_record = MetadataRecord::from_decoded(&decoded, record_end_lsn, pg_version)?;
+        let metadata_record = MetadataRecord::from_decoded(&decoded, next_record_lsn, pg_version)?;
         let batch = SerializedValueBatch::from_decoded_filtered(
             decoded,
             shard,
-            record_end_lsn,
+            next_record_lsn,
             pg_version,
         )?;

         Ok(InterpretedWalRecord {
             metadata_record,
             batch,
-            end_lsn: record_end_lsn,
+            next_record_lsn,
             flush_uncommitted,
             xid,
         })
@@ -53,7 +53,7 @@ impl InterpretedWalRecord {
 impl MetadataRecord {
     fn from_decoded(
         decoded: &DecodedWALRecord,
-        record_end_lsn: Lsn,
+        next_record_lsn: Lsn,
         pg_version: u32,
     ) -> anyhow::Result<Option<MetadataRecord>> {
         // Note: this doesn't actually copy the bytes since
@@ -74,7 +74,9 @@ impl MetadataRecord {
                 Ok(None)
             }
             pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version),
-            pg_constants::RM_XACT_ID => Self::decode_xact_record(&mut buf, decoded, record_end_lsn),
+            pg_constants::RM_XACT_ID => {
+                Self::decode_xact_record(&mut buf, decoded, next_record_lsn)
+            }
             pg_constants::RM_MULTIXACT_ID => {
                 Self::decode_multixact_record(&mut buf, decoded, pg_version)
             }
@@ -86,7 +88,9 @@ impl MetadataRecord {
             //
             // Alternatively, one can make the checkpoint part of the subscription protocol
             // to the pageserver. This should work fine, but can be done at a later point.
-            pg_constants::RM_XLOG_ID => Self::decode_xlog_record(&mut buf, decoded, record_end_lsn),
+            pg_constants::RM_XLOG_ID => {
+                Self::decode_xlog_record(&mut buf, decoded, next_record_lsn)
+            }
             pg_constants::RM_LOGICALMSG_ID => {
                 Self::decode_logical_message_record(&mut buf, decoded)
             }
@@ -32,16 +32,19 @@ use postgres_ffi::walrecord::{
     XlSmgrTruncate, XlXactParsedRecord,
 };
 use postgres_ffi::{Oid, TransactionId};
+use serde::{Deserialize, Serialize};
 use utils::lsn::Lsn;

 use crate::serialized_batch::SerializedValueBatch;

+#[derive(Serialize, Deserialize)]
 pub enum FlushUncommittedRecords {
     Yes,
     No,
 }

 /// An interpreted Postgres WAL record, ready to be handled by the pageserver
+#[derive(Serialize, Deserialize)]
 pub struct InterpretedWalRecord {
     /// Optional metadata record - may cause writes to metadata keys
     /// in the storage engine
@@ -49,8 +52,10 @@ pub struct InterpretedWalRecord {
     /// A pre-serialized batch along with the required metadata for ingestion
     /// by the pageserver
     pub batch: SerializedValueBatch,
-    /// Byte offset within WAL for the end of the original PG WAL record
-    pub end_lsn: Lsn,
+    /// Byte offset within WAL for the start of the next PG WAL record.
+    /// Usually this is the end LSN of the current record, but in case of
+    /// XLOG SWITCH records it will be within the next segment.
+    pub next_record_lsn: Lsn,
     /// Whether to flush all uncommitted modifications to the storage engine
     /// before ingesting this record. This is currently only used for legacy PG
     /// database creations which read pages from a template database. Such WAL
@@ -62,6 +67,7 @@ pub struct InterpretedWalRecord {

 /// The interpreted part of the Postgres WAL record which requires metadata
 /// writes to the underlying storage engine.
+#[derive(Serialize, Deserialize)]
 pub enum MetadataRecord {
     Heapam(HeapamRecord),
     Neonrmgr(NeonrmgrRecord),
@@ -77,10 +83,12 @@ pub enum MetadataRecord {
     Replorigin(ReploriginRecord),
 }

+#[derive(Serialize, Deserialize)]
 pub enum HeapamRecord {
     ClearVmBits(ClearVmBits),
 }

+#[derive(Serialize, Deserialize)]
 pub struct ClearVmBits {
     pub new_heap_blkno: Option<u32>,
     pub old_heap_blkno: Option<u32>,
@@ -88,24 +96,29 @@ pub struct ClearVmBits {
     pub flags: u8,
 }

+#[derive(Serialize, Deserialize)]
 pub enum NeonrmgrRecord {
     ClearVmBits(ClearVmBits),
 }

+#[derive(Serialize, Deserialize)]
 pub enum SmgrRecord {
     Create(SmgrCreate),
     Truncate(XlSmgrTruncate),
 }

+#[derive(Serialize, Deserialize)]
 pub struct SmgrCreate {
     pub rel: RelTag,
 }

+#[derive(Serialize, Deserialize)]
 pub enum DbaseRecord {
     Create(DbaseCreate),
     Drop(DbaseDrop),
 }

+#[derive(Serialize, Deserialize)]
 pub struct DbaseCreate {
     pub db_id: Oid,
     pub tablespace_id: Oid,
@@ -113,27 +126,32 @@ pub struct DbaseCreate {
     pub src_tablespace_id: Oid,
 }

+#[derive(Serialize, Deserialize)]
 pub struct DbaseDrop {
     pub db_id: Oid,
     pub tablespace_ids: Vec<Oid>,
 }

+#[derive(Serialize, Deserialize)]
 pub enum ClogRecord {
     ZeroPage(ClogZeroPage),
     Truncate(ClogTruncate),
 }

+#[derive(Serialize, Deserialize)]
 pub struct ClogZeroPage {
     pub segno: u32,
     pub rpageno: u32,
 }

+#[derive(Serialize, Deserialize)]
 pub struct ClogTruncate {
     pub pageno: u32,
     pub oldest_xid: TransactionId,
     pub oldest_xid_db: Oid,
 }

+#[derive(Serialize, Deserialize)]
 pub enum XactRecord {
     Commit(XactCommon),
     Abort(XactCommon),
@@ -142,6 +160,7 @@ pub enum XactRecord {
     Prepare(XactPrepare),
 }

+#[derive(Serialize, Deserialize)]
 pub struct XactCommon {
     pub parsed: XlXactParsedRecord,
     pub origin_id: u16,
@@ -150,61 +169,73 @@ pub struct XactCommon {
     pub lsn: Lsn,
 }

+#[derive(Serialize, Deserialize)]
 pub struct XactPrepare {
     pub xl_xid: TransactionId,
     pub data: Bytes,
 }

+#[derive(Serialize, Deserialize)]
 pub enum MultiXactRecord {
     ZeroPage(MultiXactZeroPage),
     Create(XlMultiXactCreate),
     Truncate(XlMultiXactTruncate),
 }

+#[derive(Serialize, Deserialize)]
 pub struct MultiXactZeroPage {
     pub slru_kind: SlruKind,
     pub segno: u32,
     pub rpageno: u32,
 }

+#[derive(Serialize, Deserialize)]
 pub enum RelmapRecord {
     Update(RelmapUpdate),
 }

+#[derive(Serialize, Deserialize)]
 pub struct RelmapUpdate {
     pub update: XlRelmapUpdate,
     pub buf: Bytes,
 }

+#[derive(Serialize, Deserialize)]
 pub enum XlogRecord {
     Raw(RawXlogRecord),
 }

+#[derive(Serialize, Deserialize)]
 pub struct RawXlogRecord {
     pub info: u8,
     pub lsn: Lsn,
     pub buf: Bytes,
 }

+#[derive(Serialize, Deserialize)]
 pub enum LogicalMessageRecord {
     Put(PutLogicalMessage),
     #[cfg(feature = "testing")]
     Failpoint,
 }

+#[derive(Serialize, Deserialize)]
 pub struct PutLogicalMessage {
     pub path: String,
     pub buf: Bytes,
 }

+#[derive(Serialize, Deserialize)]
 pub enum StandbyRecord {
     RunningXacts(StandbyRunningXacts),
 }

+#[derive(Serialize, Deserialize)]
 pub struct StandbyRunningXacts {
     pub oldest_running_xid: TransactionId,
 }

+#[derive(Serialize, Deserialize)]
 pub enum ReploriginRecord {
     Set(XlReploriginSet),
     Drop(XlReploriginDrop),
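The doc comment above explains the one case where next_record_lsn is not the record's end: an XLOG SWITCH pads out the rest of the current WAL segment. A hypothetical illustration of that alignment, assuming the default 16 MiB segment size (not code from the repo):

    // Illustrative only: after an XLOG SWITCH, the next record starts at the
    // next WAL segment boundary rather than right after the switch record.
    const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024; // assumed default segment size

    fn next_record_lsn_after_switch(end_lsn: u64) -> u64 {
        // Round up to the next segment boundary.
        end_lsn.div_ceil(WAL_SEGMENT_SIZE) * WAL_SEGMENT_SIZE
    }

    fn main() {
        // A record ending mid-segment: the next record begins at the 16 MiB mark.
        assert_eq!(next_record_lsn_after_switch(0x12345), 0x100_0000);
        // A record ending exactly on a boundary stays put.
        assert_eq!(next_record_lsn_after_switch(0x100_0000), 0x100_0000);
    }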
@@ -16,6 +16,7 @@ use pageserver_api::shard::ShardIdentity;
 use pageserver_api::{key::CompactKey, value::Value};
 use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord};
 use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ};
+use serde::{Deserialize, Serialize};
 use utils::bin_ser::BeSer;
 use utils::lsn::Lsn;

@@ -29,6 +30,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 /// relation sizes. In the case of "observed" values, we only need to know
 /// the key and LSN, so two types of metadata are supported to save on network
 /// bandwidth.
+#[derive(Serialize, Deserialize)]
 pub enum ValueMeta {
     Serialized(SerializedValueMeta),
     Observed(ObservedValueMeta),
@@ -75,6 +77,7 @@ impl PartialEq for OrderedValueMeta {
 impl Eq for OrderedValueMeta {}

 /// Metadata for a [`Value`] serialized into the batch.
+#[derive(Serialize, Deserialize)]
 pub struct SerializedValueMeta {
     pub key: CompactKey,
     pub lsn: Lsn,
@@ -86,12 +89,14 @@ pub struct SerializedValueMeta {
 }

 /// Metadata for a [`Value`] observed by the batch
+#[derive(Serialize, Deserialize)]
 pub struct ObservedValueMeta {
     pub key: CompactKey,
     pub lsn: Lsn,
 }

 /// Batch of serialized [`Value`]s.
+#[derive(Serialize, Deserialize)]
 pub struct SerializedValueBatch {
     /// [`Value`]s serialized in EphemeralFile's native format,
     /// ready for disk write by the pageserver
@@ -132,7 +137,7 @@ impl SerializedValueBatch {
     pub(crate) fn from_decoded_filtered(
         decoded: DecodedWALRecord,
         shard: &ShardIdentity,
-        record_end_lsn: Lsn,
+        next_record_lsn: Lsn,
         pg_version: u32,
     ) -> anyhow::Result<SerializedValueBatch> {
         // First determine how big the buffer needs to be and allocate it up-front.
@@ -156,13 +161,17 @@ impl SerializedValueBatch {
             let key = rel_block_to_key(rel, blk.blkno);

             if !key.is_valid_key_on_write_path() {
-                anyhow::bail!("Unsupported key decoded at LSN {}: {}", record_end_lsn, key);
+                anyhow::bail!(
+                    "Unsupported key decoded at LSN {}: {}",
+                    next_record_lsn,
+                    key
+                );
             }

             let key_is_local = shard.is_key_local(&key);

             tracing::debug!(
-                lsn=%record_end_lsn,
+                lsn=%next_record_lsn,
                 key=%key,
                 "ingest: shard decision {}",
                 if !key_is_local { "drop" } else { "keep" },
@@ -174,7 +183,7 @@ impl SerializedValueBatch {
                 // its blkno in case it implicitly extends a relation.
                 metadata.push(ValueMeta::Observed(ObservedValueMeta {
                     key: key.to_compact(),
-                    lsn: record_end_lsn,
+                    lsn: next_record_lsn,
                 }))
             }
@@ -205,7 +214,7 @@ impl SerializedValueBatch {
             // that would corrupt the page.
             //
             if !page_is_new(&image) {
-                page_set_lsn(&mut image, record_end_lsn)
+                page_set_lsn(&mut image, next_record_lsn)
             }
             assert_eq!(image.len(), BLCKSZ as usize);

@@ -224,12 +233,12 @@ impl SerializedValueBatch {

             metadata.push(ValueMeta::Serialized(SerializedValueMeta {
                 key: key.to_compact(),
-                lsn: record_end_lsn,
+                lsn: next_record_lsn,
                 batch_offset: relative_off,
                 len: val_ser_size,
                 will_init: val.will_init(),
             }));
-            max_lsn = std::cmp::max(max_lsn, record_end_lsn);
+            max_lsn = std::cmp::max(max_lsn, next_record_lsn);
             len += 1;
         }
@@ -167,6 +167,7 @@ fn criterion_benchmark(c: &mut Criterion) {
         16384,
         virtual_file::io_engine_for_bench(),
         conf.virtual_file_io_mode,
+        virtual_file::SyncMode::Sync,
     );
     page_cache::init(conf.page_cache_size);
@@ -138,6 +138,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
         10,
         virtual_file::api::IoEngineKind::StdFs,
         IoMode::preferred(),
+        virtual_file::SyncMode::Sync,
     );
     pageserver::page_cache::init(100);
@@ -51,6 +51,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
         10,
         virtual_file::api::IoEngineKind::StdFs,
         IoMode::preferred(),
+        virtual_file::SyncMode::Sync,
     );
     page_cache::init(100);
     let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
@@ -65,6 +66,7 @@ async fn read_image_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
         10,
         virtual_file::api::IoEngineKind::StdFs,
         IoMode::preferred(),
+        virtual_file::SyncMode::Sync,
     );
     page_cache::init(100);
     let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
@@ -171,6 +173,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
         10,
         virtual_file::api::IoEngineKind::StdFs,
         IoMode::preferred(),
+        virtual_file::SyncMode::Sync,
     );
     pageserver::page_cache::init(100);
@@ -209,6 +209,7 @@ async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
         10,
         virtual_file::api::IoEngineKind::StdFs,
         IoMode::preferred(),
+        virtual_file::SyncMode::Sync,
     );
     page_cache::init(100);
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
@@ -171,11 +171,18 @@ fn main() -> anyhow::Result<()> {
     let scenario = failpoint_support::init();

     // Basic initialization of things that don't change after startup
+    tracing::info!("Initializing virtual_file...");
     virtual_file::init(
         conf.max_file_descriptors,
         conf.virtual_file_io_engine,
         conf.virtual_file_io_mode,
+        if conf.no_sync {
+            virtual_file::SyncMode::UnsafeNoSync
+        } else {
+            virtual_file::SyncMode::Sync
+        },
     );
+    tracing::info!("Initializing page_cache...");
     page_cache::init(conf.page_cache_size);

     start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
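The new argument threads a sync policy down to the file layer so a no-sync mode can skip fsync for tests. A minimal sketch of the pattern; the enum name and variants come from the diff, while the helper function is hypothetical:

    use std::fs::File;

    // Variants as they appear in the diff above.
    enum SyncMode {
        Sync,
        UnsafeNoSync,
    }

    // Hypothetical helper showing how such a mode would gate durability.
    fn maybe_sync(file: &File, mode: &SyncMode) -> std::io::Result<()> {
        match mode {
            SyncMode::Sync => file.sync_all(), // fsync: data survives a crash
            SyncMode::UnsafeNoSync => Ok(()),  // skip fsync: fast, test-only
        }
    }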
@@ -324,6 +324,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
                 .into_boxed_str(),
             ),
             a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()),
+            Cancelled => ApiError::ResourceUnavailable("shutting down".into()),
             Other(e) => ApiError::InternalServerError(e),
         }
     }
@@ -39,6 +39,7 @@ use remote_timeline_client::UploadQueueNotReadyError;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::future::Future;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::Weak;
|
||||
use std::time::SystemTime;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
@@ -524,6 +525,9 @@ pub struct OffloadedTimeline {
|
||||
/// Prevent two tasks from deleting the timeline at the same time. If held, the
|
||||
/// timeline is being deleted. If 'true', the timeline has already been deleted.
|
||||
pub delete_progress: TimelineDeleteProgress,
|
||||
|
||||
/// Part of the `OffloadedTimeline` object's lifecycle: this needs to be set before we drop it
|
||||
pub deleted_from_ancestor: AtomicBool,
|
||||
}
|
||||
|
||||
impl OffloadedTimeline {
|
||||
@@ -533,9 +537,16 @@ impl OffloadedTimeline {
|
||||
/// the timeline is not in a stopped state.
|
||||
/// Panics if the timeline is not archived.
|
||||
fn from_timeline(timeline: &Timeline) -> Result<Self, UploadQueueNotReadyError> {
|
||||
let ancestor_retain_lsn = timeline
|
||||
.get_ancestor_timeline_id()
|
||||
.map(|_timeline_id| timeline.get_ancestor_lsn());
|
||||
let (ancestor_retain_lsn, ancestor_timeline_id) =
|
||||
if let Some(ancestor_timeline) = timeline.ancestor_timeline() {
|
||||
let ancestor_lsn = timeline.get_ancestor_lsn();
|
||||
let ancestor_timeline_id = ancestor_timeline.timeline_id;
|
||||
let mut gc_info = ancestor_timeline.gc_info.write().unwrap();
|
||||
gc_info.insert_child(timeline.timeline_id, ancestor_lsn, MaybeOffloaded::Yes);
|
||||
(Some(ancestor_lsn), Some(ancestor_timeline_id))
|
||||
} else {
|
||||
(None, None)
|
||||
};
|
||||
let archived_at = timeline
|
||||
.remote_client
|
||||
.archived_at_stopped_queue()?
|
||||
@@ -543,14 +554,17 @@ impl OffloadedTimeline {
|
||||
Ok(Self {
|
||||
tenant_shard_id: timeline.tenant_shard_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
|
||||
ancestor_timeline_id,
|
||||
ancestor_retain_lsn,
|
||||
archived_at,
|
||||
|
||||
delete_progress: timeline.delete_progress.clone(),
|
||||
deleted_from_ancestor: AtomicBool::new(false),
|
||||
})
|
||||
}
|
||||
fn from_manifest(tenant_shard_id: TenantShardId, manifest: &OffloadedTimelineManifest) -> Self {
|
||||
// We expect to reach this case in tenant loading, where the `retain_lsn` is populated in the parent's `gc_info`
|
||||
// by the `initialize_gc_info` function.
|
||||
let OffloadedTimelineManifest {
|
||||
timeline_id,
|
||||
ancestor_timeline_id,
|
||||
@@ -564,6 +578,7 @@ impl OffloadedTimeline {
|
||||
ancestor_retain_lsn,
|
||||
archived_at,
|
||||
delete_progress: TimelineDeleteProgress::default(),
|
||||
deleted_from_ancestor: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
fn manifest(&self) -> OffloadedTimelineManifest {
|
||||
@@ -581,6 +596,33 @@ impl OffloadedTimeline {
|
||||
archived_at: *archived_at,
|
||||
}
|
||||
}
|
||||
/// Delete this timeline's retain_lsn from its ancestor, if present in the given tenant
|
||||
fn delete_from_ancestor_with_timelines(
|
||||
&self,
|
||||
timelines: &std::sync::MutexGuard<'_, HashMap<TimelineId, Arc<Timeline>>>,
|
||||
) {
|
||||
if let (Some(_retain_lsn), Some(ancestor_timeline_id)) =
|
||||
(self.ancestor_retain_lsn, self.ancestor_timeline_id)
|
||||
{
|
||||
if let Some((_, ancestor_timeline)) = timelines
|
||||
.iter()
|
||||
.find(|(tid, _tl)| **tid == ancestor_timeline_id)
|
||||
{
|
||||
ancestor_timeline
|
||||
.gc_info
|
||||
.write()
|
||||
.unwrap()
|
||||
.remove_child_offloaded(self.timeline_id);
|
||||
}
|
||||
}
|
||||
self.deleted_from_ancestor.store(true, Ordering::Release);
|
||||
}
|
||||
/// Call [`Self::delete_from_ancestor_with_timelines`] instead if possible.
|
||||
///
|
||||
/// As the entire tenant is being dropped, don't bother deregistering the `retain_lsn` from the ancestor.
|
||||
fn defuse_for_tenant_drop(&self) {
|
||||
self.deleted_from_ancestor.store(true, Ordering::Release);
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for OffloadedTimeline {
|
||||
@@ -589,6 +631,17 @@ impl fmt::Debug for OffloadedTimeline {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for OffloadedTimeline {
|
||||
fn drop(&mut self) {
|
||||
if !self.deleted_from_ancestor.load(Ordering::Acquire) {
|
||||
tracing::warn!(
|
||||
"offloaded timeline {} was dropped without having cleaned it up at the ancestor",
|
||||
self.timeline_id
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|

#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
pub enum MaybeOffloaded {
    Yes,
@@ -700,6 +753,9 @@ pub enum DeleteTimelineError {
    #[error("Timeline deletion is already in progress")]
    AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>),

+   #[error("Cancelled")]
+   Cancelled,
+
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}
@@ -710,6 +766,7 @@ impl Debug for DeleteTimelineError {
            Self::NotFound => write!(f, "NotFound"),
            Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(),
            Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(),
+           Self::Cancelled => f.debug_tuple("Cancelled").finish(),
            Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
        }
    }
@@ -1527,7 +1584,7 @@ impl Tenant {
        }
        // Complete deletions for offloaded timeline id's.
        offloaded_timelines_list
-           .retain(|(offloaded_id, _offloaded)| {
+           .retain(|(offloaded_id, offloaded)| {
                // At this point, offloaded_timeline_ids has the list of all offloaded timelines
                // without a prefix in S3, so they are nonexistent.
                // In the end, existence of a timeline is finally determined by the existence of an index-part.json in remote storage.
@@ -1535,6 +1592,7 @@ impl Tenant {
                let delete = offloaded_timeline_ids.contains(offloaded_id);
                if delete {
                    tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found");
+                   offloaded.defuse_for_tenant_drop();
                }
                !delete
            });
@@ -1923,9 +1981,15 @@ impl Tenant {
            )));
        };
        let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap();
-       if offloaded_timelines.remove(&timeline_id).is_none() {
-           warn!("timeline already removed from offloaded timelines");
+       match offloaded_timelines.remove(&timeline_id) {
+           Some(offloaded) => {
+               offloaded.delete_from_ancestor_with_timelines(&timelines);
+           }
+           None => warn!("timeline already removed from offloaded timelines"),
        }

+       self.initialize_gc_info(&timelines, &offloaded_timelines, Some(timeline_id));

        Arc::clone(timeline)
    };

@@ -2663,7 +2727,7 @@ impl Tenant {
            .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping()));

        // Before activation, populate each Timeline's GcInfo with information about its children
-       self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor);
+       self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor, None);

        // Spawn gc and compaction loops. The loops will shut themselves
        // down when they notice that the tenant is inactive.
@@ -2778,8 +2842,14 @@ impl Tenant {
                    let timeline_id = timeline.timeline_id;
                    let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
                    js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
                })
            };
            });
        }
+       {
+           let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
+           timelines_offloaded.values().for_each(|timeline| {
+               timeline.defuse_for_tenant_drop();
+           });
+       }
        // test_long_timeline_create_then_tenant_delete is leaning on this message
        tracing::info!("Waiting for timelines...");
        while let Some(res) = js.join_next().await {
@@ -3763,10 +3833,13 @@ impl Tenant {
        &self,
        timelines: &std::sync::MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
        timelines_offloaded: &std::sync::MutexGuard<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
+       restrict_to_timeline: Option<TimelineId>,
    ) {
-       // This function must be called before activation: after activation timeline create/delete operations
-       // might happen, and this function is not safe to run concurrently with those.
-       assert!(!self.is_active());
+       if restrict_to_timeline.is_none() {
+           // This function must be called before activation: after activation timeline create/delete operations
+           // might happen, and this function is not safe to run concurrently with those.
+           assert!(!self.is_active());
+       }

        // Scan all timelines. For each timeline, remember the timeline ID and
        // the branch point where it was created.
@@ -3799,7 +3872,12 @@ impl Tenant {
        let horizon = self.get_gc_horizon();

        // Populate each timeline's GcInfo with information about its child branches
-       for timeline in timelines.values() {
+       let timelines_to_write = if let Some(timeline_id) = restrict_to_timeline {
+           itertools::Either::Left(timelines.get(&timeline_id).into_iter())
+       } else {
+           itertools::Either::Right(timelines.values())
+       };
+       for timeline in timelines_to_write {
            let mut branchpoints: Vec<(Lsn, TimelineId, MaybeOffloaded)> = all_branchpoints
                .remove(&timeline.timeline_id)
                .unwrap_or_default();
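The `itertools::Either` trick above lets one loop body handle both the single-timeline and the all-timelines case, because `Either<L, R>` implements `Iterator` whenever both arms do. A minimal sketch of just that pattern (the `HashMap` of strings is a stand-in for the timelines map):

use std::collections::HashMap;

use itertools::Either; // Either<L, R> is itself an Iterator when both arms are

fn values_to_visit<'a>(
    map: &'a HashMap<u64, String>,
    restrict_to: Option<u64>,
) -> impl Iterator<Item = &'a String> + 'a {
    match restrict_to {
        // Option::into_iter yields zero or one item, so a missing key just
        // means an empty iteration rather than a panic.
        Some(key) => Either::Left(map.get(&key).into_iter()),
        None => Either::Right(map.values()),
    }
}

fn main() {
    let mut map = HashMap::new();
    map.insert(1, "a".to_string());
    map.insert(2, "b".to_string());
    assert_eq!(values_to_visit(&map, Some(1)).count(), 1);
    assert_eq!(values_to_visit(&map, None).count(), 2);
    assert_eq!(values_to_visit(&map, Some(99)).count(), 0);
}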
@@ -9646,4 +9724,54 @@ mod tests {

        Ok(())
    }

+   #[cfg(feature = "testing")]
+   #[tokio::test]
+   async fn test_timeline_offload_retain_lsn() -> anyhow::Result<()> {
+       let harness = TenantHarness::create("test_timeline_offload_retain_lsn")
+           .await
+           .unwrap();
+       let (tenant, ctx) = harness.load().await;
+       let tline_parent = tenant
+           .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+           .await
+           .unwrap();
+       let tline_child = tenant
+           .branch_timeline_test(&tline_parent, NEW_TIMELINE_ID, Some(Lsn(0x20)), &ctx)
+           .await
+           .unwrap();
+       {
+           let gc_info_parent = tline_parent.gc_info.read().unwrap();
+           assert_eq!(
+               gc_info_parent.retain_lsns,
+               vec![(Lsn(0x20), tline_child.timeline_id, MaybeOffloaded::No)]
+           );
+       }
+       // We have to directly call the remote_client instead of using the archive function to avoid constructing a broker client...
+       tline_child
+           .remote_client
+           .schedule_index_upload_for_timeline_archival_state(TimelineArchivalState::Archived)
+           .unwrap();
+       tline_child.remote_client.wait_completion().await.unwrap();
+       offload_timeline(&tenant, &tline_child)
+           .instrument(tracing::info_span!(parent: None, "offload_test", tenant_id=%"test", shard_id=%"test", timeline_id=%"test"))
+           .await.unwrap();
+       let child_timeline_id = tline_child.timeline_id;
+       Arc::try_unwrap(tline_child).unwrap();

+       {
+           let gc_info_parent = tline_parent.gc_info.read().unwrap();
+           assert_eq!(
+               gc_info_parent.retain_lsns,
+               vec![(Lsn(0x20), child_timeline_id, MaybeOffloaded::Yes)]
+           );
+       }

+       tenant
+           .get_offloaded_timeline(child_timeline_id)
+           .unwrap()
+           .defuse_for_tenant_drop();

+       Ok(())
+   }
}

@@ -243,7 +243,7 @@ use self::index::IndexPart;
use super::metadata::MetadataUpdate;
use super::storage_layer::{Layer, LayerName, ResidentLayer};
use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
-use super::Generation;
+use super::{DeleteTimelineError, Generation};

pub(crate) use download::{
    download_index_part, download_tenant_manifest, is_temp_download_file,
@@ -1550,15 +1550,17 @@ impl RemoteTimelineClient {
    /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfully set.
    /// The function deletes layer files one by one, then lists the prefix to see if we leaked something;
    /// deletes leaked files if any and proceeds with deletion of the index file at the end.
-   pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
+   pub(crate) async fn delete_all(self: &Arc<Self>) -> Result<(), DeleteTimelineError> {
        debug_assert_current_span_has_tenant_and_timeline_id();

        let layers: Vec<RemotePath> = {
            let mut locked = self.upload_queue.lock().unwrap();
-           let stopped = locked.stopped_mut()?;
+           let stopped = locked.stopped_mut().map_err(DeleteTimelineError::Other)?;

            if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) {
-               anyhow::bail!("deleted_at is not set")
+               return Err(DeleteTimelineError::Other(anyhow::anyhow!(
+                   "deleted_at is not set"
+               )));
            }

            debug_assert!(stopped.upload_queue_for_deletion.no_pending_work());
@@ -1593,7 +1595,10 @@ impl RemoteTimelineClient {
        };

        let layer_deletion_count = layers.len();
-       self.deletion_queue_client.push_immediate(layers).await?;
+       self.deletion_queue_client
+           .push_immediate(layers)
+           .await
+           .map_err(|_| DeleteTimelineError::Cancelled)?;

        // Delete the initdb.tar.zst, which is not always present, but deletion attempts of
        // nonexistent objects are not considered errors.
@@ -1601,7 +1606,8 @@ impl RemoteTimelineClient {
            remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id);
        self.deletion_queue_client
            .push_immediate(vec![initdb_path])
-           .await?;
+           .await
+           .map_err(|_| DeleteTimelineError::Cancelled)?;

        // Do not delete the index part yet; it is needed for a possible retry. If we remove it first
        // and the retry arrives at a different pageserver, there won't be any traces of it on remote storage.
@@ -1609,7 +1615,9 @@ impl RemoteTimelineClient {

        // Execute all pending deletions, so that when we proceed to do a listing below, we aren't
        // taking the burden of listing all the layers that we already know we should delete.
-       self.flush_deletion_queue().await?;
+       self.flush_deletion_queue()
+           .await
+           .map_err(|_| DeleteTimelineError::Cancelled)?;

        let cancel = shutdown_token();

@@ -1672,28 +1680,32 @@ impl RemoteTimelineClient {
        if !remaining_layers.is_empty() {
            self.deletion_queue_client
                .push_immediate(remaining_layers)
-               .await?;
+               .await
+               .map_err(|_| DeleteTimelineError::Cancelled)?;
        }

        fail::fail_point!("timeline-delete-before-index-delete", |_| {
-           Err(anyhow::anyhow!(
+           Err(DeleteTimelineError::Other(anyhow::anyhow!(
                "failpoint: timeline-delete-before-index-delete"
-           ))?
+           )))?
        });

        debug!("enqueuing index part deletion");
        self.deletion_queue_client
            .push_immediate([latest_index].to_vec())
-           .await?;
+           .await
+           .map_err(|_| DeleteTimelineError::Cancelled)?;

        // Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait
        // for a flush to a persistent deletion list so that we may be sure deletion will occur.
-       self.flush_deletion_queue().await?;
+       self.flush_deletion_queue()
+           .await
+           .map_err(|_| DeleteTimelineError::Cancelled)?;

        fail::fail_point!("timeline-delete-after-index-delete", |_| {
-           Err(anyhow::anyhow!(
+           Err(DeleteTimelineError::Other(anyhow::anyhow!(
                "failpoint: timeline-delete-after-index-delete"
-           ))?
+           )))?
        });

        info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");

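The common theme of the changes above is replacing `anyhow`-typed failures with a dedicated error enum so that callers can tell cancellation apart from genuine errors. A hedged sketch of the shape (the names are hypothetical; only the `Cancelled`/`Other` split mirrors the diff):

use thiserror::Error;

#[derive(Error, Debug)]
enum DeleteError {
    #[error("Cancelled")]
    Cancelled,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

struct QueueStopped; // the only failure our toy push() can report

fn push(stopped: bool) -> Result<(), QueueStopped> {
    if stopped { Err(QueueStopped) } else { Ok(()) }
}

fn delete_all(stopped: bool) -> Result<(), DeleteError> {
    // Shutdown is not an error condition for deletion: surface it as
    // Cancelled instead of wrapping it into an anyhow chain.
    push(stopped).map_err(|_| DeleteError::Cancelled)?;
    Ok(())
}

fn main() {
    match delete_all(true) {
        Err(DeleteError::Cancelled) => println!("shutdown during deletion"),
        Err(e) => eprintln!("error: {e:#}"),
        Ok(()) => println!("deleted"),
    }
}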
@@ -477,8 +477,21 @@ impl GcInfo {
        self.retain_lsns.sort_by_key(|i| i.0);
    }

-   pub(super) fn remove_child(&mut self, child_id: TimelineId) {
-       self.retain_lsns.retain(|i| i.1 != child_id);
+   pub(super) fn remove_child_maybe_offloaded(
+       &mut self,
+       child_id: TimelineId,
+       maybe_offloaded: MaybeOffloaded,
+   ) {
+       self.retain_lsns
+           .retain(|i| !(i.1 == child_id && i.2 == maybe_offloaded));
    }

+   pub(super) fn remove_child_not_offloaded(&mut self, child_id: TimelineId) {
+       self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::No);
+   }

+   pub(super) fn remove_child_offloaded(&mut self, child_id: TimelineId) {
+       self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::Yes);
+   }
}
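Since `retain_lsns` entries are `(lsn, child_id, MaybeOffloaded)` tuples, removal has to match on both the child id and the offload flag; otherwise removing the offloaded entry for a child could also delete the live entry for the same child (or vice versa). A small standalone illustration of that retain semantics:

// Sketch only: u64 stands in for Lsn and TimelineId.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum MaybeOffloaded { Yes, No }

fn main() {
    let mut retain_lsns = vec![
        (0x20u64, 7u64, MaybeOffloaded::No),
        (0x30u64, 7u64, MaybeOffloaded::Yes),
    ];
    let (child_id, flag) = (7u64, MaybeOffloaded::Yes);
    // Matching on the pair removes only the offloaded entry.
    retain_lsns.retain(|i| !(i.1 == child_id && i.2 == flag));
    assert_eq!(retain_lsns, vec![(0x20, 7, MaybeOffloaded::No)]);
}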
@@ -4501,7 +4514,7 @@ impl Drop for Timeline {
        // This lock should never be poisoned, but in case it is we do a .map() instead of
        // an unwrap(), to avoid panicking in a destructor and thereby aborting the process.
        if let Ok(mut gc_info) = ancestor.gc_info.write() {
-           gc_info.remove_child(self.timeline_id)
+           gc_info.remove_child_not_offloaded(self.timeline_id)
        }
    }
}
@@ -5030,7 +5043,7 @@ impl Timeline {

            // 1. Is it newer than GC horizon cutoff point?
            if l.get_lsn_range().end > space_cutoff {
-               debug!(
+               info!(
                    "keeping {} because it's newer than space_cutoff {}",
                    l.layer_name(),
                    space_cutoff,
@@ -5041,7 +5054,7 @@ impl Timeline {

            // 2. It is newer than PiTR cutoff point?
            if l.get_lsn_range().end > time_cutoff {
-               debug!(
+               info!(
                    "keeping {} because it's newer than time_cutoff {}",
                    l.layer_name(),
                    time_cutoff,
@@ -5060,7 +5073,7 @@ impl Timeline {
            for retain_lsn in &retain_lsns {
                // start_lsn is inclusive
                if &l.get_lsn_range().start <= retain_lsn {
-                   debug!(
+                   info!(
                        "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
                        l.layer_name(),
                        retain_lsn,
@@ -5075,7 +5088,7 @@ impl Timeline {
            if let Some(lsn) = &max_lsn_with_valid_lease {
                // keep if layer start <= any of the lease
                if &l.get_lsn_range().start <= lsn {
-                   debug!(
+                   info!(
                        "keeping {} because there is a valid lease preventing GC at {}",
                        l.layer_name(),
                        lsn,
@@ -5107,13 +5120,13 @@ impl Timeline {
            if !layers
                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
            {
-               debug!("keeping {} because it is the latest layer", l.layer_name());
+               info!("keeping {} because it is the latest layer", l.layer_name());
                result.layers_not_updated += 1;
                continue 'outer;
            }

            // We didn't find any reason to keep this file, so remove it.
-           debug!(
+           info!(
                "garbage collecting {} is_dropped: xx is_incremental: {}",
                l.layer_name(),
                l.is_incremental(),

@@ -5,6 +5,7 @@ use std::{

use anyhow::Context;
use pageserver_api::{models::TimelineState, shard::TenantShardId};
+use remote_storage::DownloadError;
use tokio::sync::OwnedMutexGuard;
use tracing::{error, info, info_span, instrument, Instrument};
use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
@@ -16,7 +17,7 @@ use crate::{
        metadata::TimelineMetadata,
        remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
        CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
-       TimelineOrOffloaded,
+       TenantManifestError, TimelineOrOffloaded,
    },
    virtual_file::MaybeFatalIo,
};
@@ -110,13 +111,6 @@ pub(super) async fn delete_local_timeline_directory(
    info!("finished deleting layer files, releasing locks");
}

-/// Removes remote layers and an index file after them.
-async fn delete_remote_layers_and_index(
-    remote_client: &Arc<RemoteTimelineClient>,
-) -> anyhow::Result<()> {
-    remote_client.delete_all().await.context("delete_all")
-}
-
/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
async fn remove_maybe_offloaded_timeline_from_tenant(
@@ -147,9 +141,10 @@ async fn remove_maybe_offloaded_timeline_from_tenant(
            );
        }
        TimelineOrOffloaded::Offloaded(timeline) => {
-           timelines_offloaded
+           let offloaded_timeline = timelines_offloaded
                .remove(&timeline.timeline_id)
                .expect("timeline that we were deleting was concurrently removed from 'timelines_offloaded' map");
+           offloaded_timeline.delete_from_ancestor_with_timelines(&timelines);
        }
    }

@@ -221,11 +216,24 @@ impl DeleteTimelineFlow {
            None => {
                let remote_client = tenant
                    .build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone());
-               let result = remote_client
+               let result = match remote_client
                    .download_index_file(&tenant.cancel)
                    .instrument(info_span!("download_index_file"))
                    .await
-                   .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!("error: {:?}", e)))?;
+               {
+                   Ok(r) => r,
+                   Err(DownloadError::NotFound) => {
+                       // Deletion is already complete
+                       tracing::info!("Timeline already deleted in remote storage");
+                       return Ok(());
+                   }
+                   Err(e) => {
+                       return Err(DeleteTimelineError::Other(anyhow::anyhow!(
+                           "error: {:?}",
+                           e
+                       )));
+                   }
+               };
                let index_part = match result {
                    MaybeDeletedIndexPart::Deleted(p) => {
                        tracing::info!("Timeline already set as deleted in remote index");
@@ -406,7 +414,12 @@ impl DeleteTimelineFlow {
            "timeline_delete",
            async move {
                if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
-                   error!("Error: {err:#}");
+                   // Only log as an error if it's not a cancellation.
+                   if matches!(err, DeleteTimelineError::Cancelled) {
+                       info!("Shutdown during timeline deletion");
+                   } else {
+                       error!("Error: {err:#}");
+                   }
                    if let TimelineOrOffloaded::Timeline(timeline) = timeline {
                        timeline.set_broken(format!("{err:#}"))
                    }
@@ -438,7 +451,7 @@ impl DeleteTimelineFlow {
            Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
        });

-       delete_remote_layers_and_index(&remote_client).await?;
+       remote_client.delete_all().await?;

        pausable_failpoint!("in_progress_delete");

@@ -449,10 +462,10 @@ impl DeleteTimelineFlow {
        // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
        // However, we handle this case in tenant loading code so the next time we attach, the issue is
        // resolved.
-       tenant
-           .store_tenant_manifest()
-           .await
-           .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!(e)))?;
+       tenant.store_tenant_manifest().await.map_err(|e| match e {
+           TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
+           _ => DeleteTimelineError::Other(e.into()),
+       })?;

        *guard = Self::Finished;


@@ -66,7 +66,7 @@ pub(crate) async fn offload_timeline(
    let conf = &tenant.conf;
    delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await;

-   remove_timeline_from_tenant(tenant, &timeline, &guard);
+   let remaining_refcount = remove_timeline_from_tenant(tenant, &timeline, &guard);

    {
        let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
@@ -87,16 +87,20 @@ pub(crate) async fn offload_timeline(
    // not our actual state of offloaded timelines.
    tenant.store_tenant_manifest().await?;

+   tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})");

    Ok(())
}

/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
+///
+/// Returns the strong count of the timeline `Arc`
fn remove_timeline_from_tenant(
    tenant: &Tenant,
    timeline: &Timeline,
    _: &DeletionGuard, // using it as a witness
-) {
+) -> usize {
    // Remove the timeline from the map.
    let mut timelines = tenant.timelines.lock().unwrap();
    let children_exist = timelines
@@ -109,7 +113,9 @@ fn remove_timeline_from_tenant(
        panic!("Timeline grew children while we removed layer files");
    }

-   timelines
+   let timeline = timelines
        .remove(&timeline.timeline_id)
        .expect("timeline that we were deleting was concurrently removed from 'timelines' map");

+   Arc::strong_count(&timeline)
}
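Returning `Arc::strong_count` of the just-removed entry, as above, is a cheap way to report whether the offloaded timeline's memory will actually be freed or is still pinned elsewhere. A small sketch of the idea:

use std::collections::HashMap;
use std::sync::Arc;

fn remove_and_count(map: &mut HashMap<u64, Arc<String>>, id: u64) -> usize {
    let entry = map.remove(&id).expect("entry was concurrently removed");
    // How many strong references remain, counting the one we just took out.
    Arc::strong_count(&entry)
}

fn main() {
    let mut map = HashMap::new();
    let tl = Arc::new("timeline".to_string());
    map.insert(1, Arc::clone(&tl));
    // `tl` is still held here, so the count is 2: our binding plus the
    // Arc just removed from the map. A count of 1 would mean we hold the
    // last reference and the allocation is about to be dropped.
    assert_eq!(remove_and_count(&mut map, 1), 2);
}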
@@ -331,11 +331,11 @@ pub(super) async fn handle_walreceiver_connection(
        Ok(())
    }

-   while let Some((record_end_lsn, recdata)) = waldecoder.poll_decode()? {
+   while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? {
        // It is important to deal with the aligned records as lsn in getPage@LSN is
        // aligned and can be several bytes bigger. Without this alignment we are
        // at risk of hitting a deadlock.
-       if !record_end_lsn.is_aligned() {
+       if !next_record_lsn.is_aligned() {
            return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
        }

@@ -343,7 +343,7 @@ pub(super) async fn handle_walreceiver_connection(
        let interpreted = InterpretedWalRecord::from_bytes_filtered(
            recdata,
            modification.tline.get_shard_identity(),
-           record_end_lsn,
+           next_record_lsn,
            modification.tline.pg_version,
        )?;

@@ -367,10 +367,10 @@ pub(super) async fn handle_walreceiver_connection(
            .ingest_record(interpreted, &mut modification, &ctx)
            .await
            .with_context(|| {
-               format!("could not ingest record at {record_end_lsn}")
+               format!("could not ingest record at {next_record_lsn}")
            })?;
        if !ingested {
-           tracing::debug!("ingest: filtered out record @ LSN {record_end_lsn}");
+           tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
            WAL_INGEST.records_filtered.inc();
            filtered_records += 1;
        }
@@ -380,7 +380,7 @@ pub(super) async fn handle_walreceiver_connection(
        // to timeout the tests.
        fail_point!("walreceiver-after-ingest");

-       last_rec_lsn = record_end_lsn;
+       last_rec_lsn = next_record_lsn;

        // Commit every ingest_batch_size records. Even if we filtered out
        // all records, we still need to call commit to advance the LSN.

@@ -175,10 +175,16 @@ impl VirtualFile {
    }

    pub async fn sync_all(&self) -> Result<(), Error> {
+       if SYNC_MODE.load(std::sync::atomic::Ordering::Relaxed) == SyncMode::UnsafeNoSync as u8 {
+           return Ok(());
+       }
        self.inner.sync_all().await
    }

    pub async fn sync_data(&self) -> Result<(), Error> {
+       if SYNC_MODE.load(std::sync::atomic::Ordering::Relaxed) == SyncMode::UnsafeNoSync as u8 {
+           return Ok(());
+       }
        self.inner.sync_data().await
    }

@@ -233,6 +239,27 @@ impl VirtualFile {
    }
}

+/// Indicates whether to enable fsync, fdatasync, or O_SYNC/O_DSYNC when writing
+/// files. Switching this off is unsafe and only used for testing on machines
+/// with slow drives.
+#[repr(u8)]
+pub enum SyncMode {
+    Sync,
+    UnsafeNoSync,
+}
+
+impl TryFrom<u8> for SyncMode {
+    type Error = u8;
+
+    fn try_from(value: u8) -> Result<Self, Self::Error> {
+        Ok(match value {
+            v if v == (SyncMode::Sync as u8) => SyncMode::Sync,
+            v if v == (SyncMode::UnsafeNoSync as u8) => SyncMode::UnsafeNoSync,
+            x => return Err(x),
+        })
+    }
+}
+
///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
/// the underlying file is closed if the system is low on file descriptors,
@@ -1332,12 +1359,13 @@ impl OpenFiles {
/// server startup.
///
#[cfg(not(test))]
-pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode) {
+pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode, sync_mode: SyncMode) {
    if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
        panic!("virtual_file::init called twice");
    }
    set_io_mode(mode);
    io_engine::init(engine);
+   SYNC_MODE.store(sync_mode as u8, std::sync::atomic::Ordering::Relaxed);
    crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
}

@@ -1379,6 +1407,9 @@ pub(crate) fn set_io_mode(mode: IoMode) {
pub(crate) fn get_io_mode() -> IoMode {
    IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap()
}

+static SYNC_MODE: AtomicU8 = AtomicU8::new(SyncMode::Sync as u8);

#[cfg(test)]
mod tests {
    use crate::context::DownloadBehavior;

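`SyncMode` above follows the same pattern as the existing `IO_MODE`: a `#[repr(u8)]` enum stored in a global `AtomicU8`, with `Relaxed` loads on the hot path so fsync can be toggled process-wide without locking. A self-contained sketch of just that mechanism:

use std::sync::atomic::{AtomicU8, Ordering};

#[repr(u8)]
enum SyncMode {
    Sync,
    UnsafeNoSync,
}

// The enum is stored as its u8 discriminant; Sync is the safe default.
static SYNC_MODE: AtomicU8 = AtomicU8::new(SyncMode::Sync as u8);

fn set_sync_mode(mode: SyncMode) {
    SYNC_MODE.store(mode as u8, Ordering::Relaxed);
}

fn should_skip_fsync() -> bool {
    // Relaxed is enough: this is an independent process-wide flag, not
    // used to synchronize access to other data.
    SYNC_MODE.load(Ordering::Relaxed) == SyncMode::UnsafeNoSync as u8
}

fn main() {
    assert!(!should_skip_fsync());
    set_sync_mode(SyncMode::UnsafeNoSync);
    assert!(should_skip_fsync());
}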
@@ -154,7 +154,7 @@ impl WalIngest {
        WAL_INGEST.records_received.inc();
        let prev_len = modification.len();

-       modification.set_lsn(interpreted.end_lsn)?;
+       modification.set_lsn(interpreted.next_record_lsn)?;

        if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) {
            // Records of this type should always be preceded by a commit(), as they
@@ -587,11 +587,29 @@ impl WalIngest {
            forknum: VISIBILITYMAP_FORKNUM,
        };

-       let mut vm_page_no = blkno / pg_constants::VM_HEAPBLOCKS_PER_PAGE;
-       if blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 {
-           // Tail of last remaining vm page has to be zeroed.
-           // We are not precise here and instead of digging in VM bitmap format just clear the whole page.
-           modification.put_rel_page_image_zero(rel, vm_page_no)?;
+       // last remaining block, byte, and bit
+       let mut vm_page_no = blkno / (pg_constants::VM_HEAPBLOCKS_PER_PAGE as u32);
+       let trunc_byte = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE
+           / pg_constants::VM_HEAPBLOCKS_PER_BYTE;
+       let trunc_offs = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_BYTE
+           * pg_constants::VM_BITS_PER_HEAPBLOCK;
+
+       // Unless the new size is exactly at a visibility map page boundary, the
+       // tail bits in the last remaining map page, representing truncated heap
+       // blocks, need to be cleared. This is not only tidy, but also necessary
+       // because we don't get a chance to clear the bits if the heap is extended
+       // again.
+       if (trunc_byte != 0 || trunc_offs != 0)
+           && self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no))
+       {
+           modification.put_rel_wal_record(
+               rel,
+               vm_page_no,
+               NeonWalRecord::TruncateVisibilityMap {
+                   trunc_byte,
+                   trunc_offs,
+               },
+           )?;
+           vm_page_no += 1;
        }
        let nblocks = get_relsize(modification, rel, ctx).await?;
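The page/byte/bit arithmetic above, and the tail masking that the redo side applies below, are easier to check with concrete numbers. An illustrative sketch with deliberately tiny, made-up constants (the real values come from `pg_constants` and the 8 KiB page geometry; two VM bits per heap block, four heap blocks per map byte):

const VM_BITS_PER_HEAPBLOCK: usize = 2;
const VM_HEAPBLOCKS_PER_BYTE: usize = 4;
const VM_HEAPBLOCKS_PER_PAGE: usize = 16; // tiny "page" for the example

/// For the new relation size `blkno`, find the VM page, the byte within
/// that page, and the bit offset within that byte where truncation starts.
fn vm_truncate_point(blkno: usize) -> (usize, usize, usize) {
    let vm_page_no = blkno / VM_HEAPBLOCKS_PER_PAGE;
    let trunc_byte = blkno % VM_HEAPBLOCKS_PER_PAGE / VM_HEAPBLOCKS_PER_BYTE;
    let trunc_offs = blkno % VM_HEAPBLOCKS_PER_BYTE * VM_BITS_PER_HEAPBLOCK;
    (vm_page_no, trunc_byte, trunc_offs)
}

/// The redo step: zero every byte past the boundary, then mask the
/// boundary byte with ((1 << trunc_offs) - 1) to keep only surviving bits.
fn truncate_vm_tail(map: &mut [u8], trunc_byte: usize, trunc_offs: usize) {
    map[trunc_byte + 1..].fill(0);
    map[trunc_byte] &= (1u8 << trunc_offs) - 1; // trunc_offs is always < 8
}

fn main() {
    // Heap block 21 is block 5 of VM page 1: byte 1, bit offset 2.
    assert_eq!(vm_truncate_point(21), (1, 1, 2));
    // A size exactly on a page boundary has byte 0, bit 0: nothing to clear.
    assert_eq!(vm_truncate_point(16), (1, 0, 0));
    let mut map = [0xff_u8; 4];
    truncate_vm_tail(&mut map, 1, 2);
    assert_eq!(map, [0xff, 0b0000_0011, 0, 0]);
}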
@@ -42,6 +42,34 @@ pub(crate) fn apply_in_neon(
        } => {
            anyhow::bail!("tried to pass postgres wal record to neon WAL redo");
        }
+       //
+       // Code copied from PostgreSQL `visibilitymap_prepare_truncate` function in `visibilitymap.c`
+       //
+       NeonWalRecord::TruncateVisibilityMap {
+           trunc_byte,
+           trunc_offs,
+       } => {
+           // sanity check that this is modifying the correct relation
+           let (rel, _) = key.to_rel_block().context("invalid record")?;
+           assert!(
+               rel.forknum == VISIBILITYMAP_FORKNUM,
+               "TruncateVisibilityMap record on unexpected rel {}",
+               rel
+           );
+           let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
+           map[*trunc_byte + 1..].fill(0u8);
+           /*----
+            * Mask out the unwanted bits of the last remaining byte.
+            *
+            * ((1 << 0) - 1) = 00000000
+            * ((1 << 1) - 1) = 00000001
+            * ...
+            * ((1 << 6) - 1) = 00111111
+            * ((1 << 7) - 1) = 01111111
+            *----
+            */
+           map[*trunc_byte] &= (1 << *trunc_offs) - 1;
+       }
        NeonWalRecord::ClearVisibilityMapFlags {
            new_heap_blkno,
            old_heap_blkno,

@@ -512,7 +512,7 @@ neon_shmem_startup_hook(void)
	if (prev_shmem_startup_hook)
		prev_shmem_startup_hook();

-#if PG_PG_MAJORVERSION_NUM >= 17
+#if PG_MAJORVERSION_NUM >= 17
	WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance");
	WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read");
	WAIT_EVENT_NEON_LFC_TRUNCATE = WaitEventExtensionNew("Neon/FileCache_Truncate");
safekeeper/spec/.gitignore (new file, 3 lines, vendored)
@@ -0,0 +1,3 @@
*TTrace*
*.toolbox/
states/

safekeeper/spec/MCProposerAcceptorReconfig.tla (new file, 41 lines)
@@ -0,0 +1,41 @@
---- MODULE MCProposerAcceptorReconfig ----
EXTENDS TLC, ProposerAcceptorReconfig

\* Augments the spec with model checking constraints.

\* It slightly duplicates MCProposerAcceptorStatic, but we can't EXTENDS it
\* because it EXTENDS ProposerAcceptorStatic in turn. The duplication isn't big
\* anyway.

\* For model checking.
CONSTANTS
  max_entries,   \* model constraint: max log entries acceptor/proposer can hold
  max_term,      \* model constraint: max allowed term
  max_generation \* model constraint: max config generation

ASSUME max_entries \in Nat /\ max_term \in Nat /\ max_generation \in Nat

\* Model space constraint.
StateConstraint == /\ \A p \in proposers:
                      /\ prop_state[p].term <= max_term
                      /\ Len(prop_state[p].wal) <= max_entries
                   /\ conf_store.generation <= max_generation

\* Sets of proposers and acceptors are symmetric because we don't take any
\* actions depending on some concrete proposer/acceptor (like IF p = p1 THEN
\* ...)
ProposerAcceptorSymmetry == Permutations(proposers) \union Permutations(acceptors)

\* enforce order of the vars in the error trace with ALIAS
\* Note that ALIAS is supported only since version 1.8.0 which is pre-release
\* as of writing this.
Alias == [
  prop_state |-> prop_state,
  prop_conf |-> prop_conf,
  acc_state |-> acc_state,
  acc_conf |-> acc_conf,
  committed |-> committed,
  conf_store |-> conf_store
]

====
safekeeper/spec/MCProposerAcceptorStatic.tla (new file, 31 lines)
@@ -0,0 +1,31 @@
---- MODULE MCProposerAcceptorStatic ----
EXTENDS TLC, ProposerAcceptorStatic

\* Augments the spec with model checking constraints.

\* For model checking.
CONSTANTS
  max_entries, \* model constraint: max log entries acceptor/proposer can hold
  max_term     \* model constraint: max allowed term

ASSUME max_entries \in Nat /\ max_term \in Nat

\* Model space constraint.
StateConstraint == \A p \in proposers:
                     /\ prop_state[p].term <= max_term
                     /\ Len(prop_state[p].wal) <= max_entries
\* Sets of proposers and acceptors are symmetric because we don't take any
\* actions depending on some concrete proposer/acceptor (like IF p = p1 THEN
\* ...)
ProposerAcceptorSymmetry == Permutations(proposers) \union Permutations(acceptors)

\* enforce order of the vars in the error trace with ALIAS
\* Note that ALIAS is supported only since version 1.8.0 which is pre-release
\* as of writing this.
Alias == [
  prop_state |-> prop_state,
  acc_state |-> acc_state,
  committed |-> committed
]

====
@@ -1,34 +0,0 @@
\* MV CONSTANT declarations
CONSTANT NULL = NULL
CONSTANTS
p1 = p1
p2 = p2
p3 = p3
a1 = a1
a2 = a2
a3 = a3
\* MV CONSTANT definitions
CONSTANT
proposers = {p1, p2}
acceptors = {a1, a2, a3}
\* SYMMETRY definition
SYMMETRY perms
\* CONSTANT definitions
CONSTANT
max_term = 3
CONSTANT
max_entries = 3
\* INIT definition
INIT
Init
\* NEXT definition
NEXT
Next
\* INVARIANT definition
INVARIANT
TypeOk
ElectionSafety
LogIsMonotonic
LogSafety
CommittedNotOverwritten
CHECK_DEADLOCK FALSE
@@ -1,363 +0,0 @@
---- MODULE ProposerAcceptorConsensus ----

\* Differences from current implementation:
\* - unified not-globally-unique epoch & term (node_id)
\* Simplifications:
\* - instant message delivery
\* - feedback is not modeled separately, commit_lsn is updated directly

EXTENDS Integers, Sequences, FiniteSets, TLC

VARIABLES
  prop_state,  \* prop_state[p] is state of proposer p
  acc_state,   \* acc_state[a] is state of acceptor a
  commit_lsns  \* map of acceptor -> commit_lsn

CONSTANT
  acceptors,
  proposers,
  max_entries, \* model constraint: max log entries acceptor/proposer can hold
  max_term     \* model constraint: max allowed term

CONSTANT NULL

ASSUME max_entries \in Nat /\ max_term \in Nat

\* For specifying symmetry set in manual cfg file, see
\* https://github.com/tlaplus/tlaplus/issues/404
perms == Permutations(proposers) \union Permutations(acceptors)

\********************************************************************************
\* Helpers
\********************************************************************************

Maximum(S) ==
  (*************************************************************************)
  (* If S is a set of numbers, then this defines Maximum(S) to be the     *)
  (* maximum of those numbers, or -1 if S is empty.                       *)
  (*************************************************************************)
  IF S = {} THEN -1
  ELSE CHOOSE n \in S : \A m \in S : n \geq m

\* minimum of numbers in the set, error if set is empty
Minimum(S) ==
  CHOOSE min \in S : \A n \in S : min <= n

\* Min of two numbers
Min(a, b) == IF a < b THEN a ELSE b

\* Set of values of function f. XXX is there such a builtin?
FValues(f) == {f[a] : a \in DOMAIN f}

\* Sort of 0 for functions
EmptyF == [x \in {} |-> 42]
IsEmptyF(f) == DOMAIN f = {}

\* Next entry proposer p will push to acceptor a or NULL.
NextEntry(p, a) ==
  IF Len(prop_state[p].wal) >= prop_state[p].next_send_lsn[a] THEN
    CHOOSE r \in FValues(prop_state[p].wal) : r.lsn = prop_state[p].next_send_lsn[a]
  ELSE
    NULL


\*****************

NumAccs == Cardinality(acceptors)

\* does acc_set form the quorum?
Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1)
\* all quorums of acceptors
Quorums == {subset \in SUBSET acceptors: Quorum(subset)}
\* flush_lsn of acceptor a.
FlushLsn(a) == Len(acc_state[a].wal)


\********************************************************************************
\* Type assertion
\********************************************************************************
\* Defining sets of all possible tuples and using them in TypeOk in usual
\* all-tuples constructor is not practical because such definitions force
\* TLC to enumerate them, while they are horribly enormous
\* (TLC screams "Attempted to construct a set with too many elements").
\* So instead check types manually.
TypeOk ==
  /\ \A p \in proposers:
     /\ DOMAIN prop_state[p] = {"state", "term", "votes", "donor_epoch", "vcl", "wal", "next_send_lsn"}
     \* in campaign proposer sends RequestVote and waits for acks;
     \* in leader he is elected
     /\ prop_state[p].state \in {"campaign", "leader"}
     \* 0..max_term should be actually Nat in the unbounded model, but TLC won't
     \* swallow it
     /\ prop_state[p].term \in 0..max_term
     \* votes received
     /\ \A voter \in DOMAIN prop_state[p].votes:
        /\ voter \in acceptors
        /\ prop_state[p].votes[voter] \in [epoch: 0..max_term, flush_lsn: 0..max_entries]
     /\ prop_state[p].donor_epoch \in 0..max_term
     \* wal is sequence of just <lsn, epoch of author> records
     /\ \A i \in DOMAIN prop_state[p].wal:
        prop_state[p].wal[i] \in [lsn: 1..max_entries, epoch: 1..max_term]
     \* Following implementation, we skew the original Aurora meaning of this;
     \* here it is lsn of highest definitely committed record as set by proposer
     \* when it is elected; it doesn't change since then
     /\ prop_state[p].vcl \in 0..max_entries
     \* map of acceptor -> next lsn to send
     /\ \A a \in DOMAIN prop_state[p].next_send_lsn:
        /\ a \in acceptors
        /\ prop_state[p].next_send_lsn[a] \in 1..(max_entries + 1)
  /\ \A a \in acceptors:
     /\ DOMAIN acc_state[a] = {"term", "epoch", "wal"}
     /\ acc_state[a].term \in 0..max_term
     /\ acc_state[a].epoch \in 0..max_term
     /\ \A i \in DOMAIN acc_state[a].wal:
        acc_state[a].wal[i] \in [lsn: 1..max_entries, epoch: 1..max_term]
  /\ \A a \in DOMAIN commit_lsns:
     /\ a \in acceptors
     /\ commit_lsns[a] \in 0..max_entries

\********************************************************************************
\* Initial
\********************************************************************************

Init ==
  /\ prop_state = [p \in proposers |-> [
       state |-> "campaign",
       term |-> 1,
       votes |-> EmptyF,
       donor_epoch |-> 0,
       vcl |-> 0,
       wal |-> << >>,
       next_send_lsn |-> EmptyF
     ]]
  /\ acc_state = [a \in acceptors |-> [
       \* there will be no leader in this term, 1 is the first real
       term |-> 0,
       epoch |-> 0,
       wal |-> << >>
     ]]
  /\ commit_lsns = [a \in acceptors |-> 0]


\********************************************************************************
\* Actions
\********************************************************************************

\* Proposer loses all state.
\* For simplicity (and to reduce state space), we assume it immediately gets
\* current state from quorum q of acceptors determining the term he will request
\* to vote for.
RestartProposer(p, q) ==
  /\ Quorum(q)
  /\ LET
       new_term == Maximum({acc_state[a].term : a \in q}) + 1
     IN
       /\ new_term <= max_term
       /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign",
                                           ![p].term = new_term,
                                           ![p].votes = EmptyF,
                                           ![p].donor_epoch = 0,
                                           ![p].vcl = 0,
                                           ![p].wal = << >>,
                                           ![p].next_send_lsn = EmptyF]
  /\ UNCHANGED <<acc_state, commit_lsns>>

\* Acceptor a immediately votes for proposer p.
Vote(p, a) ==
  /\ prop_state[p].state = "campaign"
  /\ acc_state[a].term < prop_state[p].term \* main voting condition
  /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term]
  /\ LET
       vote == [epoch |-> acc_state[a].epoch, flush_lsn |-> FlushLsn(a)]
     IN
       prop_state' = [prop_state EXCEPT ![p].votes = prop_state[p].votes @@ (a :> vote)]
  /\ UNCHANGED <<commit_lsns>>


\* Proposer p gets elected.
BecomeLeader(p) ==
  /\ prop_state[p].state = "campaign"
  /\ Quorum(DOMAIN prop_state[p].votes)
  /\ LET
       max_epoch == Maximum({v.epoch : v \in FValues(prop_state[p].votes)})
       max_epoch_votes == {v \in FValues(prop_state[p].votes) : v.epoch = max_epoch}
       donor == CHOOSE dv \in DOMAIN prop_state[p].votes :
                  /\ prop_state[p].votes[dv].epoch = max_epoch
                  /\ \A v \in max_epoch_votes:
                     prop_state[p].votes[dv].flush_lsn >= v.flush_lsn
       max_vote == prop_state[p].votes[donor]
       \* Establish lsn to stream from for voters.
       \* At some point it seemed like we can regard log as correct and only
       \* append to it if it has entries in the max_epoch; however, TLC showed
       \* that's not the case: we must always stream since the first non-matching record.
       next_send_lsn == [voter \in DOMAIN prop_state[p].votes |-> 1]
     IN
       \* we fetch log from the most advanced node (this is separate
       \* roundtrip), make sure node is still on one term with us
       /\ acc_state[donor].term = prop_state[p].term
       /\ prop_state' = [prop_state EXCEPT ![p].state = "leader",
                                           \* fetch the log from donor
                                           ![p].wal = acc_state[donor].wal,
                                           ![p].donor_epoch = max_epoch,
                                           ![p].vcl = max_vote.flush_lsn,
                                           ![p].next_send_lsn = next_send_lsn]
  /\ UNCHANGED <<acc_state, commit_lsns>>


\* acceptor a learns about elected proposer p's term.
UpdateTerm(p, a) ==
  /\ prop_state[p].state = "leader"
  /\ acc_state[a].term < prop_state[p].term
  /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term]
  /\ UNCHANGED <<prop_state, commit_lsns>>


\* Acceptor a which didn't participate in voting connects to elected proposer p
\* and p sets the streaming point
HandshakeWithLeader(p, a) ==
  /\ prop_state[p].state = "leader"
  /\ acc_state[a].term = prop_state[p].term
  /\ a \notin DOMAIN prop_state[p].next_send_lsn
  /\ LET
       next_send_lsn == prop_state[p].next_send_lsn @@ (a :> 1)
     IN
       prop_state' = [prop_state EXCEPT ![p].next_send_lsn = next_send_lsn]
  /\ UNCHANGED <<acc_state, commit_lsns>>


\* Append new log entry to elected proposer
NewEntry(p) ==
  /\ prop_state[p].state = "leader"
  /\ Len(prop_state[p].wal) < max_entries \* model constraint
  /\ LET
       new_lsn == IF Len(prop_state[p].wal) = 0 THEN
                    prop_state[p].vcl + 1
                  ELSE
                    \* lsn of last record + 1
                    prop_state[p].wal[Len(prop_state[p].wal)].lsn + 1
       new_entry == [lsn |-> new_lsn, epoch |-> prop_state[p].term]
     IN
       /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)]
       /\ UNCHANGED <<acc_state, commit_lsns>>


\* Write entry new_e to log wal, rolling back all higher entries if e is different.
\* If bump_epoch is TRUE, it means we get record with lsn=vcl and going to update
\* the epoch. Truncate log in this case as well, as we might have correct <= vcl
\* part and some outdated entries behind it which we want to purge before
\* declaring us as recovered. Another way to accomplish this (in previous commit)
\* is to wait for first-entry-from-new-epoch before bumping it.
WriteEntry(wal, new_e, bump_epoch) ==
  (new_e.lsn :> new_e) @@
  \* If wal has entry with such lsn and it is different, truncate all higher log.
  IF \/ (new_e.lsn \in DOMAIN wal /\ wal[new_e.lsn] /= new_e)
     \/ bump_epoch THEN
    SelectSeq(wal, LAMBDA e: e.lsn < new_e.lsn)
  ELSE
    wal


\* Try to transfer entry from elected proposer p to acceptor a
TransferEntry(p, a) ==
  /\ prop_state[p].state = "leader"
  /\ prop_state[p].term = acc_state[a].term
  /\ a \in DOMAIN prop_state[p].next_send_lsn
  /\ LET
       next_e == NextEntry(p, a)
     IN
       /\ next_e /= NULL
       /\ LET
            \* Consider bumping epoch if getting this entry recovers the acceptor,
            \* that is, we reach first record behind VCL.
            new_epoch ==
              IF /\ acc_state[a].epoch < prop_state[p].term
                 /\ next_e.lsn >= prop_state[p].vcl
              THEN
                prop_state[p].term
              ELSE
                acc_state[a].epoch
            \* Also check whether this entry allows to advance commit_lsn and
            \* if so, bump it where possible. Modeling this as a separate action
            \* significantly bloats the space (5m vs 15m on max_entries=3 max_term=3),
            \* so act immediately.
            entry_owners == {o \in acceptors:
                              /\ o /= a
                              \* only recovered acceptors advance commit_lsn
                              /\ acc_state[o].epoch = prop_state[p].term
                              /\ next_e \in FValues(acc_state[o].wal)} \cup {a}
          IN
            /\ acc_state' = [acc_state EXCEPT ![a].wal = WriteEntry(acc_state[a].wal, next_e, new_epoch /= acc_state[a].epoch),
                                              ![a].epoch = new_epoch]
            /\ prop_state' = [prop_state EXCEPT ![p].next_send_lsn[a] =
                                                  prop_state[p].next_send_lsn[a] + 1]
            /\ commit_lsns' = IF /\ new_epoch = prop_state[p].term
                                 /\ Quorum(entry_owners)
                              THEN
                                [acc \in acceptors |->
                                  IF /\ acc \in entry_owners
                                     /\ next_e.lsn > commit_lsns[acc]
                                  THEN
                                    next_e.lsn
                                  ELSE
                                    commit_lsns[acc]]
                              ELSE
                                commit_lsns


\*******************************************************************************
\* Final spec
\*******************************************************************************

Next ==
  \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q)
  \/ \E p \in proposers: \E a \in acceptors: Vote(p, a)
  \/ \E p \in proposers: BecomeLeader(p)
  \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a)
  \/ \E p \in proposers: \E a \in acceptors: HandshakeWithLeader(p, a)
  \/ \E p \in proposers: NewEntry(p)
  \/ \E p \in proposers: \E a \in acceptors: TransferEntry(p, a)

Spec == Init /\ [][Next]_<<prop_state, acc_state, commit_lsns>>


\********************************************************************************
\* Invariants
\********************************************************************************

\* we don't track history, but this property is fairly convincing anyway
ElectionSafety ==
  \A p1, p2 \in proposers:
    (/\ prop_state[p1].state = "leader"
     /\ prop_state[p2].state = "leader"
     /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2)

LogIsMonotonic ==
  \A a \in acceptors:
    \A i \in DOMAIN acc_state[a].wal: \A j \in DOMAIN acc_state[a].wal:
      (i > j) => (/\ acc_state[a].wal[i].lsn > acc_state[a].wal[j].lsn
                  /\ acc_state[a].wal[i].epoch >= acc_state[a].wal[j].epoch)

\* Main invariant: log under commit_lsn must match everywhere.
LogSafety ==
  \A a1 \in acceptors: \A a2 \in acceptors:
    LET
      common_len == Min(commit_lsns[a1], commit_lsns[a2])
    IN
      SubSeq(acc_state[a1].wal, 1, common_len) = SubSeq(acc_state[a2].wal, 1, common_len)

\* Next record we are going to push to acceptor must never overwrite a committed
\* different record.
CommittedNotOverwritten ==
  \A p \in proposers: \A a \in acceptors:
    (/\ prop_state[p].state = "leader"
     /\ prop_state[p].term = acc_state[a].term
     /\ a \in DOMAIN prop_state[p].next_send_lsn) =>
    LET
      next_e == NextEntry(p, a)
    IN
      (next_e /= NULL) =>
        ((commit_lsns[a] >= next_e.lsn) => (acc_state[a].wal[next_e.lsn] = next_e))


====
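For readers less used to TLA+, the majority-quorum rule at the heart of the spec above translates directly into ordinary code. A small Rust illustration (not safekeeper code) of `Quorum(acc_set) == Cardinality(acc_set) >= NumAccs \div 2 + 1`:

use std::collections::HashSet;

/// A set of acceptors forms a quorum iff it is a strict majority.
fn is_quorum(acc_set: &HashSet<&str>, num_accs: usize) -> bool {
    acc_set.len() >= num_accs / 2 + 1
}

fn main() {
    let num_accs = 3;
    let owners: HashSet<&str> = ["a1", "a2"].into_iter().collect();
    assert!(is_quorum(&owners, num_accs)); // 2 of 3 is a majority
    let single: HashSet<&str> = ["a1"].into_iter().collect();
    assert!(!is_quorum(&single, num_accs)); // 1 of 3 is not
}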
safekeeper/spec/ProposerAcceptorReconfig.tla (new file, 310 lines)
@@ -0,0 +1,310 @@
---- MODULE ProposerAcceptorReconfig ----

(*
Spec for https://github.com/neondatabase/neon/blob/538e2312a617c65d489d391892c70b2e4d7407b5/docs/rfcs/035-safekeeper-dynamic-membership-change.md
*)

EXTENDS Integers, Sequences, FiniteSets, TLC

VARIABLES
  \* state which is the same in the static spec
  prop_state,
  acc_state,
  committed,
  elected_history,
  \* reconfiguration only state
  prop_conf,  \* prop_conf[p] is current configuration of proposer p
  acc_conf,   \* acc_conf[a] is current configuration of acceptor a
  conf_store  \* configuration in the configuration store.

CONSTANT
  acceptors,
  proposers

CONSTANT NULL

\* Import ProposerAcceptorStatic under PAS.
\*
\* Note that all vars and consts are named the same and thus substituted
\* implicitly.
PAS == INSTANCE ProposerAcceptorStatic

\********************************************************************************
\* Helpers
\********************************************************************************

\********************************************************************************
\* Type assertion
\********************************************************************************

\* Is c a valid config?
IsConfig(c) ==
  /\ DOMAIN c = {"generation", "members", "newMembers"}
  \* Unique id of the configuration.
  /\ c.generation \in Nat
  /\ c.members \in SUBSET acceptors
  \* newMembers is NULL when it is not a joint conf.
  /\ \/ c.newMembers = NULL
     \/ c.newMembers \in SUBSET acceptors

TypeOk ==
  /\ PAS!TypeOk
  /\ \A p \in proposers: IsConfig(prop_conf[p])
  /\ \A a \in acceptors: IsConfig(acc_conf[a])
  /\ IsConfig(conf_store)

\********************************************************************************
\* Initial
\********************************************************************************

Init ==
  /\ PAS!Init
  /\ \E init_members \in SUBSET acceptors:
       LET init_conf == [generation |-> 1, members |-> init_members, newMembers |-> NULL] IN
       \* refer to RestartProposer for why it is not NULL
       /\ prop_conf = [p \in proposers |-> init_conf]
       /\ acc_conf = [a \in acceptors |-> init_conf]
       /\ conf_store = init_conf
       \* We could start with anything, but to reduce state space start with
       \* the most reasonable total acceptors - 1 conf size, which e.g.
       \* makes basic {a1} -> {a2} change in {a1, a2} acceptors and {a1, a2,
       \* a3} -> {a2, a3, a4} in {a1, a2, a3, a4} acceptors models even in
       \* the smallest models with single change.
       /\ Cardinality(init_members) = Cardinality(acceptors) - 1

\********************************************************************************
\* Actions
\********************************************************************************

\* Proposer p loses all state, restarting. In the static spec we bump restarted
\* proposer term to max of some quorum + 1 which is a minimal term which can win
\* election. With reconfigurations it's harder to calculate such a term, so keep
\* it simple and take a random acceptor's one + 1.
\*
\* Also make the proposer adopt the configuration of another random acceptor. In
\* the impl proposer starts with NULL configuration until handshake with first
\* acceptor. Removing this NULL special case makes the spec a bit simpler.
RestartProposer(p) ==
  /\ \E a \in acceptors: PAS!RestartProposerWithTerm(p, acc_state[a].term + 1)
  /\ \E a \in acceptors: prop_conf' = [prop_conf EXCEPT ![p] = acc_conf[a]]
  /\ UNCHANGED <<acc_conf, conf_store>>

\* Acceptor a immediately votes for proposer p.
Vote(p, a) ==
  \* Configuration must be the same.
  /\ prop_conf[p].generation = acc_conf[a].generation
  \* And a is expected to be a member of it. This is likely redundant as long as
  \* becoming leader checks membership (though vote also contributes to max
  \* <term, lsn> calculation).
  /\ \/ a \in prop_conf[p].members
     \/ (prop_conf[p].newMembers /= NULL) /\ (a \in prop_conf[p].newMembers)
  /\ PAS!Vote(p, a)
  /\ UNCHANGED <<prop_conf, acc_conf, conf_store>>

\* Proposer p gets elected.
BecomeLeader(p) ==
  /\ prop_state[p].state = "campaign"
  \* Votes must form quorum in both sets (if the newMembers exists).
  /\ PAS!FormsQuorum(DOMAIN prop_state[p].votes, prop_conf[p].members)
  /\ \/ prop_conf[p].newMembers = NULL
     \* TLA+ disjunction evaluation doesn't short-circuit for a good reason:
     \* https://groups.google.com/g/tlaplus/c/U6tOJ4dsjVM/m/UdOznPCVBwAJ
     \* so repeat the null check.
     \/ (prop_conf[p].newMembers /= NULL) /\ (PAS!FormsQuorum(DOMAIN prop_state[p].votes, prop_conf[p].newMembers))
  \* DoBecomeLeader will copy WAL of the highest voter to proposer's WAL, so
  \* ensure its conf is still the same. In the impl WAL fetching also has to
  \* check the configuration.
  /\ prop_conf[p].generation = acc_conf[PAS!MaxVoteAcc(p)].generation
  /\ \A a \in DOMAIN prop_state[p].votes: prop_conf[p].generation = acc_conf[a].generation
  /\ PAS!DoBecomeLeader(p)
  /\ UNCHANGED <<prop_conf, acc_conf, conf_store>>
||||
UpdateTerm(p, a) ==
|
||||
/\ PAS!UpdateTerm(p, a)
|
||||
/\ UNCHANGED <<prop_conf, acc_conf, conf_store>>
|
||||
|
||||
TruncateWal(p, a) ==
|
||||
/\ prop_state[p].state = "leader"
|
||||
\* Configuration must be the same.
|
||||
/\ prop_conf[p].generation = acc_conf[a].generation
|
||||
/\ PAS!TruncateWal(p, a)
|
||||
/\ UNCHANGED <<prop_conf, acc_conf, conf_store>>
|
||||
|
||||
NewEntry(p) ==
|
||||
/\ PAS!NewEntry(p)
|
||||
/\ UNCHANGED <<prop_conf, acc_conf, conf_store>>
|
||||
|
||||
AppendEntry(p, a) ==
|
||||
/\ prop_state[p].state = "leader"
|
||||
\* Configuration must be the same.
|
||||
/\ prop_conf[p].generation = acc_conf[a].generation
|
||||
\* And a is member of it. Ignoring this likely wouldn't hurt, but not useful
|
||||
\* either.
|
||||
/\ \/ a \in prop_conf[p].members
|
||||
\/ (prop_conf[p].newMembers /= NULL) /\ (a \in prop_conf[p].newMembers)
|
||||
/\ PAS!AppendEntry(p, a)
|
||||
/\ UNCHANGED <<prop_conf, acc_conf, conf_store>>
|
||||
|
||||
\* see PAS!CommitEntries for comments.
|
||||
CommitEntries(p) ==
|
||||
/\ prop_state[p].state = "leader"
|
||||
/\ \E q1 \in PAS!AllMinQuorums(prop_conf[p].members):
|
||||
LET q1_commit_lsn == PAS!QuorumCommitLsn(p, q1) IN
|
||||
\* Configuration must be the same.
|
||||
/\ \A a \in q1: prop_conf[p].generation = acc_conf[a].generation
|
||||
/\ q1_commit_lsn /= NULL
|
||||
\* We must collect acks from both quorums, if newMembers is present.
|
||||
/\ IF prop_conf[p].newMembers = NULL THEN
|
||||
PAS!DoCommitEntries(p, q1_commit_lsn)
|
||||
ELSE
|
||||
\E q2 \in PAS!AllMinQuorums(prop_conf[p].newMembers):
|
||||
LET q2_commit_lsn == PAS!QuorumCommitLsn(p, q2) IN
|
||||
\* Configuration must be the same.
|
||||
/\ \A a \in q1: prop_conf[p].generation = acc_conf[a].generation
|
||||
/\ q2_commit_lsn /= NULL
|
||||
/\ PAS!DoCommitEntries(p, PAS!Min(q1_commit_lsn, q2_commit_lsn))
|
||||
/\ UNCHANGED <<prop_conf, acc_conf, conf_store>>
|
||||
|
||||
\* Proposer p adopts higher conf from acceptor a.
|
||||
ProposerSwitchConf(p, a) ==
|
||||
\* p's conf is lower than a's.
|
||||
/\ (acc_conf[a].generation > prop_conf[p].generation)
|
||||
\* We allow to seamlessly bump conf only when proposer is already elected.
|
||||
\* If it isn't, some of the votes already collected could have been from
|
||||
\* non-members of updated conf. It is easier (both here and in the impl) to
|
||||
\* restart instead of figuring and removing these out.
|
||||
\*
|
||||
\* So if proposer is in 'campaign' in the impl we would restart preserving
|
||||
\* conf and increasing term. In the spec this transition is already covered
|
||||
\* by more a generic RestartProposer, so we don't specify it here.
|
||||
/\ prop_state[p].state = "leader"
|
||||
/\ prop_conf' = [prop_conf EXCEPT ![p] = acc_conf[a]]
|
||||
/\ UNCHANGED <<prop_state, acc_state, committed, elected_history, acc_conf, conf_store>>
|
||||
|
||||
\* Do CAS on the conf store, starting change into the new_members conf.
|
||||
StartChange(new_members) ==
|
||||
\* Possible only if we don't already have the change in progress.
|
||||
/\ conf_store.newMembers = NULL
|
||||
\* Not necessary, but reduces space a bit.
|
||||
/\ new_members /= conf_store.members
|
||||
/\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.members, newMembers |-> new_members]
|
||||
/\ UNCHANGED <<prop_state, acc_state, committed, elected_history, prop_conf, acc_conf>>
|
||||
|
||||
\* Acceptor's last_log_term.
|
||||
AccLastLogTerm(acc) ==
|
||||
PAS!LastLogTerm(PAS!AcceptorTermHistory(acc))
|
||||
|
||||
\* Do CAS on the conf store, transferring joint conf into the newMembers only.
|
||||
FinishChange ==
|
||||
\* have joint conf
|
||||
/\ conf_store.newMembers /= NULL
|
||||
\* The conditions for finishing the change are:
|
||||
/\ \E qo \in PAS!AllMinQuorums(conf_store.members):
|
||||
\* 1) Old majority must be aware of the joint conf.
|
||||
/\ \A a \in qo: conf_store.generation = acc_conf[a].generation
|
||||
\* 2) New member set must have log synced, i.e. some majority needs
|
||||
\* to have <last_log_term, lsn> at least as high as max of some
|
||||
\* old majority.
|
||||
\* 3) Term must be synced, i.e. some majority of the new set must
|
||||
\* have term >= than max term of some old majority.
|
||||
\* This ensures that two leaders are never elected with the same
|
||||
\* term even after config change (which would be bad unless we treat
|
||||
\* generation as a part of term which we don't).
|
||||
\* 4) A majority of the new set must be aware of the joint conf.
|
||||
\* This allows to safely destoy acceptor state if it is not a
|
||||
\* member of its current conf (which is useful for cleanup after
|
||||
\* migration as well as for aborts).
|
||||
/\ LET sync_pos == PAS!MaxTermLsn({[term |-> AccLastLogTerm(a), lsn |-> PAS!FlushLsn(a)]: a \in qo})
|
||||
sync_term == PAS!Maximum({acc_state[a].term: a \in qo})
|
||||
IN
|
||||
\E qn \in PAS!AllMinQuorums(conf_store.newMembers):
|
||||
\A a \in qn:
|
||||
/\ PAS!TermLsnGE([term |-> AccLastLogTerm(a), lsn |-> PAS!FlushLsn(a)], sync_pos)
|
||||
/\ acc_state[a].term >= sync_term
|
||||
/\ conf_store.generation = acc_conf[a].generation
|
||||
/\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.newMembers, newMembers |-> NULL]
|
||||
/\ UNCHANGED <<prop_state, acc_state, committed, elected_history, prop_conf, acc_conf>>
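
\* Illustrative walk-through (not part of the spec): to migrate members
\* {a1, a2, a3} at generation g to {a1, a2, a4}, StartChange({a1, a2, a4})
\* moves the store to generation g+1 with the joint conf; once conditions
\* 1)-4) above hold, FinishChange moves it to generation g+2 with
\* members = {a1, a2, a4} and newMembers = NULL, after which a3 may run
\* AccReset below.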

\* Do CAS on the conf store, aborting the change in progress.
AbortChange ==
    \* have joint conf
    /\ conf_store.newMembers /= NULL
    /\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.members, newMembers |-> NULL]
    /\ UNCHANGED <<prop_state, acc_state, committed, elected_history, prop_conf, acc_conf>>

\* Acceptor a switches to a higher configuration (we model it as taken from the
\* store, but it can come from the proposer/peers as well).
AccSwitchConf(a) ==
    /\ acc_conf[a].generation < conf_store.generation
    /\ acc_conf' = [acc_conf EXCEPT ![a] = conf_store]
    /\ UNCHANGED <<prop_state, acc_state, committed, elected_history, prop_conf, conf_store>>

\* Nuke all acceptor state if it is not a member of its current conf. Models
\* cleanup after migration/abort.
AccReset(a) ==
    /\ \/ (acc_conf[a].newMembers = NULL) /\ (a \notin acc_conf[a].members)
       \/ (acc_conf[a].newMembers /= NULL) /\ (a \notin (acc_conf[a].members \union acc_conf[a].newMembers))
    /\ acc_state' = [acc_state EXCEPT ![a] = PAS!InitAcc]
    \* Set nextSendLsn for `a` to NULL everywhere. nextSendLsn serves as a mark
    \* that the elected proposer performed TruncateWal on the acceptor, which
    \* isn't true anymore after the state reset. In the impl, local deletion is
    \* expected to terminate all existing connections.
    /\ prop_state' = [p \in proposers |-> [prop_state[p] EXCEPT !.nextSendLsn[a] = NULL]]
    /\ UNCHANGED <<committed, elected_history, prop_conf, acc_conf, conf_store>>


\*******************************************************************************
\* Final spec
\*******************************************************************************

Next ==
    \/ \E p \in proposers: RestartProposer(p)
    \/ \E p \in proposers: \E a \in acceptors: Vote(p, a)
    \/ \E p \in proposers: BecomeLeader(p)
    \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a)
    \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a)
    \/ \E p \in proposers: NewEntry(p)
    \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a)
    \/ \E p \in proposers: CommitEntries(p)
    \/ \E new_members \in SUBSET acceptors: StartChange(new_members)
    \/ FinishChange
    \/ AbortChange
    \/ \E p \in proposers: \E a \in acceptors: ProposerSwitchConf(p, a)
    \/ \E a \in acceptors: AccSwitchConf(a)
    \/ \E a \in acceptors: AccReset(a)

Spec == Init /\ [][Next]_<<prop_state, acc_state, committed, elected_history, prop_conf, acc_conf, conf_store>>

\********************************************************************************
\* Invariants
\********************************************************************************

ElectionSafety == PAS!ElectionSafety

ElectionSafetyFull == PAS!ElectionSafetyFull

LogIsMonotonic == PAS!LogIsMonotonic

LogSafety == PAS!LogSafety

\********************************************************************************
\* Invariants which don't need to hold, but are useful for playing/debugging.
\********************************************************************************

CommittedNotTruncated == PAS!CommittedNotTruncated

MaxTerm == PAS!MaxTerm
\* MaxTerm == \A p \in proposers: prop_state[p].term <= 1

MaxStoreConf == conf_store.generation <= 1

\* Check that we ever switch into a non-joint conf.
MaxAccConf == ~ \E a \in acceptors:
    /\ acc_conf[a].generation = 3
    /\ acc_conf[a].newMembers /= NULL

MaxAccWalLen == PAS!MaxAccWalLen

MaxCommitLsn == PAS!MaxCommitLsn

====
517
safekeeper/spec/ProposerAcceptorStatic.tla
Normal file
@@ -0,0 +1,517 @@
---- MODULE ProposerAcceptorStatic ----

(*
The protocol is very similar to Raft. The key differences are:
- Leaders (proposers) are separated from storage nodes (acceptors), which has
  already been an established way to think about Paxos.
- We don't want to stamp each log record with its term, so instead we carry
  around term histories, which are sequences of <term, LSN where term begins>
  pairs. As a bonus (and subtlety) this allows the proposer to commit entries
  from previous terms without writing new records -- if the acceptor's log is
  caught up, updating its term history updates last_log_term as well.
*)
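
\* For example (illustrative only): the term history
\* << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 3] >> says that records at
\* LSNs 1..2 belong to term 0 and records from LSN 3 on belong to term 2; an
\* acceptor with this history and a two-record WAL has
\* <last_log_term, flush_lsn> = <2, 3> without any term 2 record written.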

\* Model simplifications:
\* - Instant message delivery. Notably, the ProposerElected message (TruncateWal action) is not
\*   delayed, so we don't attempt to truncate WAL when the same wp has already appended something
\*   on the acceptor since the common point had been calculated (this should be rejected).
\* - Old WAL is immediately copied to the proposer on its election, without on-demand fetch later.

\* Some ideas for breaking it, to play around and get a feeling:
\* - replace Quorum with BadQuorum.
\* - remove the 'don't commit entries from previous terms separately' rule in
\*   CommitEntries and observe figure 8 from the Raft paper.
\*   With p2a3t4l4, a 32-step error trace was found in 1h on 80 cores.

EXTENDS Integers, Sequences, FiniteSets, TLC

VARIABLES
    prop_state,      \* prop_state[p] is the state of proposer p
    acc_state,       \* acc_state[a] is the state of acceptor a
    committed,       \* bag (set) of ever committed <<term, lsn>> entries
    elected_history  \* counter for elected terms, see TypeOk for details

CONSTANT
    acceptors,
    proposers

CONSTANT NULL

\********************************************************************************
\* Helpers
\********************************************************************************

Maximum(S) ==
    (*************************************************************************)
    (* If S is a set of numbers, then this defines Maximum(S) to be the     *)
    (* maximum of those numbers, or -1 if S is empty.                       *)
    (*************************************************************************)
    IF S = {} THEN -1 ELSE CHOOSE n \in S : \A m \in S : n \geq m

\* Minimum of the numbers in the set; an error if the set is empty.
Minimum(S) == CHOOSE min \in S : \A n \in S : min <= n

\* Min of two numbers.
Min(a, b) == IF a < b THEN a ELSE b

\* Sort of 0 for functions.
EmptyF == [x \in {} |-> 42]
IsEmptyF(f) == DOMAIN f = {}

\* Set of values (image) of the function f. Apparently no such builtin.
Range(f) == {f[x] : x \in DOMAIN f}

\* If key k is in function f, map it using l, otherwise insert v. Returns the
\* updated function.
Upsert(f, k, v, l(_)) ==
    LET new_val == IF k \in DOMAIN f THEN l(f[k]) ELSE v IN
    (k :> new_val) @@ f
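
\* E.g. (illustrative): Upsert(EmptyF, "t", 1, LAMBDA c: c + 1) = ("t" :> 1),
\* and applying the same Upsert to that result yields ("t" :> 2); this is how
\* elected_history counts elections per term below.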

\*****************

\* Does the set of acceptors `acc_set` form a quorum in the member set `members`?
\* Acceptors not from `members` are excluded (matters only for reconfig).
FormsQuorum(acc_set, members) ==
    Cardinality(acc_set \intersect members) >= (Cardinality(members) \div 2 + 1)

\* Like FormsQuorum, but for a minimal quorum.
FormsMinQuorum(acc_set, members) ==
    Cardinality(acc_set \intersect members) = (Cardinality(members) \div 2 + 1)

\* All sets of acceptors forming (minimal) quorums in the member set `members`.
AllQuorums(members) == {subset \in SUBSET members: FormsQuorum(subset, members)}
AllMinQuorums(members) == {subset \in SUBSET acceptors: FormsMinQuorum(subset, members)}
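
\* E.g. (illustrative): with acceptors = members = {a1, a2, a3} a quorum needs
\* >= 2 members, so AllMinQuorums({a1, a2, a3}) is exactly the two-element
\* subsets {a1, a2}, {a1, a3}, {a2, a3}.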

\* For substituting Quorum and seeing what happens.
FormsBadQuorum(acc_set, members) ==
    Cardinality(acc_set \intersect members) >= (Cardinality(members) \div 2)
FormsMinBadQuorum(acc_set, members) ==
    Cardinality(acc_set \intersect members) = (Cardinality(members) \div 2)
AllBadQuorums(members) == {subset \in SUBSET acceptors: FormsBadQuorum(subset, members)}
AllMinBadQuorums(members) == {subset \in SUBSET acceptors: FormsMinBadQuorum(subset, members)}

\* flushLsn (end of WAL, i.e. index of the next entry) of acceptor a.
FlushLsn(a) == Len(acc_state[a].wal) + 1

\* Typedefs. Note that TLA+ Nat includes zero.
Terms == Nat
Lsns == Nat

\********************************************************************************
\* Type assertion
\********************************************************************************
\* Defining sets of all possible tuples and using them in TypeOk in the usual
\* all-tuples constructor is not practical because such definitions force
\* TLC to enumerate them, while they are horribly enormous
\* (TLC screams "Attempted to construct a set with too many elements").
\* So instead check types manually.


\* Term history is a sequence of <term, LSN where term begins> pairs.
IsTermHistory(th) ==
    \A th_entry \in Range(th): th_entry.term \in Terms /\ th_entry.lsn \in Lsns

IsWal(w) ==
    \A i \in DOMAIN w:
        /\ i \in Lsns
        /\ w[i] \in Terms

TypeOk ==
    /\ \A p \in proposers:
        \* '_' in field names hinders pretty printing
        \* https://github.com/tlaplus/tlaplus/issues/1051
        \* so use camel case.
        /\ DOMAIN prop_state[p] = {"state", "term", "votes", "termHistory", "wal", "nextSendLsn"}
        \* In campaign the proposer sends RequestVote and waits for acks;
        \* in leader it is elected.
        /\ prop_state[p].state \in {"campaign", "leader"}
        \* The term for which it campaigns, or the won term in leader state.
        /\ prop_state[p].term \in Terms
        \* votes received
        /\ \A voter \in DOMAIN prop_state[p].votes: voter \in acceptors
        /\ \A vote \in Range(prop_state[p].votes):
            /\ IsTermHistory(vote.termHistory)
            /\ vote.flushLsn \in Lsns
        \* Proposer's term history. Empty while the proposer is in "campaign".
        /\ IsTermHistory(prop_state[p].termHistory)
        \* In the model we identify WAL entries only by <term, LSN> pairs
        \* without an additional unique id, which is enough for its purposes.
        \* It means that with the term history fully modeled, wal becomes
        \* redundant as it can be computed from the term history + WAL length.
        \* However, we still keep it here and at acceptors as an explicit
        \* sequence where the index is the LSN and the value is the term, to
        \* avoid an artificial mapping to figure out the real entries. It
        \* shouldn't bloat the model much because this doesn't increase the
        \* number of distinct states.
        /\ IsWal(prop_state[p].wal)
        \* Map of acceptor -> next lsn to send. It is set when truncate_wal is
        \* done, so sending entries is allowed only after that. In the impl TCP
        \* ensures this ordering. We use NULL instead of a missing value to be
        \* able to use EXCEPT in AccReset.
        /\ \A a \in DOMAIN prop_state[p].nextSendLsn:
            /\ a \in acceptors
            /\ prop_state[p].nextSendLsn[a] \in Lsns \union {NULL}
    /\ \A a \in acceptors:
        /\ DOMAIN acc_state[a] = {"term", "termHistory", "wal"}
        /\ acc_state[a].term \in Terms
        /\ IsTermHistory(acc_state[a].termHistory)
        /\ IsWal(acc_state[a].wal)
    /\ \A c \in committed:
        /\ c.term \in Terms
        /\ c.lsn \in Lsns
    \* elected_history is a retrospective map of term -> number of times it was
    \* elected, for use in the ElectionSafetyFull invariant. For the static spec
    \* it is fairly convincing that it holds, but with membership change it is
    \* less trivial. And as we identify log entries only by <term, lsn>, its
    \* importance is quite high: a violation of log safety might go undetected
    \* if election safety is violated. Note though that this is not always the
    \* case, i.e. you can imagine (and TLC should find) a schedule where the log
    \* safety violation is still detected because two leaders with the same term
    \* commit histories which are different in previous terms, so it is not that
    \* crucial. Plus, if the spec allows an ElectionSafetyFull violation, likely
    \* ElectionSafety will also be violated in some schedules. But neither
    \* should it bloat the model too much.
    /\ \A term \in DOMAIN elected_history:
        /\ term \in Terms
        /\ elected_history[term] \in Nat

\********************************************************************************
\* Initial
\********************************************************************************

InitAcc ==
    [
        \* There will be no leader in the zero term, 1 is the first
        \* real one.
        term |-> 0,
        \* Again, a leader in term 0 doesn't exist, but we initialize
        \* term histories with it to always have a common point in
        \* them. The lsn is 1 because TLA+ sequences are indexed from 1
        \* (we don't want to truncate WAL out of range).
        termHistory |-> << [term |-> 0, lsn |-> 1] >>,
        wal |-> << >>
    ]

Init ==
    /\ prop_state = [p \in proposers |-> [
           state |-> "campaign",
           term |-> 1,
           votes |-> EmptyF,
           termHistory |-> << >>,
           wal |-> << >>,
           nextSendLsn |-> [a \in acceptors |-> NULL]
       ]]
    /\ acc_state = [a \in acceptors |-> InitAcc]
    /\ committed = {}
    /\ elected_history = EmptyF


\********************************************************************************
\* Actions
\********************************************************************************

RestartProposerWithTerm(p, new_term) ==
    /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign",
                                        ![p].term = new_term,
                                        ![p].votes = EmptyF,
                                        ![p].termHistory = << >>,
                                        ![p].wal = << >>,
                                        ![p].nextSendLsn = [a \in acceptors |-> NULL]]
    /\ UNCHANGED <<acc_state, committed, elected_history>>

\* Proposer p loses all state, restarting.
\* For simplicity (and to reduce the state space), we assume it immediately gets
\* the current state from a quorum q of acceptors, determining the term it will
\* request to vote for.
RestartProposer(p) ==
    \E q \in AllQuorums(acceptors):
        LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN
        RestartProposerWithTerm(p, new_term)

\* Term history of acceptor a's WAL: the saved one, truncated to contain only
\* entries <= the local FlushLsn. Note that FlushLsn is the end LSN of the last
\* entry (and the begin LSN of the next). The mental model for the non-strict
\* comparison is that once a proposer is elected it immediately writes a log
\* record of zero length. This allows the leader to commit the existing log
\* without writing any new entries. For example, assume an acceptor has WAL
\*   1.1, 1.2
\* written by a prop with term 1; its current <last_log_term, flush_lsn>
\* is <1, 3>. Now a prop with term 2 and the max vote from this acc is elected.
\* Once TruncateWal is done, <last_log_term, flush_lsn> becomes <2, 3>
\* without any new records explicitly written.
AcceptorTermHistory(a) ==
    SelectSeq(acc_state[a].termHistory, LAMBDA th_entry: th_entry.lsn <= FlushLsn(a))

\* Acceptor a immediately votes for proposer p.
Vote(p, a) ==
    /\ prop_state[p].state = "campaign"
    /\ acc_state[a].term < prop_state[p].term \* main voting condition
    /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term]
    /\ LET
           vote == [termHistory |-> AcceptorTermHistory(a), flushLsn |-> FlushLsn(a)]
       IN
           prop_state' = [prop_state EXCEPT ![p].votes = (a :> vote) @@ prop_state[p].votes]
    /\ UNCHANGED <<committed, elected_history>>


\* Get lastLogTerm from term history th.
LastLogTerm(th) == th[Len(th)].term

\* Compares <term, lsn> pairs: returns TRUE if tl1 >= tl2.
TermLsnGE(tl1, tl2) ==
    /\ tl1.term >= tl2.term
    /\ (tl1.term = tl2.term => tl1.lsn >= tl2.lsn)

\* Choose the max <term, lsn> pair in a non-empty set of them.
MaxTermLsn(term_lsn_set) ==
    CHOOSE max_tl \in term_lsn_set: \A tl \in term_lsn_set: TermLsnGE(max_tl, tl)
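
\* E.g. (illustrative): TermLsnGE([term |-> 2, lsn |-> 3], [term |-> 1, lsn |-> 5])
\* = TRUE, since the term comparison dominates and LSNs only break term ties; so
\* MaxTermLsn({[term |-> 1, lsn |-> 5], [term |-> 2, lsn |-> 3]}) = [term |-> 2, lsn |-> 3].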

\* Find the acceptor with the highest <last_log_term, lsn> vote in proposer p's votes.
MaxVoteAcc(p) ==
    CHOOSE a \in DOMAIN prop_state[p].votes:
        LET a_vote == prop_state[p].votes[a]
            a_vote_term_lsn == [term |-> LastLogTerm(a_vote.termHistory), lsn |-> a_vote.flushLsn]
            vote_term_lsns == {[term |-> LastLogTerm(v.termHistory), lsn |-> v.flushLsn]: v \in Range(prop_state[p].votes)}
        IN
            a_vote_term_lsn = MaxTermLsn(vote_term_lsns)

\* Workhorse for BecomeLeader.
\* Assumes the check that prop_state[p].votes forms a quorum has been done *outside*.
DoBecomeLeader(p) ==
    LET
        \* Find the acceptor with the highest <last_log_term, lsn> vote.
        max_vote_acc == MaxVoteAcc(p)
        max_vote == prop_state[p].votes[max_vote_acc]
        prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn])
    IN
        \* We copy all of the log preceding the proposer's term from the max
        \* vote node, so we make sure it is still on the same term as us. This
        \* is a model simplification which can be removed; in the impl we fetch
        \* WAL on demand later from a safekeeper which has it. Note though that
        \* in the case of on-demand fetch we must check on the donor not only
        \* the term match, but also that truncate_wal had already been done (if
        \* it is not max_vote_acc).
        /\ acc_state[max_vote_acc].term = prop_state[p].term
        /\ prop_state' = [prop_state EXCEPT ![p].state = "leader",
                                            ![p].termHistory = prop_th,
                                            ![p].wal = acc_state[max_vote_acc].wal
                         ]
        /\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1)
        /\ UNCHANGED <<acc_state, committed>>

\* Proposer p gets elected.
BecomeLeader(p) ==
    /\ prop_state[p].state = "campaign"
    /\ FormsQuorum(DOMAIN prop_state[p].votes, acceptors)
    /\ DoBecomeLeader(p)

\* Acceptor a learns about elected proposer p's term. In the impl this matches a
\* VoteRequest/VoteResponse exchange when the leader is already elected and is
\* not interested in the vote result.
UpdateTerm(p, a) ==
    /\ prop_state[p].state = "leader"
    /\ acc_state[a].term < prop_state[p].term
    /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term]
    /\ UNCHANGED <<prop_state, committed, elected_history>>

\* Find the highest common point (LSN of the first divergent record) in the logs
\* of proposer p and acceptor a. Returns the <term, lsn> of the highest common
\* point.
FindHighestCommonPoint(prop_th, acc_th, acc_flush_lsn) ==
    LET
        \* First find the index of the highest common term.
        \* It must exist because we initialize th with <0, 1>.
        last_common_idx == Maximum({i \in 1..Min(Len(prop_th), Len(acc_th)): prop_th[i].term = acc_th[i].term})
        last_common_term == prop_th[last_common_idx].term
        \* Now find where it ends at both the prop and the acc and take the min.
        \* The end of a term is the start of the next one unless it is the last
        \* one; there it is flush_lsn in the case of the acceptor. In the case
        \* of the proposer it is the current writing position, but it can't be
        \* less than flush_lsn, so we take flush_lsn.
        acc_common_term_end == IF last_common_idx = Len(acc_th) THEN acc_flush_lsn ELSE acc_th[last_common_idx + 1].lsn
        prop_common_term_end == IF last_common_idx = Len(prop_th) THEN acc_flush_lsn ELSE prop_th[last_common_idx + 1].lsn
    IN
        [term |-> last_common_term, lsn |-> Min(acc_common_term_end, prop_common_term_end)]
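
\* Worked example (illustrative): with
\*   prop_th = << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 4] >>,
\*   acc_th  = << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 3] >>,
\*   acc_flush_lsn = 5,
\* the highest common term is 0 (index 1); it ends at lsn 3 on the acceptor
\* and lsn 4 on the proposer, so the result is [term |-> 0, lsn |-> 3] and
\* TruncateWal below cuts the acceptor's WAL to LSNs < 3.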

\* The elected proposer p immediately truncates the WAL (and sets the term
\* history) of acceptor a before starting streaming. Establishes nextSendLsn
\* for a.
\*
\* In the impl this happens on each reconnection; here we also allow doing it
\* multiple times.
TruncateWal(p, a) ==
    /\ prop_state[p].state = "leader"
    /\ acc_state[a].term = prop_state[p].term
    /\ LET
           hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a))
           next_send_lsn == (a :> hcp.lsn) @@ prop_state[p].nextSendLsn
       IN
           \* The acceptor persists the full history immediately; reads adjust
           \* it to the really existing wal with AcceptorTermHistory.
           /\ acc_state' = [acc_state EXCEPT ![a].termHistory = prop_state[p].termHistory,
                                             \* note: SubSeq is inclusive, hence -1.
                                             ![a].wal = SubSeq(acc_state[a].wal, 1, hcp.lsn - 1)
                           ]
           /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn = next_send_lsn]
    /\ UNCHANGED <<committed, elected_history>>

\* Append a new log entry on the elected proposer.
NewEntry(p) ==
    /\ prop_state[p].state = "leader"
    /\ LET
           \* An entry consists only of the term; the index serves as the LSN.
           new_entry == prop_state[p].term
       IN
           /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)]
           /\ UNCHANGED <<acc_state, committed, elected_history>>

\* Immediately append the next entry from the elected proposer to acceptor a.
AppendEntry(p, a) ==
    /\ prop_state[p].state = "leader"
    /\ acc_state[a].term = prop_state[p].term
    /\ prop_state[p].nextSendLsn[a] /= NULL \* did TruncateWal
    /\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have something to send
    /\ LET
           send_lsn == prop_state[p].nextSendLsn[a]
           entry == prop_state[p].wal[send_lsn]
           \* Since message delivery is instant we don't check that send_lsn follows
           \* the last acc record; it must always be true.
       IN
           /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn[a] = send_lsn + 1]
           /\ acc_state' = [acc_state EXCEPT ![a].wal = Append(acc_state[a].wal, entry)]
    /\ UNCHANGED <<committed, elected_history>>

\* LSN where the elected proposer p starts writing its records.
PropStartLsn(p) ==
    IF prop_state[p].state = "leader" THEN prop_state[p].termHistory[Len(prop_state[p].termHistory)].lsn ELSE NULL

\* LSN which can be committed by proposer p using min quorum q (the check that q
\* forms a quorum must have been done outside). NULL if there is none.
QuorumCommitLsn(p, q) ==
    IF
        /\ prop_state[p].state = "leader"
        /\ \A a \in q:
            \* Without explicit responses to appends this ensures that the
            \* append up to FlushLsn has been accepted.
            /\ acc_state[a].term = prop_state[p].term
            \* nextSendLsn existence means TruncateWal has happened; it ensures
            \* the acceptor's WAL (and FlushLsn) are from the proper proposer's
            \* history. Alternatively we could compare LastLogTerm here, but
            \* this is closer to what we do in the impl (we check flushLsn in
            \* AppendResponse, but AppendRequest is processed only if the
            \* HandleElected handling was good).
            /\ prop_state[p].nextSendLsn[a] /= NULL
    THEN
        \* Now find the LSN present on all of the quorum.
        LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN
        \* This is the basic Raft rule of not committing entries from previous
        \* terms except along with a current term entry (commit them only when
        \* the quorum recovers, i.e. last_log_term on it reaches the leader's
        \* term).
        IF quorum_lsn >= PropStartLsn(p) THEN
            quorum_lsn
        ELSE
            NULL
    ELSE
        NULL

\* Commit all entries on proposer p with record lsn < commit_lsn.
DoCommitEntries(p, commit_lsn) ==
    /\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(commit_lsn - 1)}
    /\ UNCHANGED <<prop_state, acc_state, elected_history>>

\* Proposer p commits all entries it can using some quorum. Note that unlike
\* will62794/logless-reconfig this allows committing entries from previous terms
\* (when the conditions for that are met).
CommitEntries(p) ==
    /\ prop_state[p].state = "leader"
    \* Using min quorums here is better because 1) QuorumCommitLsn for
    \* simplicity checks the min across all accs in q, and 2) it probably makes
    \* evaluation faster.
    /\ \E q \in AllMinQuorums(acceptors):
        LET commit_lsn == QuorumCommitLsn(p, q) IN
        /\ commit_lsn /= NULL
        /\ DoCommitEntries(p, commit_lsn)

\*******************************************************************************
\* Final spec
\*******************************************************************************

Next ==
    \/ \E p \in proposers: RestartProposer(p)
    \/ \E p \in proposers: \E a \in acceptors: Vote(p, a)
    \/ \E p \in proposers: BecomeLeader(p)
    \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a)
    \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a)
    \/ \E p \in proposers: NewEntry(p)
    \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a)
    \/ \E p \in proposers: CommitEntries(p)

Spec == Init /\ [][Next]_<<prop_state, acc_state, committed, elected_history>>


\********************************************************************************
\* Invariants
\********************************************************************************

\* A lighter version of ElectionSafetyFull which doesn't require elected_history.
ElectionSafety ==
    \A p1, p2 \in proposers:
        (/\ prop_state[p1].state = "leader"
         /\ prop_state[p2].state = "leader"
         /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2)

\* A single term must never be elected more than once.
ElectionSafetyFull == \A term \in DOMAIN elected_history: elected_history[term] <= 1

\* The log is expected to be monotonic by <term, lsn> comparison. This is not
\* true in variants of multi-Paxos, but in Raft (and here) it is.
LogIsMonotonic ==
    \A a \in acceptors:
        \A i, j \in DOMAIN acc_state[a].wal:
            (i > j) => (acc_state[a].wal[i] >= acc_state[a].wal[j])

\* Main invariant: if two entries are committed at the same LSN, they must be
\* the same entry.
LogSafety ==
    \A c1, c2 \in committed: (c1.lsn = c2.lsn) => (c1 = c2)


\********************************************************************************
\* Invariants which don't need to hold, but are useful for playing/debugging.
\********************************************************************************

\* Limits the term of elected proposers.
MaxTerm == \A p \in proposers: (prop_state[p].state = "leader" => prop_state[p].term < 2)

MaxAccWalLen == \A a \in acceptors: Len(acc_state[a].wal) < 2

\* Limits the max number of committed entries. That way we can check that we're
\* actually committing something.
MaxCommitLsn == Cardinality(committed) < 2

\* How many records with different terms can be removed in a single WAL
\* truncation.
MaxTruncatedTerms ==
    \A p \in proposers: \A a \in acceptors:
        (/\ prop_state[p].state = "leader"
         /\ prop_state[p].term = acc_state[a].term) =>
            LET
                hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a))
                truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn}
                truncated_records_terms == {acc_state[a].wal[lsn]: lsn \in truncated_lsns}
            IN
                Cardinality(truncated_records_terms) < 2

\* Check that TruncateWal never deletes a committed record.
\* It might seem that this should be an invariant, but it is not.
\* With 5 nodes it is legit to truncate a record which had been
\* globally committed: e.g. nodes a, b, c can commit a record of term 1 in
\* term 3, and after that the leader of term 2 can delete such a record
\* on d. On 10 cores TLC can find such a trace in ~7 hours.
CommittedNotTruncated ==
    \A p \in proposers: \A a \in acceptors:
        (/\ prop_state[p].state = "leader"
         /\ prop_state[p].term = acc_state[a].term) =>
            LET
                hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a))
                truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn}
                truncated_records == {[term |-> acc_state[a].wal[lsn], lsn |-> lsn]: lsn \in truncated_lsns}
            IN
                \A r \in truncated_records: r \notin committed

====
51
safekeeper/spec/modelcheck.sh
Executable file
@@ -0,0 +1,51 @@
#!/bin/bash

# Usage: ./modelcheck.sh <config_file> <spec_file>, e.g.
# ./modelcheck.sh models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg MCProposerAcceptorStatic.tla
# ./modelcheck.sh models/MCProposerAcceptorReconfig_p2_a3_t3_l3_c3.cfg MCProposerAcceptorReconfig.tla
CONFIG=$1
SPEC=$2

MEM=7G
TOOLSPATH="/opt/TLA+Toolbox/tla2tools.jar"

mkdir -p "tlc-results"
CONFIG_FILE=$(basename -- "$CONFIG")
outfilename="$SPEC-${CONFIG_FILE}-$(date --utc +%Y-%m-%d--%H-%M-%S)".log
outfile="tlc-results/$outfilename"
touch $outfile

# Save some info about the run.
GIT_REV=`git rev-parse --short HEAD`
INFO=`uname -a`

# First for Linux, second for Mac.
CPUNAMELinux=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1')
CPUCORESLinux=`nproc`
CPUNAMEMac=`sysctl -n machdep.cpu.brand_string`
CPUCORESMac=`sysctl -n machdep.cpu.thread_count`

echo "git revision: $GIT_REV" >> $outfile
echo "Platform: $INFO" >> $outfile
echo "CPU Info Linux: $CPUNAMELinux" >> $outfile
echo "CPU Cores Linux: $CPUCORESLinux" >> $outfile
echo "CPU Info Mac: $CPUNAMEMac" >> $outfile
echo "CPU Cores Mac: $CPUCORESMac" >> $outfile
echo "Spec: $SPEC" >> $outfile
echo "Config: $CONFIG" >> $outfile
echo "----" >> $outfile
cat $CONFIG >> $outfile
echo "" >> $outfile
echo "----" >> $outfile
echo "" >> $outfile

# see
# https://lamport.azurewebsites.net/tla/current-tools.pdf
# for TLC options.
# OffHeapDiskFPSet is the optimal fingerprint set implementation
# https://docs.tlapl.us/codebase:architecture#fingerprint_sets_fpsets
#
# Add -simulate to run in infinite simulation mode.
# -coverage 1 is useful for profiling (check how many times actions are taken).
java -Xmx$MEM -XX:MaxDirectMemorySize=$MEM -XX:+UseParallelGC -Dtlc2.tool.fp.FPSet.impl=tlc2.tool.fp.OffHeapDiskFPSet \
    -cp "${TOOLSPATH}" tlc2.TLC $SPEC -config $CONFIG -workers auto -gzip | tee -a $outfile
@@ -0,0 +1,18 @@
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2}
max_term = 2
max_entries = 2
max_generation = 3
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafetyFull
LogIsMonotonic
LogSafety
\* CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias
@@ -0,0 +1,18 @@
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2}
max_term = 2
max_entries = 2
max_generation = 5
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafetyFull
LogIsMonotonic
LogSafety
\* CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias
@@ -0,0 +1,19 @@
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3}
max_term = 2
max_entries = 2
max_generation = 3
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafetyFull
LogIsMonotonic
LogSafety
CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias

@@ -0,0 +1,18 @@
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3, a4}
max_term = 2
max_entries = 2
max_generation = 3
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafetyFull
LogIsMonotonic
LogSafety
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias

@@ -0,0 +1,19 @@
\* A very small model just to play.
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3}
max_term = 2
max_entries = 2
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafetyFull
LogIsMonotonic
LogSafety
CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias

@@ -0,0 +1,19 @@
\* A model next to the smallest one.
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3}
max_term = 3
max_entries = 2
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafetyFull
LogIsMonotonic
LogSafety
CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias

@@ -0,0 +1,17 @@
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3}
max_term = 3
max_entries = 3
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafety
LogIsMonotonic
LogSafety
CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias
@@ -0,0 +1,17 @@
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3}
max_term = 4
max_entries = 4
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafety
LogIsMonotonic
LogSafety
CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias
@@ -0,0 +1,16 @@
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3, a4, a5}
max_term = 2
max_entries = 2
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafety
LogIsMonotonic
LogSafety
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias
@@ -0,0 +1,16 @@
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3, a4, a5}
max_term = 3
max_entries = 3
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafety
LogIsMonotonic
LogSafety
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias
@@ -0,0 +1,16 @@
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3, a4, a5}
max_term = 4
max_entries = 3
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafety
LogIsMonotonic
LogSafety
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias
12
safekeeper/spec/readme.md
Normal file
@@ -0,0 +1,12 @@
Specifications, models, and results of model checking of the compute <->
safekeeper consensus algorithm for committing WAL on the fleet of safekeepers.
Following Paxos parlance, the compute which writes WAL is called the (WAL)
proposer here, and the safekeepers which persist it are called the (WAL)
acceptors.

Directory structure:
- Use modelcheck.sh to run TLC (see the example invocation below).
- MC*.tla contain the bits of TLA+ needed for TLC, like constraining the state space; models/ holds the actual models.
- Other .tla files are the actual specs.

The structure is partially borrowed from
[logless-reconfig](https://github.com/will62794/logless-reconfig), thanks to it.
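
For example, to check the smallest static model (an illustrative invocation;
this config is one of those whose run logs appear below):

```sh
./modelcheck.sh models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg MCProposerAcceptorStatic.tla
```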
@@ -0,0 +1,63 @@
git revision: 864f4667d
Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux
CPU Info Linux: Neoverse-N1
CPU Cores Linux: 80
CPU Info Mac:
CPU Cores Mac:
Spec: MCProposerAcceptorStatic.tla
Config: models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg
----
\* A very small model just to play.
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3}
max_term = 2
max_entries = 2
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafety
LogIsMonotonic
LogSafety
CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias


----

TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71)
Running breadth-first search Model-Checking with fp 110 and seed 3949669318051689745 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 46037] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue).
Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla
Parsing file /tmp/tlc-11123278435718411444/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla)
Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla
Parsing file /tmp/tlc-11123278435718411444/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla)
Parsing file /tmp/tlc-11123278435718411444/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla)
Parsing file /tmp/tlc-11123278435718411444/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla)
Parsing file /tmp/tlc-11123278435718411444/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla)
Parsing file /tmp/tlc-11123278435718411444/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla)
Parsing file /tmp/tlc-11123278435718411444/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla)
Semantic processing of module Naturals
Semantic processing of module Sequences
Semantic processing of module FiniteSets
Semantic processing of module TLC
Semantic processing of module Integers
Semantic processing of module ProposerAcceptorStatic
Semantic processing of module TLCExt
Semantic processing of module _TLCTrace
Semantic processing of module MCProposerAcceptorStatic
Starting... (2024-11-06 13:44:18)
Computing initial states...
Finished computing initial states: 1 distinct state generated at 2024-11-06 13:44:20.
Model checking completed. No error has been found.
Estimates of the probability that TLC did not check all reachable states
because two distinct states had the same fingerprint:
calculated (optimistic): val = 2.9E-9
based on the actual fingerprints: val = 4.1E-10
922134 states generated, 61249 distinct states found, 0 states left on queue.
The depth of the complete state graph search is 31.
The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 6 and the 95th percentile is 3).
Finished in 11s at (2024-11-06 13:44:28)
@@ -0,0 +1,69 @@
git revision: bcbff084a
Platform: Linux nonlibrem 6.10.11-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.10.11-1 (2024-09-22) x86_64 GNU/Linux
CPU Info Linux: 13th Gen Intel(R) Core(TM) i7-1355U
CPU Cores Linux: 10
CPU Info Mac:
CPU Cores Mac:
Spec: MCProposerAcceptorStatic.tla
Config: models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg
----
\* A model next to the smallest one.
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3}
max_term = 3
max_entries = 2
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafety
LogIsMonotonic
LogSafety
CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias


----

TLC2 Version 2.20 of Day Month 20?? (rev: cc65eef)
Running breadth-first search Model-Checking with fp 41 and seed -3061068726727581619 with 10 workers on 10 cores with 6372MB heap and 7168MB offheap memory [pid: 1250346] (Linux 6.10.11-amd64 amd64, Debian 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue).
Parsing file /home/ars/neon/neon/safekeeper/spec/MCProposerAcceptorStatic.tla
Parsing file /tmp/tlc-3023124431504466774/TLC.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLC.tla)
Parsing file /home/ars/neon/neon/safekeeper/spec/ProposerAcceptorStatic.tla
Parsing file /tmp/tlc-3023124431504466774/_TLCTrace.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla)
Parsing file /tmp/tlc-3023124431504466774/Integers.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Integers.tla)
Parsing file /tmp/tlc-3023124431504466774/Sequences.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla)
Parsing file /tmp/tlc-3023124431504466774/FiniteSets.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla)
Parsing file /tmp/tlc-3023124431504466774/Naturals.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla)
Parsing file /tmp/tlc-3023124431504466774/TLCExt.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla)
Semantic processing of module Naturals
Semantic processing of module Sequences
Semantic processing of module FiniteSets
Semantic processing of module TLC
Semantic processing of module Integers
Semantic processing of module ProposerAcceptorStatic
Semantic processing of module TLCExt
Semantic processing of module _TLCTrace
Semantic processing of module MCProposerAcceptorStatic
Starting... (2024-11-15 12:09:59)
Computing initial states...
Finished computing initial states: 1 distinct state generated at 2024-11-15 12:10:00.
Progress(19) at 2024-11-15 12:10:03: 464,696 states generated (464,696 s/min), 57,859 distinct states found (57,859 ds/min), 21,435 states left on queue.
Progress(26) at 2024-11-15 12:11:03: 8,813,399 states generated (8,348,703 s/min), 877,254 distinct states found (819,395 ds/min), 214,794 states left on queue.
Progress(27) at 2024-11-15 12:12:03: 16,121,858 states generated (7,308,459 s/min), 1,464,707 distinct states found (587,453 ds/min), 274,230 states left on queue.
Progress(29) at 2024-11-15 12:13:03: 23,073,903 states generated (6,952,045 s/min), 1,948,802 distinct states found (484,095 ds/min), 263,697 states left on queue.
Progress(31) at 2024-11-15 12:14:03: 29,740,681 states generated (6,666,778 s/min), 2,331,052 distinct states found (382,250 ds/min), 185,484 states left on queue.
Progress(34) at 2024-11-15 12:15:03: 36,085,876 states generated (6,345,195 s/min), 2,602,370 distinct states found (271,318 ds/min), 31,659 states left on queue.
Model checking completed. No error has been found.
Estimates of the probability that TLC did not check all reachable states
because two distinct states had the same fingerprint:
calculated (optimistic): val = 4.9E-6
based on the actual fingerprints: val = 6.9E-7
36896322 states generated, 2623542 distinct states found, 0 states left on queue.
The depth of the complete state graph search is 39.
The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 7 and the 95th percentile is 3).
Finished in 05min 14s at (2024-11-15 12:15:13)
@@ -0,0 +1,72 @@
git revision: 864f4667d
Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux
CPU Info Linux: Neoverse-N1
CPU Cores Linux: 80
CPU Info Mac:
CPU Cores Mac:
Spec: MCProposerAcceptorStatic.tla
Config: models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg
----
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3}
max_term = 3
max_entries = 3
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafety
LogIsMonotonic
LogSafety
CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias

----

TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71)
Running breadth-first search Model-Checking with fp 126 and seed 2302892334567572769 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 39701] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue).
Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla
Parsing file /tmp/tlc-15178810317173795942/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla)
Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla
Parsing file /tmp/tlc-15178810317173795942/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla)
Parsing file /tmp/tlc-15178810317173795942/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla)
Parsing file /tmp/tlc-15178810317173795942/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla)
Parsing file /tmp/tlc-15178810317173795942/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla)
Parsing file /tmp/tlc-15178810317173795942/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla)
Parsing file /tmp/tlc-15178810317173795942/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla)
Semantic processing of module Naturals
Semantic processing of module Sequences
Semantic processing of module FiniteSets
Semantic processing of module TLC
Semantic processing of module Integers
Semantic processing of module ProposerAcceptorStatic
Semantic processing of module TLCExt
Semantic processing of module _TLCTrace
Semantic processing of module MCProposerAcceptorStatic
Starting... (2024-11-06 13:03:52)
Computing initial states...
Finished computing initial states: 1 distinct state generated at 2024-11-06 13:03:55.
Progress(21) at 2024-11-06 13:03:58: 846,240 states generated (846,240 s/min), 106,298 distinct states found (106,298 ds/min), 41,028 states left on queue.
Progress(28) at 2024-11-06 13:04:58: 27,538,211 states generated (26,691,971 s/min), 2,768,793 distinct states found (2,662,495 ds/min), 782,984 states left on queue.
Progress(30) at 2024-11-06 13:05:58: 54,048,763 states generated (26,510,552 s/min), 5,076,745 distinct states found (2,307,952 ds/min), 1,241,301 states left on queue.
Progress(31) at 2024-11-06 13:06:58: 80,554,724 states generated (26,505,961 s/min), 7,199,201 distinct states found (2,122,456 ds/min), 1,541,574 states left on queue.
Progress(32) at 2024-11-06 13:07:58: 106,991,261 states generated (26,436,537 s/min), 9,121,549 distinct states found (1,922,348 ds/min), 1,686,289 states left on queue.
Progress(33) at 2024-11-06 13:08:58: 133,354,665 states generated (26,363,404 s/min), 10,935,451 distinct states found (1,813,902 ds/min), 1,739,977 states left on queue.
Progress(34) at 2024-11-06 13:09:58: 159,631,385 states generated (26,276,720 s/min), 12,605,372 distinct states found (1,669,921 ds/min), 1,677,447 states left on queue.
Progress(35) at 2024-11-06 13:10:58: 185,862,196 states generated (26,230,811 s/min), 14,138,409 distinct states found (1,533,037 ds/min), 1,501,760 states left on queue.
Progress(36) at 2024-11-06 13:11:58: 212,021,688 states generated (26,159,492 s/min), 15,538,990 distinct states found (1,400,581 ds/min), 1,216,621 states left on queue.
Progress(37) at 2024-11-06 13:12:58: 238,046,160 states generated (26,024,472 s/min), 16,778,583 distinct states found (1,239,593 ds/min), 797,230 states left on queue.
Progress(39) at 2024-11-06 13:13:58: 263,931,163 states generated (25,885,003 s/min), 17,820,786 distinct states found (1,042,203 ds/min), 209,400 states left on queue.
Model checking completed. No error has been found.
Estimates of the probability that TLC did not check all reachable states
because two distinct states had the same fingerprint:
calculated (optimistic): val = 2.5E-4
based on the actual fingerprints: val = 7.9E-5
270257170 states generated, 18005639 distinct states found, 0 states left on queue.
The depth of the complete state graph search is 47.
The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 7 and the 95th percentile is 3).
Finished in 10min 25s at (2024-11-06 13:14:17)
File diff suppressed because it is too large
@@ -0,0 +1,89 @@
git revision: 864f4667d
Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux
CPU Info Linux: Neoverse-N1
CPU Cores Linux: 80
CPU Info Mac:
CPU Cores Mac:
Spec: MCProposerAcceptorStatic.tla
Config: models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg
----
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2, a3, a4, a5}
max_term = 2
max_entries = 2
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafety
LogIsMonotonic
LogSafety
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias

----

TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71)
Running breadth-first search Model-Checking with fp 90 and seed 2164066158568118414 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 30788] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue).
Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla
Parsing file /tmp/tlc-13824636513165485309/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla)
Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla
Parsing file /tmp/tlc-13824636513165485309/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla)
Parsing file /tmp/tlc-13824636513165485309/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla)
Parsing file /tmp/tlc-13824636513165485309/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla)
Parsing file /tmp/tlc-13824636513165485309/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla)
Parsing file /tmp/tlc-13824636513165485309/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla)
Parsing file /tmp/tlc-13824636513165485309/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla)
Semantic processing of module Naturals
Semantic processing of module Sequences
Semantic processing of module FiniteSets
Semantic processing of module TLC
Semantic processing of module Integers
Semantic processing of module ProposerAcceptorStatic
Semantic processing of module TLCExt
Semantic processing of module _TLCTrace
Semantic processing of module MCProposerAcceptorStatic
Starting... (2024-11-06 12:09:33)
Computing initial states...
Finished computing initial states: 1 distinct state generated at 2024-11-06 12:09:36.
Progress(16) at 2024-11-06 12:09:39: 405,675 states generated (405,675 s/min), 18,042 distinct states found (18,042 ds/min), 7,612 states left on queue.
Progress(23) at 2024-11-06 12:10:39: 12,449,257 states generated (12,043,582 s/min), 467,293 distinct states found (449,251 ds/min), 161,057 states left on queue.
Progress(25) at 2024-11-06 12:11:39: 24,461,332 states generated (12,012,075 s/min), 861,011 distinct states found (393,718 ds/min), 267,072 states left on queue.
Progress(26) at 2024-11-06 12:12:39: 36,440,377 states generated (11,979,045 s/min), 1,234,052 distinct states found (373,041 ds/min), 355,372 states left on queue.
Progress(26) at 2024-11-06 12:13:39: 48,327,873 states generated (11,887,496 s/min), 1,583,736 distinct states found (349,684 ds/min), 425,209 states left on queue.
Progress(27) at 2024-11-06 12:14:39: 60,246,136 states generated (11,918,263 s/min), 1,933,499 distinct states found (349,763 ds/min), 494,269 states left on queue.
Progress(28) at 2024-11-06 12:15:39: 71,977,716 states generated (11,731,580 s/min), 2,265,302 distinct states found (331,803 ds/min), 553,777 states left on queue.
Progress(28) at 2024-11-06 12:16:39: 83,644,537 states generated (11,666,821 s/min), 2,575,451 distinct states found (310,149 ds/min), 594,142 states left on queue.
Progress(29) at 2024-11-06 12:17:39: 95,287,089 states generated (11,642,552 s/min), 2,888,793 distinct states found (313,342 ds/min), 639,273 states left on queue.
Progress(29) at 2024-11-06 12:18:39: 107,000,972 states generated (11,713,883 s/min), 3,194,255 distinct states found (305,462 ds/min), 673,353 states left on queue.
Progress(29) at 2024-11-06 12:19:39: 118,305,248 states generated (11,304,276 s/min), 3,467,775 distinct states found (273,520 ds/min), 692,915 states left on queue.
Progress(29) at 2024-11-06 12:20:39: 129,954,327 states generated (11,649,079 s/min), 3,763,186 distinct states found (295,411 ds/min), 720,349 states left on queue.
Progress(29) at 2024-11-06 12:21:39: 141,251,359 states generated (11,297,032 s/min), 4,020,407 distinct states found (257,221 ds/min), 724,036 states left on queue.
Progress(30) at 2024-11-06 12:22:39: 152,551,873 states generated (11,300,514 s/min), 4,284,278 distinct states found (263,871 ds/min), 733,726 states left on queue.
Progress(30) at 2024-11-06 12:23:39: 164,324,788 states generated (11,772,915 s/min), 4,569,569 distinct states found (285,291 ds/min), 746,476 states left on queue.
Progress(30) at 2024-11-06 12:24:39: 175,121,317 states generated (10,796,529 s/min), 4,779,505 distinct states found (209,936 ds/min), 723,070 states left on queue.
Progress(31) at 2024-11-06 12:25:39: 186,238,236 states generated (11,116,919 s/min), 5,016,034 distinct states found (236,529 ds/min), 712,944 states left on queue.
Progress(31) at 2024-11-06 12:26:39: 197,884,578 states generated (11,646,342 s/min), 5,276,094 distinct states found (260,060 ds/min), 705,471 states left on queue.
Progress(31) at 2024-11-06 12:27:39: 208,535,096 states generated (10,650,518 s/min), 5,463,450 distinct states found (187,356 ds/min), 665,661 states left on queue.
Progress(32) at 2024-11-06 12:28:39: 219,424,829 states generated (10,889,733 s/min), 5,673,673 distinct states found (210,223 ds/min), 637,975 states left on queue.
Progress(32) at 2024-11-06 12:29:39: 230,906,372 states generated (11,481,543 s/min), 5,903,516 distinct states found (229,843 ds/min), 606,255 states left on queue.
Progress(33) at 2024-11-06 12:30:39: 241,261,887 states generated (10,355,515 s/min), 6,065,731 distinct states found (162,215 ds/min), 552,728 states left on queue.
Progress(33) at 2024-11-06 12:31:39: 252,028,921 states generated (10,767,034 s/min), 6,255,487 distinct states found (189,756 ds/min), 509,620 states left on queue.
Progress(33) at 2024-11-06 12:32:39: 262,856,171 states generated (10,827,250 s/min), 6,431,063 distinct states found (175,576 ds/min), 448,834 states left on queue.
Progress(34) at 2024-11-06 12:33:39: 273,211,882 states generated (10,355,711 s/min), 6,586,644 distinct states found (155,581 ds/min), 386,905 states left on queue.
|
||||
Progress(34) at 2024-11-06 12:34:39: 283,843,415 states generated (10,631,533 s/min), 6,743,916 distinct states found (157,272 ds/min), 315,135 states left on queue.
|
||||
Progress(35) at 2024-11-06 12:35:39: 293,931,115 states generated (10,087,700 s/min), 6,878,405 distinct states found (134,489 ds/min), 241,126 states left on queue.
|
||||
Progress(36) at 2024-11-06 12:36:39: 303,903,441 states generated (9,972,326 s/min), 6,996,394 distinct states found (117,989 ds/min), 152,775 states left on queue.
|
||||
Progress(37) at 2024-11-06 12:37:39: 313,501,886 states generated (9,598,445 s/min), 7,093,031 distinct states found (96,637 ds/min), 54,009 states left on queue.
|
||||
Model checking completed. No error has been found.
|
||||
Estimates of the probability that TLC did not check all reachable states
|
||||
because two distinct states had the same fingerprint:
|
||||
calculated (optimistic): val = 1.2E-4
|
||||
based on the actual fingerprints: val = 2.1E-6
|
||||
318172398 states generated, 7127950 distinct states found, 0 states left on queue.
|
||||
The depth of the complete state graph search is 44.
|
||||
The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 9 and the 95th percentile is 3).
|
||||
Finished in 28min 43s at (2024-11-06 12:38:16)
|
||||
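For reference, a run like the one logged above can typically be reproduced with the stock TLC command-line tools from the safekeeper/spec directory; a minimal sketch (the jar path and worker count are assumptions, and the large heap/off-heap JVM flags from the log are omitted):

# Hypothetical re-run of the model check logged above; assumes
# tla2tools.jar is available and the working directory is safekeeper/spec.
import subprocess

subprocess.run(
    [
        "java", "-cp", "tla2tools.jar", "tlc2.TLC",
        "-workers", "auto",  # the logged run used 80 workers
        "-config", "models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg",
        "MCProposerAcceptorStatic.tla",
    ],
    check=True,
)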
@@ -562,6 +562,9 @@ impl WalAcceptor {
// Don't flush the WAL on every append, only periodically via flush_ticker.
// This batches multiple appends per fsync. If the channel is empty after
// sending the reply, we'll schedule an immediate flush.
//
// Note that a flush can still happen on segment bounds, which will result
// in an AppendResponse.
if let ProposerAcceptorMessage::AppendRequest(append_request) = msg {
msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
dirty = true;
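As a rough illustration of the batching pattern the comment above describes (a Python model with assumed names, not the safekeeper code):

import queue
import time

def run_wal_acceptor(inbox: "queue.Queue[bytes]", flush_interval: float = 0.1) -> None:
    """Model: append without fsync, and fsync only when the inbox drains
    or the periodic flush ticker fires."""
    def append_no_flush(rec: bytes) -> None:  # stand-in for NoFlushAppendRequest handling
        pass

    def fsync_wal() -> None:  # stand-in for the actual WAL flush
        pass

    dirty = False
    deadline = time.monotonic() + flush_interval
    while True:
        try:
            msg = inbox.get(timeout=max(0.0, deadline - time.monotonic()))
        except queue.Empty:
            msg = None  # flush ticker fired
        if msg is not None:
            append_no_flush(msg)
            dirty = True
        if dirty and (msg is None or inbox.empty()):
            fsync_wal()  # one fsync covers all appends since the last one
            dirty = False
            deadline = time.monotonic() + flush_interval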
@@ -947,6 +947,7 @@ where
// while first connection still gets some packets later. It might be
// better to not log this as error! above.
let write_lsn = self.wal_store.write_lsn();
let flush_lsn = self.wal_store.flush_lsn();
if write_lsn > msg.h.begin_lsn {
bail!(
"append request rewrites WAL written before, write_lsn={}, msg lsn={}",

@@ -1004,7 +1005,9 @@ where
);

// If flush_lsn hasn't updated, AppendResponse is not very useful.
if !require_flush {
// This is the common case for !require_flush, but a flush can still
// happen on segment bounds.
if !require_flush && flush_lsn == self.flush_lsn() {
return Ok(None);
}

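The net effect of the second hunk: an AppendResponse is now skipped only when it was not explicitly requested and the durable position did not move during the call. A sketch of the condition (names mirror the diff; this is a model, not the actual handler):

def should_reply(require_flush: bool, flush_lsn_before: int, flush_lsn_now: int) -> bool:
    # Reply if a flush was requested, or if a segment-boundary flush
    # advanced the durable position anyway.
    return require_flush or flush_lsn_now != flush_lsn_before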
@@ -113,6 +113,13 @@ pub struct PhysicalStorage {
/// non-aligned chunks of data.
write_record_lsn: Lsn,

/// The last LSN flushed to disk. May be in the middle of a record.
///
/// NB: when the rest of the system refers to `flush_lsn`, it usually
/// actually refers to `flush_record_lsn`. This ambiguity can be dangerous
/// and should be resolved.
flush_lsn: Lsn,

/// The LSN of the last WAL record flushed to disk.
flush_record_lsn: Lsn,

@@ -127,23 +134,29 @@ pub struct PhysicalStorage {
/// - doesn't point to the end of the segment
file: Option<File>,

/// When false, we have just initialized storage using the LSN from find_end_of_wal().
/// In this case, [`write_lsn`] can be less than actually written WAL on disk. In particular,
/// there can be a case with unexpected .partial file.
/// When true, WAL truncation potentially has been interrupted and we need
/// to finish it before allowing WAL writes; see truncate_wal for details.
/// In this case [`write_lsn`] can be less than actually written WAL on
/// disk. In particular, there can be a case with unexpected .partial file.
///
/// Imagine the following:
/// - 000000010000000000000001
/// - it was fully written, but the last record is split between 2 segments
/// - after restart, `find_end_of_wal()` returned 0/1FFFFF0, which is in the end of this segment
/// - `write_lsn`, `write_record_lsn` and `flush_record_lsn` were initialized to 0/1FFFFF0
/// - it was fully written, but the last record is split between 2
/// segments
/// - after restart, `find_end_of_wal()` returned 0/1FFFFF0, which is in
/// the end of this segment
/// - `write_lsn`, `write_record_lsn` and `flush_record_lsn` were
/// initialized to 0/1FFFFF0
/// - 000000010000000000000002.partial
/// - it has only 1 byte written, which is not enough to make a full WAL record
/// - it has only 1 byte written, which is not enough to make a full WAL
/// record
///
/// Partial segment 002 has no WAL records, and it will be removed by the next truncate_wal().
/// This flag will be set to true after the first truncate_wal() call.
/// Partial segment 002 has no WAL records, and it will be removed by the
/// next truncate_wal(). This flag will be set to true after the first
/// truncate_wal() call.
///
/// [`write_lsn`]: Self::write_lsn
is_truncated_after_restart: bool,
pending_wal_truncation: bool,
}
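The four cursors documented above obey a simple ordering. A small Python model of the invariants implied by these doc comments (illustrative only, not Neon code):

from dataclasses import dataclass

@dataclass
class LsnCursors:
    write_lsn: int = 0         # last byte written to the segment files
    write_record_lsn: int = 0  # end of the last complete record written
    flush_lsn: int = 0         # last byte known durable (may split a record)
    flush_record_lsn: int = 0  # end of the last complete record known durable

    def check(self) -> None:
        # Each record-aligned cursor trails its byte-aligned counterpart,
        # and flushed positions trail written positions.
        assert self.write_record_lsn <= self.write_lsn
        assert self.flush_lsn <= self.write_lsn
        assert self.flush_record_lsn <= self.flush_lsn
        assert self.flush_record_lsn <= self.write_record_lsn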

impl PhysicalStorage {

@@ -205,10 +218,11 @@ impl PhysicalStorage {
system_id: state.server.system_id,
write_lsn,
write_record_lsn: write_lsn,
flush_lsn,
flush_record_lsn: flush_lsn,
decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000),
file: None,
is_truncated_after_restart: false,
pending_wal_truncation: true,
})
}

@@ -289,8 +303,9 @@
}
}

/// Write WAL bytes, which are known to be located in a single WAL segment.
async fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> {
/// Write WAL bytes, which are known to be located in a single WAL segment. Returns true if the
/// segment was completed, closed, and flushed to disk.
async fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<bool> {
let mut file = if let Some(file) = self.file.take() {
file
} else {
@@ -314,20 +329,24 @@ impl PhysicalStorage {
let (wal_file_path, wal_file_partial_path) =
wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size);
fs::rename(wal_file_partial_path, wal_file_path).await?;
Ok(true)
} else {
// otherwise, file can be reused later
self.file = Some(file);
Ok(false)
}

Ok(())
}

/// Writes WAL to the segment files, until everything is written. If some segments
/// are fully written, they are flushed to disk. The last (partial) segment can
/// be flushed separately later.
///
/// Updates `write_lsn`.
/// Updates `write_lsn` and `flush_lsn`.
async fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> {
// TODO: this shouldn't be possible, except possibly with write_lsn == 0.
// Rename this method to `append_exact`, and make it append-only, removing
// the `pos` parameter and this check. For this reason, we don't update
// `flush_lsn` here.
if self.write_lsn != pos {
// need to flush the file before discarding it
if let Some(file) = self.file.take() {
@@ -349,9 +368,13 @@ impl PhysicalStorage {
buf.len()
};

self.write_in_segment(segno, xlogoff, &buf[..bytes_write])
let flushed = self
.write_in_segment(segno, xlogoff, &buf[..bytes_write])
.await?;
self.write_lsn += bytes_write as u64;
if flushed {
self.flush_lsn = self.write_lsn;
}
buf = &buf[bytes_write..];
}

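Putting the two hunks together, the write loop now advances flush_lsn whenever a chunk happens to close a segment. A condensed Python model of that loop (segment size and state layout are stand-ins, not Neon code):

SEG_SIZE = 16 * 1024 * 1024  # assumed WAL segment size, for illustration

def write_exact(state: dict, buf: bytes) -> None:
    """Split `buf` on segment bounds; a chunk that ends exactly on a bound
    closes (and therefore fsyncs) its segment, advancing flush_lsn."""
    while buf:
        xlogoff = state["write_lsn"] % SEG_SIZE
        n = min(len(buf), SEG_SIZE - xlogoff)
        flushed = (xlogoff + n) == SEG_SIZE  # this chunk completes the segment
        state["write_lsn"] += n
        if flushed:
            state["flush_lsn"] = state["write_lsn"]
        buf = buf[n:]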
@@ -365,6 +388,9 @@ impl Storage for PhysicalStorage {
self.write_lsn
}
/// flush_lsn returns LSN of last durably stored WAL record.
///
/// TODO: flush_lsn() returns flush_record_lsn, but write_lsn() returns write_lsn: confusing.
#[allow(clippy::misnamed_getters)]
fn flush_lsn(&self) -> Lsn {
self.flush_record_lsn
}
@@ -405,14 +431,22 @@ impl Storage for PhysicalStorage {
startpos
);
}
if self.pending_wal_truncation {
bail!(
"write_wal called with pending WAL truncation, write_lsn={}, startpos={}",
self.write_lsn,
startpos
);
}

let write_seconds = time_io_closure(self.write_exact(startpos, buf)).await?;
// WAL is written, updating write metrics
self.metrics.observe_write_seconds(write_seconds);
self.metrics.observe_write_bytes(buf.len());

// figure out last record's end lsn for reporting (if we got the
// whole record)
// Figure out the last record's end LSN and update `write_record_lsn`
// (if we got a whole record). The write may also have closed and
// flushed a segment, so update `flush_record_lsn` as well.
if self.decoder.available() != startpos {
info!(
"restart decoder from {} to {}",
@@ -423,12 +457,15 @@ impl Storage for PhysicalStorage {
self.decoder = WalStreamDecoder::new(startpos, pg_version);
}
self.decoder.feed_bytes(buf);
loop {
match self.decoder.poll_decode()? {
None => break, // no full record yet
Some((lsn, _rec)) => {
self.write_record_lsn = lsn;
}

if self.write_record_lsn <= self.flush_lsn {
// We may have flushed a previously written record.
self.flush_record_lsn = self.write_record_lsn;
}
while let Some((lsn, _rec)) = self.decoder.poll_decode()? {
self.write_record_lsn = lsn;
if lsn <= self.flush_lsn {
self.flush_record_lsn = lsn;
}
}

@@ -445,19 +482,17 @@ impl Storage for PhysicalStorage {
self.fdatasync_file(&unflushed_file).await?;
self.file = Some(unflushed_file);
} else {
// We have unflushed data (write_lsn != flush_lsn), but no file.
// This should only happen if last file was fully written and flushed,
// but haven't updated flush_lsn yet.
if self.write_lsn.segment_offset(self.wal_seg_size) != 0 {
bail!(
"unexpected unflushed data with no open file, write_lsn={}, flush_lsn={}",
self.write_lsn,
self.flush_record_lsn
);
}
// We have unflushed data (write_lsn != flush_lsn), but no file. This
// shouldn't happen, since the segment is flushed on close.
bail!(
"unexpected unflushed data with no open file, write_lsn={}, flush_lsn={}",
self.write_lsn,
self.flush_record_lsn
);
}

// everything is flushed now, let's update flush_lsn
self.flush_lsn = self.write_lsn;
self.flush_record_lsn = self.write_record_lsn;
Ok(())
}
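With segments now fsynced as they are closed, the "no open file" branch can no longer hold unflushed data, so the old segment-offset special case collapses into a plain bail. In model form (illustrative only, not Neon code):

def flush_wal(state: dict) -> None:
    """Model of the flush path after this change."""
    if state.get("file") is not None:
        state["file"].flush()  # stand-in for fdatasync_file()
    elif state["write_lsn"] != state["flush_lsn"]:
        # A completed segment is fsynced when it is closed, so unflushed
        # bytes with no open file can only mean a bookkeeping bug.
        raise RuntimeError("unexpected unflushed data with no open file")
    state["flush_lsn"] = state["write_lsn"]
    state["flush_record_lsn"] = state["write_record_lsn"]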
@@ -479,15 +514,35 @@ impl Storage for PhysicalStorage {
);
}

// Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on
// disk (this happens on each connect).
if self.is_truncated_after_restart
// Quick exit if nothing to do and we know that the state is clean to
// avoid writing up to 16 MiB of zeros on disk (this happens on each
// connect).
if !self.pending_wal_truncation
&& end_pos == self.write_lsn
&& end_pos == self.flush_record_lsn
{
return Ok(());
}

// Atomicity: we start with LSNs reset because once on disk deletion is
// started it can't be reversed. However, we might crash/error in the
// middle, leaving garbage above the truncation point. In theory,
// concatenated with previous records it might form bogus WAL (though
// very unlikely in practice because CRC would guard from that). To
// protect, set pending_wal_truncation flag before beginning: it means
// truncation must be retried and WAL writes are prohibited until it
// succeeds. Flag is also set on boot because we don't know if the last
// state was clean.
//
// Protocol (HandleElected before first AppendRequest) ensures we'll
// always try to ensure clean truncation before any writes.
self.pending_wal_truncation = true;

self.write_lsn = end_pos;
self.flush_lsn = end_pos;
self.write_record_lsn = end_pos;
self.flush_record_lsn = end_pos;

// Close previously opened file, if any
if let Some(unflushed_file) = self.file.take() {
self.fdatasync_file(&unflushed_file).await?;
@@ -513,11 +568,7 @@ impl Storage for PhysicalStorage {
fs::rename(wal_file_path, wal_file_partial_path).await?;
}

// Update LSNs
self.write_lsn = end_pos;
self.write_record_lsn = end_pos;
self.flush_record_lsn = end_pos;
self.is_truncated_after_restart = true;
self.pending_wal_truncation = false;
Ok(())
}

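The ordering in this hunk is the crux: the pending_wal_truncation flag goes up before any on-disk mutation and comes down only after truncation fully succeeds, so a crash in between leaves WAL writes prohibited until a retry. A compact Python model of the sequence (the file-removal helper is a hypothetical stand-in):

def truncate_wal(state: dict, end_pos: int) -> None:
    # Quick exit only when we know the state is clean.
    if (not state["pending_wal_truncation"]
            and end_pos == state["write_lsn"]
            and end_pos == state["flush_record_lsn"]):
        return

    state["pending_wal_truncation"] = True  # raised before touching disk

    # Reset cursors first: once on-disk deletion starts it can't be undone.
    for key in ("write_lsn", "flush_lsn", "write_record_lsn", "flush_record_lsn"):
        state[key] = end_pos

    def remove_wal_above(lsn: int) -> None:  # stand-in for the file operations
        pass

    remove_wal_above(end_pos)
    state["pending_wal_truncation"] = False  # writes allowed again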
@@ -3642,6 +3642,7 @@ impl Service {
match res {
Ok(ok) => Ok(ok),
Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT),
Err(mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg)) => Err(ApiError::ResourceUnavailable(msg.into())),
Err(e) => {
Err(
ApiError::InternalServerError(anyhow::anyhow!(
@@ -6355,6 +6356,19 @@ impl Service {

// Pick the biggest tenant to split first
top_n.sort_by_key(|i| i.resident_size);

// Filter out tenants whose scheduling mode prohibits splits
{
let locked = self.inner.read().unwrap();
top_n.retain(|i| {
if let Some(shard) = locked.tenants.get(&i.id) {
matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active)
} else {
false
}
});
}

let Some(split_candidate) = top_n.into_iter().next() else {
tracing::debug!("No split-eligible shards found");
return;

@@ -2379,6 +2379,17 @@ class NeonPageserver(PgProtocol, LogUtils):
#
# The entries in the list are regular expressions.
self.allowed_errors: list[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)
# Store persistent failpoints that should be reapplied on each start
self._persistent_failpoints: dict[str, str] = {}

def add_persistent_failpoint(self, name: str, action: str):
"""
Add a failpoint that will be automatically reapplied each time the pageserver starts.
The failpoint will be set immediately if the pageserver is running.
"""
self._persistent_failpoints[name] = action
if self.running:
self.http_client().configure_failpoints([(name, action)])

def timeline_dir(
self,
@@ -2446,6 +2457,15 @@ class NeonPageserver(PgProtocol, LogUtils):
"""
assert self.running is False

if self._persistent_failpoints:
# Tests shouldn't use this mechanism _and_ set FAILPOINTS explicitly
assert extra_env_vars is None or "FAILPOINTS" not in extra_env_vars
if extra_env_vars is None:
extra_env_vars = {}
extra_env_vars["FAILPOINTS"] = ",".join(
f"{k}={v}" for (k, v) in self._persistent_failpoints.items()
)

storage = self.env.pageserver_remote_storage
if isinstance(storage, S3Storage):
s3_env_vars = storage.access_env_vars()
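A hypothetical usage sketch of the new fixture API (the test body below is an illustration, not part of this change):

def test_slow_delete_survives_restart(neon_env_builder):
    env = neon_env_builder.init_start()
    ps = env.pageserver
    # Reapplied on every start() because start() exports the failpoint
    # via the FAILPOINTS environment variable, as shown above.
    ps.add_persistent_failpoint("in_progress_delete", "sleep(1)")
    ps.stop()
    ps.start()  # failpoint is automatically re-established here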
@@ -4522,7 +4542,7 @@ def pytest_addoption(parser: Parser):


SMALL_DB_FILE_NAME_REGEX: re.Pattern[str] = re.compile(
r"config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)"
r"config-v1|heatmap-v1|tenant-manifest|metadata|.+\.(?:toml|pid|json|sql|conf)"
)


@@ -1,6 +1,8 @@
from __future__ import annotations

import time
from collections.abc import Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, cast

import psycopg2
@@ -18,7 +20,7 @@ if TYPE_CHECKING:
from fixtures.benchmark_fixture import NeonBenchmarker
from fixtures.neon_api import NeonApiEndpoint
from fixtures.neon_fixtures import NeonEnv, PgBin, VanillaPostgres
from psycopg2.extensions import cursor
from psycopg2.extensions import connection, cursor


@pytest.mark.timeout(1000)
@@ -292,6 +294,48 @@ def test_snap_files(
then runs pgbench inserts while generating large numbers of snapfiles. Then restarts
the node and tries to peek the replication changes.
"""

@contextmanager
def replication_slot(conn: connection, slot_name: str) -> Iterator[None]:
"""
Make sure that the replication slot doesn't outlive the test. Normally
we wouldn't want this behavior, but since the test creates and drops
the replication slot, we do.

We've had problems in the past where this slot sticking around caused
issues with the publisher retaining WAL during the execution of the
other benchmarks in this suite.
"""

def __drop_replication_slot(c: cursor) -> None:
c.execute(
"""
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_replication_slots
WHERE slot_name = %(slot_name)s
) THEN
PERFORM pg_drop_replication_slot(%(slot_name)s);
END IF;
END $$;
""",
{"slot_name": slot_name},
)

with conn.cursor() as c:
__drop_replication_slot(c)
c.execute(
"SELECT pg_create_logical_replication_slot(%(slot_name)s, 'test_decoding')",
{"slot_name": slot_name},
)

yield

with conn.cursor() as c:
__drop_replication_slot(c)

test_duration_min = 60
test_interval_min = 5
pgbench_duration = f"-T{test_duration_min * 60 * 2}"
@@ -314,48 +358,35 @@ def test_snap_files(
conn = psycopg2.connect(connstr)
conn.autocommit = True

with conn.cursor() as cur:
cur.execute(
"""
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_replication_slots
WHERE slot_name = 'slotter'
) THEN
PERFORM pg_drop_replication_slot('slotter');
END IF;
END $$;
"""
with replication_slot(conn, "slotter"):
workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env
)
cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")
try:
start = time.time()
prev_measurement = time.time()
while time.time() - start < test_duration_min * 60:
conn = psycopg2.connect(connstr)
conn.autocommit = True

with conn.cursor() as cur:
cur.execute(
"SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
)
check_pgbench_still_running(workload)
cur.execute(
"SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())"
)

conn.close()

# Measure storage
if time.time() - prev_measurement > test_interval_min * 60:
storage = benchmark_project_pub.get_synthetic_storage_size()
zenbenchmark.record("storage", storage, "B", MetricReport.LOWER_IS_BETTER)
prev_measurement = time.time()
time.sleep(test_interval_min * 60 / 3)
finally:
workload.terminate()

conn.close()

workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env)
try:
start = time.time()
prev_measurement = time.time()
while time.time() - start < test_duration_min * 60:
conn = psycopg2.connect(connstr)
conn.autocommit = True

with conn.cursor() as cur:
cur.execute(
"SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
)
check_pgbench_still_running(workload)
cur.execute("SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())")

conn.close()

# Measure storage
if time.time() - prev_measurement > test_interval_min * 60:
storage = benchmark_project_pub.get_synthetic_storage_size()
zenbenchmark.record("storage", storage, "B", MetricReport.LOWER_IS_BETTER)
prev_measurement = time.time()
time.sleep(test_interval_min * 60 / 3)

finally:
workload.terminate()

@@ -122,6 +122,7 @@ def test_readonly_node(neon_simple_env: NeonEnv):
)


@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/9754")
def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
"""
Test static endpoint is protected from GC by acquiring and renewing lsn leases.

@@ -1,22 +1,33 @@
from __future__ import annotations

import json
import random
import threading
import time
from typing import Optional

import pytest
from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId
import requests
from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
last_flush_lsn_upload,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty, list_prefix
from fixtures.pageserver.utils import (
assert_prefix_empty,
assert_prefix_not_empty,
list_prefix,
wait_until_tenant_active,
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import S3Storage, s3_storage
from fixtures.utils import wait_until
from fixtures.utils import run_only_on_default_postgres, wait_until
from mypy_boto3_s3.type_defs import (
ObjectTypeDef,
)
from psycopg2.errors import IoError, UndefinedTable


@pytest.mark.parametrize("shard_count", [0, 4])
@@ -378,8 +389,279 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel
)


@pytest.mark.parametrize("offload_child", ["offload", "offload-corrupt", "archive", None])
def test_timeline_retain_lsn(neon_env_builder: NeonEnvBuilder, offload_child: Optional[str]):
@run_only_on_default_postgres("this test isn't sensitive to the contents of timelines")
def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):
"""
A general consistency check on archival/offload timeline state, and its intersection
with tenant migrations and timeline deletions.
"""

# Offloading is off by default at time of writing: remove this line when it's on by default
neon_env_builder.pageserver_config_override = "timeline_offloading = true"
neon_env_builder.enable_pageserver_remote_storage(s3_storage())

# We will exercise migrations, so need multiple pageservers
neon_env_builder.num_pageservers = 2

env = neon_env_builder.init_start(
initial_tenant_conf={
"compaction_period": "1s",
}
)
tenant_id = env.initial_tenant
tenant_shard_id = TenantShardId(tenant_id, 0, 0)

# Unavailable pageservers during timeline CRUD operations can be logged as errors on the storage controller
env.storage_controller.allowed_errors.extend(
[
".*error sending request.*",
# FIXME: the pageserver should not return 500s on cancellation (https://github.com/neondatabase/neon/issues/9768)
".*InternalServerError(Error deleting timeline .* on .* on .*: pageserver API: error: Cancelled",
]
)

for ps in env.pageservers:
# We will do unclean restarts, which result in these messages when cleaning up files
ps.allowed_errors.extend(
[
".*removing local file.*because it has unexpected length.*",
".*__temp.*",
# FIXME: there are still anyhow::Error paths in timeline creation/deletion which
# generate 500 results when called during shutdown (https://github.com/neondatabase/neon/issues/9768)
".*InternalServerError.*",
# FIXME: there are still anyhow::Error paths in timeline deletion that generate
# log lines at error severity (https://github.com/neondatabase/neon/issues/9768)
".*delete_timeline.*Error",
]
)

class TimelineState:
def __init__(self):
self.timeline_id = TimelineId.generate()
self.created = False
self.archived = False
self.offloaded = False
self.deleted = False

controller_ps_api = env.storage_controller.pageserver_api()

shutdown = threading.Event()

violations = []

timelines_deleted = []

def list_timelines(tenant_id) -> tuple[set[TimelineId], set[TimelineId]]:
"""Get the list of active and offloaded TimelineId"""
listing = controller_ps_api.timeline_and_offloaded_list(tenant_id)
active_ids = set([TimelineId(t["timeline_id"]) for t in listing.timelines])
offloaded_ids = set([TimelineId(t["timeline_id"]) for t in listing.offloaded])

return (active_ids, offloaded_ids)

def timeline_objects(tenant_shard_id, timeline_id):
response = list_prefix(
env.pageserver_remote_storage,  # type: ignore
prefix="/".join(
(
"tenants",
str(tenant_shard_id),
"timelines",
str(timeline_id),
)
)
+ "/",
)

return [k["Key"] for k in response.get("Contents", [])]

def worker():
"""
Background thread which drives timeline lifecycle operations, and checks that between steps
it obeys invariants. This should detect errors in pageserver persistence and errors in
concurrent operations on different timelines when it is run many times in parallel.
"""
state = TimelineState()

# Jitter worker startup, we're not interested in exercising lots of concurrent creations
# as we know that's I/O bound.
shutdown.wait(random.random() * 10)

while not shutdown.is_set():
# A little wait between actions to jitter out the API calls rather than having them
# all queue up at once
shutdown.wait(random.random())

try:
if not state.created:
log.info(f"Creating timeline {state.timeline_id}")
controller_ps_api.timeline_create(
PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=state.timeline_id
)
state.created = True

if (
timeline_objects(
tenant_shard_id=tenant_shard_id, timeline_id=state.timeline_id
)
== []
):
msg = f"Timeline {state.timeline_id} unexpectedly not present in remote storage"
violations.append(msg)

elif state.deleted:
# Try to confirm its deletion completed.
# Deleted timeline should not appear in listing API, either as offloaded or active
(active_ids, offloaded_ids) = list_timelines(tenant_id)
if state.timeline_id in active_ids or state.timeline_id in offloaded_ids:
msg = f"Timeline {state.timeline_id} appeared in listing after deletion was acked"
violations.append(msg)
raise RuntimeError(msg)

objects = timeline_objects(tenant_shard_id, state.timeline_id)
if len(objects) == 0:
log.info(f"Confirmed deletion of timeline {state.timeline_id}")
timelines_deleted.append(state.timeline_id)
state = TimelineState()  # A new timeline ID to create on next iteration
else:
# Deletion of objects doesn't have to be synchronous, we will keep polling
log.info(f"Timeline {state.timeline_id} objects still exist: {objects}")
shutdown.wait(random.random())
else:
# The main lifetime of a timeline: proceed active->archived->offloaded->deleted
if not state.archived:
log.info(f"Archiving timeline {state.timeline_id}")
controller_ps_api.timeline_archival_config(
tenant_id, state.timeline_id, TimelineArchivalState.ARCHIVED
)
state.archived = True
elif state.archived and not state.offloaded:
log.info(f"Waiting for offload of timeline {state.timeline_id}")
# Wait for offload: this should happen fast because we configured a short compaction interval
while not shutdown.is_set():
(active_ids, offloaded_ids) = list_timelines(tenant_id)
if state.timeline_id in active_ids:
log.info(f"Timeline {state.timeline_id} is still active")
shutdown.wait(0.5)
elif state.timeline_id in offloaded_ids:
log.info(f"Timeline {state.timeline_id} is now offloaded")
state.offloaded = True
break
else:
# Timeline is neither offloaded nor active, this is unexpected: the pageserver
# should ensure that the timeline appears in either the offloaded list or main list
msg = f"Timeline {state.timeline_id} disappeared!"
violations.append(msg)
raise RuntimeError(msg)
elif state.offloaded:
# Once it's offloaded it should only be in offloaded or deleted state: check
# it didn't revert back to active. This tests that the manifest is doing its
# job to suppress loading of offloaded timelines as active.
(active_ids, offloaded_ids) = list_timelines(tenant_id)
if state.timeline_id in active_ids:
msg = f"Timeline {state.timeline_id} is active, should be offloaded or deleted"
violations.append(msg)
raise RuntimeError(msg)

log.info(f"Deleting timeline {state.timeline_id}")
controller_ps_api.timeline_delete(tenant_id, state.timeline_id)
state.deleted = True
else:
raise RuntimeError("State should be unreachable")
except PageserverApiException as e:
# This is expected: we are injecting chaos, API calls will sometimes fail.
# TODO: can we narrow this to assert we are getting friendly 503s?
log.info(f"Iteration error, will retry: {e}")
shutdown.wait(random.random())
except requests.exceptions.RetryError as e:
# Retryable error repeated more times than `requests` is configured to tolerate, this
# is expected when a pageserver remains unavailable for a couple seconds
log.info(f"Iteration error, will retry: {e}")
shutdown.wait(random.random())
except Exception as e:
log.warning(
f"Unexpected worker exception (current timeline {state.timeline_id}): {e}"
)
else:
# In the non-error case, use a jittered but small wait, we want to keep
# a high rate of operations going
shutdown.wait(random.random() * 0.1)

n_workers = 4
threads = []
for _i in range(0, n_workers):
t = threading.Thread(target=worker)
t.start()
threads.append(t)

# Set delay failpoints so that deletions and migrations take some time, and have a good
# chance to interact with other concurrent timeline mutations.
env.storage_controller.configure_failpoints(
[("reconciler-live-migrate-pre-await-lsn", "sleep(1)")]
)
for ps in env.pageservers:
ps.add_persistent_failpoint("in_progress_delete", "sleep(1)")

# Generate some chaos, while our workers are trying to complete their timeline operations
rng = random.Random()
try:
chaos_rounds = 48
for _i in range(0, chaos_rounds):
action = rng.choice([0, 1])
if action == 0:
# Pick a random pageserver to restart
pageserver = rng.choice(env.pageservers)

# Whether to use a graceful shutdown or SIGKILL
immediate = random.choice([True, False])
log.info(f"Restarting pageserver {pageserver.id}, immediate={immediate}")

t1 = time.time()
pageserver.restart(immediate=immediate)
restart_duration = time.time() - t1

# Make sure we're up for as long as we spent restarting, to ensure operations can make progress
log.info(f"Staying alive for {restart_duration}s")
time.sleep(restart_duration)
else:
# Migrate our tenant between pageservers
origin_ps = env.get_tenant_pageserver(tenant_shard_id)
dest_ps = rng.choice([ps for ps in env.pageservers if ps.id != origin_ps.id])
log.info(f"Migrating {tenant_shard_id} {origin_ps.id}->{dest_ps.id}")
env.storage_controller.tenant_shard_migrate(
tenant_shard_id=tenant_shard_id, dest_ps_id=dest_ps.id
)

log.info(f"Full timeline lifecycles so far: {len(timelines_deleted)}")
finally:
shutdown.set()

for thread in threads:
thread.join()

# Sanity check that during our run we did exercise some full timeline lifecycles, in case
# one of our workers got stuck
assert len(timelines_deleted) > 10

# Check that no invariant violations were reported by workers
assert violations == []


@pytest.mark.parametrize("with_intermediary", [False, True])
@pytest.mark.parametrize(
"offload_child",
[
"offload",
"offload-corrupt",
"offload-no-restart",
"offload-parent",
"archive",
None,
],
)
def test_timeline_retain_lsn(
neon_env_builder: NeonEnvBuilder, with_intermediary: bool, offload_child: Optional[str]
):
"""
Ensure that retain_lsn functionality for timelines works, both for offloaded and non-offloaded ones
"""
@@ -387,6 +669,7 @@ def test_timeline_retain_lsn(neon_env_builder: NeonEnvBuilder, offload_child: Op
# Our corruption code only works with S3 compatible storage
neon_env_builder.enable_pageserver_remote_storage(s3_storage())

neon_env_builder.rust_log_override = "info,[gc_timeline]=debug"
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()

@@ -394,22 +677,30 @@ def test_timeline_retain_lsn(neon_env_builder: NeonEnvBuilder, offload_child: Op
tenant_id, root_timeline_id = env.create_tenant(
conf={
# small checkpointing and compaction targets to ensure we generate many upload operations
"checkpoint_distance": 128 * 1024,
"checkpoint_distance": 32 * 1024,
"compaction_threshold": 1,
"compaction_target_size": 128 * 1024,
"compaction_target_size": 32 * 1024,
# set small image creation thresholds so that gc deletes data
"image_creation_threshold": 2,
"image_creation_threshold": 1,
# disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# Disable pitr, we only want the latest lsn
"pitr_interval": "0s",
"gc_horizon": 0,
# Don't rely on endpoint lsn leases
"lsn_lease_length": "0s",
}
)

with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
if with_intermediary:
parent_branch_name = "test_archived_parent"
parent_timeline_id = env.create_branch("test_archived_parent", tenant_id)
else:
parent_branch_name = "main"
parent_timeline_id = root_timeline_id

with env.endpoints.create_start(parent_branch_name, tenant_id=tenant_id) as endpoint:
endpoint.safe_psql_many(
[
"CREATE TABLE foo(v int, key serial primary key, t text default 'data_content')",
@@ -419,14 +710,16 @@ def test_timeline_retain_lsn(neon_env_builder: NeonEnvBuilder, offload_child: Op
)
pre_branch_sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200")
log.info(f"Pre branch sum: {pre_branch_sum}")
last_flush_lsn_upload(env, endpoint, tenant_id, root_timeline_id)
last_flush_lsn_upload(env, endpoint, tenant_id, parent_timeline_id)

# Create a branch and write some additional data to the parent
child_timeline_id = env.create_branch("test_archived_branch", tenant_id)
child_timeline_id = env.create_branch(
"test_archived_branch", tenant_id, ancestor_branch_name=parent_branch_name
)

with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
# Do some churn of the data. This is important so that we can overwrite image layers.
for i in range(10):
with env.endpoints.create_start(parent_branch_name, tenant_id=tenant_id) as endpoint:
# Do some overwriting churn with compactions in between. This is important so that we can overwrite image layers.
for i in range(5):
endpoint.safe_psql_many(
[
f"SELECT setseed(0.23{i})",
@@ -435,9 +728,9 @@ def test_timeline_retain_lsn(neon_env_builder: NeonEnvBuilder, offload_child: Op
"UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 0",
]
)
last_flush_lsn_upload(env, endpoint, tenant_id, parent_timeline_id)
post_branch_sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200")
log.info(f"Post branch sum: {post_branch_sum}")
last_flush_lsn_upload(env, endpoint, tenant_id, root_timeline_id)

if offload_child is not None:
ps_http.timeline_archival_config(
@@ -452,9 +745,19 @@ def test_timeline_retain_lsn(neon_env_builder: NeonEnvBuilder, offload_child: Op
assert leaf_detail["is_archived"] is True
if "offload" in offload_child:
ps_http.timeline_offload(tenant_id, child_timeline_id)
if "offload-parent" in offload_child:
# Also offload the parent to ensure the retain_lsn of the child
# is entered in the parent at unoffloading
ps_http.timeline_archival_config(
tenant_id,
parent_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
ps_http.timeline_offload(tenant_id, parent_timeline_id)

# Do a restart to get rid of any in-memory objects (we only init gc info once, at attach)
env.pageserver.stop()
if offload_child is None or "no-restart" not in offload_child:
env.pageserver.stop()
if offload_child == "offload-corrupt":
assert isinstance(env.pageserver_remote_storage, S3Storage)
listing = list_prefix(
@@ -489,13 +792,21 @@ def test_timeline_retain_lsn(neon_env_builder: NeonEnvBuilder, offload_child: Op
".*page_service_conn_main.*could not find data for key.*",
]
)
env.pageserver.start()
if offload_child is None or "no-restart" not in offload_child:
env.pageserver.start()
if offload_child == "offload-parent":
wait_until_tenant_active(ps_http, tenant_id=tenant_id)
ps_http.timeline_archival_config(
tenant_id,
parent_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)

# Do an aggressive gc and compaction of the parent branch
ps_http.timeline_gc(tenant_id=tenant_id, timeline_id=root_timeline_id, gc_horizon=0)
ps_http.timeline_gc(tenant_id=tenant_id, timeline_id=parent_timeline_id, gc_horizon=0)
ps_http.timeline_checkpoint(
tenant_id,
root_timeline_id,
parent_timeline_id,
force_l0_compaction=True,
force_repartition=True,
wait_until_uploaded=True,
@@ -511,10 +822,15 @@ def test_timeline_retain_lsn(neon_env_builder: NeonEnvBuilder, offload_child: Op

# Now, after unarchival, the child timeline should still have its data accessible (or corrupted)
if offload_child == "offload-corrupt":
with pytest.raises(RuntimeError, match=".*failed to get basebackup.*"):
env.endpoints.create_start(
if with_intermediary:
error_regex = "(.*could not read .* from page server.*|.*relation .* does not exist)"
else:
error_regex = ".*failed to get basebackup.*"
with pytest.raises((RuntimeError, IoError, UndefinedTable), match=error_regex):
with env.endpoints.create_start(
"test_archived_branch", tenant_id=tenant_id, basebackup_request_tries=1
)
) as endpoint:
endpoint.safe_psql("SELECT sum(key) from foo where v < 51200")
else:
with env.endpoints.create_start("test_archived_branch", tenant_id=tenant_id) as endpoint:
sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200")

33 test_runner/regress/test_vm_truncate.py Normal file
@@ -0,0 +1,33 @@
from fixtures.neon_fixtures import NeonEnv


#
# Test that VM is properly truncated
#
def test_vm_truncate(neon_simple_env: NeonEnv):
    env = neon_simple_env

    endpoint = env.endpoints.create_start("main")
    con = endpoint.connect()
    cur = con.cursor()
    cur.execute("CREATE EXTENSION neon_test_utils")
    cur.execute("CREATE EXTENSION pageinspect")

    cur.execute(
        "create table t(pk integer primary key, counter integer default 0, filler text default repeat('?', 200))"
    )
    cur.execute("insert into t (pk) values (generate_series(1,1000))")
    cur.execute("delete from t where pk>10")
    cur.execute("vacuum t")  # truncates the relation, including its VM and FSM
    # get image of the first block of the VM excluding the page header. It's expected
    # to still be in the buffer cache.
    # ignore the page header (24 bytes, i.e. 48 characters in the hex representation)
    cur.execute("select substr(encode(get_raw_page('t', 'vm', 0), 'hex'), 48)")
    pg_bitmap = cur.fetchall()[0][0]
    # flush shared buffers
    cur.execute("SELECT clear_buffer_cache()")
    # now download the first block of the VM from the pageserver ...
    cur.execute("select substr(encode(get_raw_page('t', 'vm', 0), 'hex'), 48)")
    ps_bitmap = cur.fetchall()[0][0]
    # and check that the contents of the bitmaps are equal, i.e. the PS produces the same VM page as Postgres
    assert pg_bitmap == ps_bitmap
@@ -58,7 +58,7 @@ num-integer = { version = "0.1", features = ["i128"] }
num-traits = { version = "0.2", features = ["i128", "libm"] }
once_cell = { version = "1" }
parquet = { version = "53", default-features = false, features = ["zstd"] }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2", default-features = false, features = ["with-serde_json-1"] }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon", default-features = false, features = ["with-serde_json-1"] }
prost = { version = "0.13", features = ["prost-derive"] }
rand = { version = "0.8", features = ["small_rng"] }
regex = { version = "1" }
@@ -75,10 +75,10 @@ smallvec = { version = "1", default-features = false, features = ["const_new", "
spki = { version = "0.7", default-features = false, features = ["pem", "std"] }
subtle = { version = "2" }
sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] }
tikv-jemalloc-sys = { version = "0.5" }
tikv-jemalloc-sys = { version = "0.6", features = ["stats"] }
time = { version = "0.3", features = ["macros", "serde-well-known"] }
tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2", features = ["with-serde_json-1"] }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon", features = ["with-serde_json-1"] }
tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] }
tokio-stream = { version = "0.1", features = ["net"] }
tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] }