diff --git a/walkeeper/README b/walkeeper/README index 593417299a..4a63d13b33 100644 --- a/walkeeper/README +++ b/walkeeper/README @@ -33,4 +33,5 @@ safekeepers. The Paxos and crash recovery algorithm ensures that only one primary node can be actively streaming WAL to the quorum of safekeepers. -See README.md for a more detailed desription of the consensus protocol. +See README_PROTO.md for a more detailed desription of the consensus protocol. spec/ +contains TLA+ specification of it. diff --git a/walkeeper/README.md b/walkeeper/README_PROTO.md similarity index 100% rename from walkeeper/README.md rename to walkeeper/README_PROTO.md diff --git a/walkeeper/spec/ProposerAcceptorConsensus.cfg b/walkeeper/spec/ProposerAcceptorConsensus.cfg new file mode 100644 index 0000000000..5e69ba9fcd --- /dev/null +++ b/walkeeper/spec/ProposerAcceptorConsensus.cfg @@ -0,0 +1,34 @@ +\* MV CONSTANT declarations +CONSTANT NULL = NULL +CONSTANTS +p1 = p1 +p2 = p2 +p3 = p3 +a1 = a1 +a2 = a2 +a3 = a3 +\* MV CONSTANT definitions +CONSTANT +proposers = {p1, p2, p3} +acceptors = {a1, a2, a3} +\* SYMMETRY definition +SYMMETRY perms +\* CONSTANT definitions +CONSTANT +max_entries = 3 +CONSTANT +max_term = 3 +\* INIT definition +INIT +Init +\* NEXT definition +NEXT +Next +\* INVARIANT definition +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotOverwritten +CHECK_DEADLOCK FALSE \ No newline at end of file diff --git a/walkeeper/spec/ProposerAcceptorConsensus.tla b/walkeeper/spec/ProposerAcceptorConsensus.tla new file mode 100644 index 0000000000..7194a27094 --- /dev/null +++ b/walkeeper/spec/ProposerAcceptorConsensus.tla @@ -0,0 +1,356 @@ +---- MODULE ProposerAcceptorConsensus ---- + +\* Differences from current implementation: +\* - unified not-globally-unique epoch & term (node_id) +\* Simplifications: +\* - instant message delivery +\* - feedback is not modeled separately, commit_lsn is updated directly + +EXTENDS Integers, Sequences, FiniteSets, TLC + +VARIABLES + prop_state, \* prop_state[p] is state of proposer p + acc_state, \* acc_state[a] is state of acceptor a + commit_lsns \* map of acceptor -> commit_lsn + +CONSTANT + acceptors, + proposers, + max_entries, \* model constraint: max log entries acceptor/proposer can hold + max_term \* model constraint: max allowed term + +CONSTANT NULL + +ASSUME max_entries \in Nat /\ max_term \in Nat + +\* For specifying symmetry set in manual cfg file, see +\* https://github.com/tlaplus/tlaplus/issues/404 +perms == Permutations(proposers) \union Permutations(acceptors) + +\******************************************************************************** +\* Helpers +\******************************************************************************** + +Maximum(S) == + (*************************************************************************) + (* If S is a set of numbers, then this define Maximum(S) to be the *) + (* maximum of those numbers, or -1 if S is empty. *) + (*************************************************************************) + IF S = {} THEN -1 + ELSE CHOOSE n \in S : \A m \in S : n \geq m + +\* minimum of numbers in the set, error if set is empty +Minimum(S) == + CHOOSE min \in S : \A n \in S : min <= n + +\* Min of two numbers +Min(a, b) == IF a < b THEN a ELSE b + +\* Set of values of function f. XXX is there a such builtin? +FValues(f) == {f[a] : a \in DOMAIN f} + +\* Sort of 0 for functions +EmptyF == [x \in {} |-> 42] +IsEmptyF(f) == DOMAIN f = {} + +\* Next entry proposer p will push to acceptor a or NULL. +NextEntry(p, a) == + IF Len(prop_state[p].wal) >= prop_state[p].next_send_lsn[a] THEN + CHOOSE r \in FValues(prop_state[p].wal) : r.lsn = prop_state[p].next_send_lsn[a] + ELSE + NULL + + +\***************** + +NumAccs == Cardinality(acceptors) + +\* does acc_set form the quorum? +Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1) +\* all quorums of acceptors +Quorums == {subset \in SUBSET acceptors: Quorum(subset)} + +\* flush_lsn of acceptor a. +FlushLsn(a) == Len(acc_state[a].wal) + + +\******************************************************************************** +\* Type assertion +\******************************************************************************** +\* Defining sets of all possible tuples and using them in TypeOk in usual +\* all-tuples constructor is not practical because such definitions force +\* TLC to enumerate them, while they are are horribly enormous +\* (TLC screams "Attempted to construct a set with too many elements"). +\* So instead check types manually. +TypeOk == + /\ \A p \in proposers: + /\ DOMAIN prop_state[p] = {"state", "term", "votes", "donor_epoch", "vcl", "wal", "next_send_lsn"} + \* in campaign proposer sends RequestVote and waits for acks; + \* in leader he is elected + /\ prop_state[p].state \in {"campaign", "leader"} + \* 0..max_term should be actually Nat in the unbouned model, but TLC won't + \* swallow it + /\ prop_state[p].term \in 0..max_term + \* votes received + /\ \A voter \in DOMAIN prop_state[p].votes: + /\ voter \in acceptors + /\ prop_state[p].votes[voter] \in [epoch: 0..max_term, flush_lsn: 0..max_entries] + /\ prop_state[p].donor_epoch \in 0..max_term + \* wal is sequence of just records + /\ \A i \in DOMAIN prop_state[p].wal: + prop_state[p].wal[i] \in [lsn: 1..max_entries, epoch: 1..max_term] + \* Following implementation, we skew the original Aurora meaning of this; + \* here it is lsn of highest definitely committed record as set by proposer + \* when it is elected; it doesn't change since then + /\ prop_state[p].vcl \in 0..max_entries + \* map of acceptor -> next lsn to send + /\ \A a \in DOMAIN prop_state[p].next_send_lsn: + /\ a \in acceptors + /\ prop_state[p].next_send_lsn[a] \in 1..(max_entries + 1) + /\ \A a \in acceptors: + /\ DOMAIN acc_state[a] = {"term", "epoch", "wal"} + /\ acc_state[a].term \in 0..max_term + /\ acc_state[a].epoch \in 0..max_term + /\ \A i \in DOMAIN acc_state[a].wal: + acc_state[a].wal[i] \in [lsn: 1..max_entries, epoch: 1..max_term] + /\ \A a \in DOMAIN commit_lsns: + /\ a \in acceptors + /\ commit_lsns[a] \in 0..max_entries + +\******************************************************************************** +\* Initial +\******************************************************************************** + +Init == + /\ prop_state = [p \in proposers |-> [ + state |-> "campaign", + term |-> 1, + votes |-> EmptyF, + donor_epoch |-> 0, + vcl |-> 0, + wal |-> << >>, + next_send_lsn |-> EmptyF + ]] + /\ acc_state = [a \in acceptors |-> [ + \* there will be no leader in this term, 1 is the first real + term |-> 0, + epoch |-> 0, + wal |-> << >> + ]] + /\ commit_lsns = [a \in acceptors |-> 0] + + +\******************************************************************************** +\* Actions +\******************************************************************************** + +\* Proposer loses all state. +\* For simplicity (and to reduct state space), we assume it immediately gets +\* current state from quorum q of acceptors determining the term he will request +\* to vote for. +RestartProposer(p, q) == + /\ Quorum(q) + /\ LET + new_term == Maximum({acc_state[a].term : a \in q}) + 1 + IN + /\ new_term <= max_term + /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign", + ![p].term = new_term, + ![p].votes = EmptyF, + ![p].donor_epoch = 0, + ![p].vcl = 0, + ![p].wal = << >>, + ![p].next_send_lsn = EmptyF] + /\ UNCHANGED <> + +\* Acceptor a immediately votes for proposer p. +Vote(p, a) == + /\ prop_state[p].state = "campaign" + /\ acc_state[a].term < prop_state[p].term \* main voting condition + /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] + /\ LET + vote == [epoch |-> acc_state[a].epoch, flush_lsn |-> FlushLsn(a)] + IN + prop_state' = [prop_state EXCEPT ![p].votes = prop_state[p].votes @@ (a :> vote)] + /\ UNCHANGED <> + + +\* Proposer p gets elected. +BecomeLeader(p) == + /\ prop_state[p].state = "campaign" + /\ Quorum(DOMAIN prop_state[p].votes) + /\ LET + max_epoch == Maximum({v.epoch : v \in FValues(prop_state[p].votes)}) + max_epoch_votes == {v \in FValues(prop_state[p].votes) : v.epoch = max_epoch} + donor == CHOOSE dv \in DOMAIN prop_state[p].votes : + /\ prop_state[p].votes[dv].epoch = max_epoch + /\ \A v \in max_epoch_votes: + prop_state[p].votes[dv].flush_lsn >= v.flush_lsn + max_vote == prop_state[p].votes[donor] + \* Establish lsn to stream from for voters. + \* At some point it seemed like we can regard log as correct and only + \* append to it if has in the max_epoch, however TLC showed that's not + \* the case; we must always stream since first not matching record. + next_send_lsn == [voter \in DOMAIN prop_state[p].votes |-> 1] + IN + \* we fetch log from the most advanced node (this is separate + \* roundtrip), make sure node is still on one term with us + /\ acc_state[donor].term = prop_state[p].term + /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", + \* fetch the log from donor + ![p].wal = acc_state[donor].wal, + ![p].donor_epoch = max_epoch, + ![p].vcl = max_vote.flush_lsn, + ![p].next_send_lsn = next_send_lsn] + /\ UNCHANGED <> + + +\* acceptor a learns about elected proposer p's term. +UpdateTerm(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term < prop_state[p].term + /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] + /\ UNCHANGED <> + + +\* Acceptor a which didn't participate in voting connects to elected proposer p +\* and p sets the streaming point +HandshakeWithLeader(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term = prop_state[p].term + /\ a \notin DOMAIN prop_state[p].next_send_lsn + /\ LET + next_send_lsn == prop_state[p].next_send_lsn @@ (a :> 1) + IN + prop_state' = [prop_state EXCEPT ![p].next_send_lsn = next_send_lsn] + /\ UNCHANGED <> + + +\* Append new log entry to elected proposer +NewEntry(p) == + /\ prop_state[p].state = "leader" + /\ Len(prop_state[p].wal) < max_entries \* model constraint + /\ LET + new_lsn == IF Len(prop_state[p].wal) = 0 THEN + prop_state[p].vcl + 1 + ELSE + \* lsn of last record + 1 + prop_state[p].wal[Len(prop_state[p].wal)].lsn + 1 + new_entry == [lsn |-> new_lsn, epoch |-> prop_state[p].term] + IN + /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)] + /\ UNCHANGED <> + + +\* Try to transfer entry from elected proposer p to acceptor a +TransferEntry(p, a) == + /\ prop_state[p].state = "leader" + /\ prop_state[p].term = acc_state[a].term + /\ a \in DOMAIN prop_state[p].next_send_lsn + /\ LET + next_e == NextEntry(p, a) + IN + /\ next_e /= NULL + /\ LET + \* Consider bumping epoch if getting this entry recovers the acceptor. + \* Both vcl and flush_lsn must be reached in current impl as we don't + \* truncate the log. + new_epoch == + IF /\ acc_state[a].epoch < prop_state[p].term + /\ next_e.lsn >= prop_state[p].vcl + /\ next_e.lsn >= Len(acc_state[a].wal) + THEN + prop_state[p].term + ELSE + acc_state[a].epoch + \* TLC spits warning about non-existent entry if EXCEPT is used here, + \* so construct manually. + new_wal == (next_e.lsn :> next_e) @@ acc_state[a].wal + \* Also check whether this entry allows to advance commit_lsn and + \* if so, bump it where possible. Modeling this as separate action + \* significantly bloats the space (5m vs 15m on max_entries=3 max_term=3, + \* so act immediately. + entry_owners == {o \in acceptors: + /\ o /= a + \* only recovered acceptors advance commit_lsn + /\ acc_state[o].epoch = prop_state[p].term + /\ next_e \in FValues(acc_state[o].wal)} \cup {a} + IN + /\ acc_state' = [acc_state EXCEPT ![a].wal = new_wal, + ![a].epoch = new_epoch] + /\ prop_state' = [prop_state EXCEPT ![p].next_send_lsn[a] = + prop_state[p].next_send_lsn[a] + 1] + /\ commit_lsns' = IF /\ new_epoch = prop_state[p].term + /\ Quorum(entry_owners) + THEN + [acc \in acceptors |-> + IF /\ acc \in entry_owners + /\ next_e.lsn > commit_lsns[acc] + THEN + next_e.lsn + ELSE + commit_lsns[acc]] + ELSE + commit_lsns + + +\******************************************************************************* +\* Final spec +\******************************************************************************* + +Next == + \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q) + \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) + \/ \E p \in proposers: BecomeLeader(p) + \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) + \/ \E p \in proposers: \E a \in acceptors: HandshakeWithLeader(p, a) + \/ \E p \in proposers: NewEntry(p) + \/ \E p \in proposers: \E a \in acceptors: TransferEntry(p, a) + +Spec == Init /\ [][Next]_<> + + +\******************************************************************************** +\* Invariants +\******************************************************************************** + +\* we don't track history, but this property is fairly convincing anyway +ElectionSafety == + \A p1, p2 \in proposers: + (/\ prop_state[p1].state = "leader" + /\ prop_state[p2].state = "leader" + /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2) + +LogIsMonotonic == + \A a \in acceptors: + \A i \in DOMAIN acc_state[a].wal: \A j \in DOMAIN acc_state[a].wal: + (i > j) => (/\ acc_state[a].wal[i].lsn > acc_state[a].wal[j].lsn + \* currently we don't truncate log during recovery, so + \* uncommitted part can be non-monotonic; remove the + \* precondition once we switch to truncation + /\ (i <= commit_lsns[a] /\ j <= commit_lsns[a]) => + (acc_state[a].wal[i].epoch >= acc_state[a].wal[j].epoch)) + +\* Main invariant: log under commit_lsn must match everywhere. +LogSafety == + \A a1 \in acceptors: \A a2 \in acceptors: + LET + common_len == Min(commit_lsns[a1], commit_lsns[a2]) + IN + SubSeq(acc_state[a1].wal, 1, common_len) = SubSeq(acc_state[a2].wal, 1, common_len) + +\* Next record we are going to push to acceptor must never overwrite committed +\* different record. +CommittedNotOverwritten == + \A p \in proposers: \A a \in acceptors: + (/\ prop_state[p].state = "leader" + /\ prop_state[p].term = acc_state[a].term + /\ a \in DOMAIN prop_state[p].next_send_lsn) => + LET + next_e == NextEntry(p, a) + IN + (next_e /= NULL) => + ((commit_lsns[a] >= next_e.lsn) => (acc_state[a].wal[next_e.lsn] = next_e)) + + +==== \ No newline at end of file