Mirror of https://github.com/neondatabase/neon.git, synced 2026-02-10 14:10:37 +00:00

Compare commits: sk-proto-v...bodobolero (29 commits)
Commits:

d0d29468d8
567a665dc4
b0007302d0
b35dd198c3
b66fbd6176
95588dab98
1686d9e733
abcd00181c
01f0be03b5
81cd30e4d6
7fc6953da4
77f9e74d86
0ceeec9be3
733a57247b
6699a30a49
133b89a83d
fba22a7123
14e05276a3
ebc55e6ae8
f07119cca7
47975d06d9
472007dd7c
f9009d6b80
cab60b6d9f
06090bbccd
dcf335a251
b6e9daea9a
d5c3a4e2b9
8107140f7f
@@ -24,3 +24,4 @@
 !storage_controller/
 !vendor/postgres-*/
 !workspace_hack/
+!build_tools/patches
.github/workflows/build_and_test.yml (vendored, 2 lines changed)

@@ -682,7 +682,7 @@ jobs:
       push: true
       pull: true
       file: compute/compute-node.Dockerfile
-      target: neon-pg-ext-test
+      target: extension-tests
       cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
       tags: |
        neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
Cargo.lock (generated, 55 lines changed)

@@ -206,6 +206,16 @@ dependencies = [
  "syn 2.0.90",
 ]

+[[package]]
+name = "assert-json-diff"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "async-channel"
 version = "1.9.0"

@@ -1010,6 +1020,12 @@ dependencies = [
  "generic-array",
 ]

+[[package]]
+name = "boxcar"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42"
+
 [[package]]
 name = "bstr"
 version = "1.5.0"

@@ -2433,6 +2449,16 @@ dependencies = [
  "wasm-bindgen",
 ]

+[[package]]
+name = "gettid"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "397256552fed4a9e577850498071831ec8f18ea83368aecc114cab469dcb43e5"
+dependencies = [
+ "libc",
+ "winapi",
+]
+
 [[package]]
 name = "gimli"
 version = "0.31.1"

@@ -4212,6 +4238,16 @@ dependencies = [
  "workspace_hack",
 ]

+[[package]]
+name = "papaya"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c"
+dependencies = [
+ "equivalent",
+ "seize",
+]
+
 [[package]]
 name = "parking"
 version = "2.1.1"

@@ -4839,6 +4875,7 @@ dependencies = [
 "ahash",
 "anyhow",
 "arc-swap",
+ "assert-json-diff",
 "async-compression",
 "async-trait",
 "atomic-take",

@@ -4846,6 +4883,7 @@ dependencies = [
 "aws-sdk-iam",
 "aws-sigv4",
 "base64 0.13.1",
+ "boxcar",
 "bstr",
 "bytes",
 "camino",

@@ -4862,6 +4900,7 @@ dependencies = [
 "flate2",
 "framed-websockets",
 "futures",
+ "gettid",
 "hashbrown 0.14.5",
 "hashlink",
 "hex",

@@ -4884,7 +4923,9 @@ dependencies = [
 "measured",
 "metrics",
 "once_cell",
 "opentelemetry",
 "p256 0.13.2",
 "papaya",
 "parking_lot 0.12.1",
 "parquet",
 "parquet_derive",

@@ -4931,6 +4972,9 @@ dependencies = [
 "tokio-tungstenite 0.21.0",
 "tokio-util",
 "tracing",
 "tracing-log",
 "tracing-opentelemetry",
 "tracing-serde",
 "tracing-subscriber",
 "tracing-utils",
 "try-lock",

@@ -5884,6 +5928,16 @@ dependencies = [
  "libc",
 ]

+[[package]]
+name = "seize"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "semver"
 version = "1.0.17"

@@ -8145,6 +8199,7 @@ dependencies = [
 "tower 0.4.13",
 "tracing",
 "tracing-core",
 "tracing-log",
 "url",
 "zerocopy",
 "zeroize",
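
Several of the crates added above are small concurrency primitives pulled in for the proxy: papaya (backed by the seize reclamation crate) is a lock-free hash map, and boxcar is a lock-free append-only vector. A rough sketch of how these crates are typically used, independent of how the proxy actually wires them in:

```rust
// Sketch only: generic papaya / boxcar usage, not code from the proxy.
fn main() {
    // papaya: lock-free concurrent hash map; access goes through a pin() guard.
    let cache = papaya::HashMap::new();
    let map = cache.pin();
    map.insert("endpoint", 42);
    assert_eq!(map.get("endpoint"), Some(&42));

    // boxcar: lock-free append-only vector; push returns the new element's index.
    let events = boxcar::Vec::new();
    let idx = events.push("connected");
    assert_eq!(events[idx], "connected");
}
```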
@@ -54,6 +54,7 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
 backtrace = "0.3.74"
 flate2 = "1.0.26"
+assert-json-diff = "2"
 async-stream = "0.3"
 async-trait = "0.1"
 aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] }

@@ -193,7 +194,9 @@ tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
 tower-service = "0.3.3"
 tracing = "0.1"
 tracing-error = "0.2"
 tracing-log = "0.2"
 tracing-opentelemetry = "0.28"
 tracing-serde = "0.2.0"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 try-lock = "0.2.5"
 twox-hash = { version = "1.6.3", default-features = false }
@@ -3,10 +3,17 @@ ARG DEBIAN_VERSION=bookworm
 FROM debian:bookworm-slim AS pgcopydb_builder
 ARG DEBIAN_VERSION

+# Use strict mode for bash to catch errors early
+SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
+
 # By default, /bin/sh used in debian images will treat '\n' as eol,
 # but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that.
 RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
-    echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \
+    echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
     echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc

+COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
+
 RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
     set -e && \
     apt update && \

@@ -39,6 +46,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
     mkdir /tmp/pgcopydb && \
     tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \
     cd /tmp/pgcopydb && \
+    patch -p1 < /pgcopydbv017.patch && \
     make -s clean && \
     make -s -j12 install && \
     libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \

@@ -55,7 +63,8 @@ ARG DEBIAN_VERSION

 # Add nonroot user
 RUN useradd -ms /bin/bash nonroot -b /home
-SHELL ["/bin/bash", "-c"]
+# Use strict mode for bash to catch errors early
+SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

 RUN mkdir -p /pgcopydb/bin && \
     mkdir -p /pgcopydb/lib && \

@@ -66,7 +75,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p
 COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5

 RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
-    echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \
+    echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
     echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc

 # System deps

@@ -190,8 +199,14 @@ RUN set -e \
 # It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
 # And patches from us:
 # - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
-RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
-    && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
+RUN set +o pipefail && \
+    for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do \
+        yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')";\
+    done && \
+    set -o pipefail
+# Split into separate step to debug flaky failures here
+RUN wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
     && ls -laht lcov.tar.gz && sha256sum lcov.tar.gz \
     && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \
    && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
    && cd lcov \
build_tools/patches/pgcopydbv017.patch (new file, 57 lines)

@@ -0,0 +1,57 @@
diff --git a/src/bin/pgcopydb/copydb.c b/src/bin/pgcopydb/copydb.c
index d730b03..69a9be9 100644
--- a/src/bin/pgcopydb/copydb.c
+++ b/src/bin/pgcopydb/copydb.c
@@ -44,6 +44,7 @@ GUC dstSettings[] = {
 	{ "synchronous_commit", "'off'" },
 	{ "statement_timeout", "0" },
 	{ "lock_timeout", "0" },
+	{ "idle_in_transaction_session_timeout", "0" },
 	{ NULL, NULL },
 };

diff --git a/src/bin/pgcopydb/pgsql.c b/src/bin/pgcopydb/pgsql.c
index 94f2f46..e051ba8 100644
--- a/src/bin/pgcopydb/pgsql.c
+++ b/src/bin/pgcopydb/pgsql.c
@@ -2319,6 +2319,11 @@ pgsql_execute_log_error(PGSQL *pgsql,

 	LinesBuffer lbuf = { 0 };

+	if (message != NULL){
+		// make sure message is writable by splitLines
+		message = strdup(message);
+	}
+
 	if (!splitLines(&lbuf, message))
 	{
 		/* errors have already been logged */
@@ -2332,6 +2337,7 @@ pgsql_execute_log_error(PGSQL *pgsql,
 				PQbackendPID(pgsql->connection),
 				lbuf.lines[lineNumber]);
 	}
+	free(message); // free copy of message we created above

 	if (pgsql->logSQL)
 	{
@@ -3174,11 +3180,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context)
 		/* errors have already been logged */
 		return;
 	}
-
 	if (res != NULL)
 	{
 		char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
-		strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate));
+		if (sqlstate == NULL)
+		{
+			// PQresultErrorField returned NULL!
+			pgsql->sqlstate[0] = '\0'; // Set to an empty string to avoid segfault
+		}
+		else
+		{
+			strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate));
+		}
 	}

 	char *endpoint =
(File diff suppressed because it is too large.)
@@ -32,6 +32,7 @@ reason = "the marvin attack only affects private key decryption, not public key"
 # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
 [licenses]
 allow = [
+  "0BSD",
   "Apache-2.0",
   "BSD-2-Clause",
   "BSD-3-Clause",
@@ -52,6 +52,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do

   if [ $pg_version -ge 16 ]; then
     docker cp ext-src $TEST_CONTAINER_NAME:/
+    docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl"
     # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
     # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
     echo Adding dummy config
docker-compose/ext-src/pgjwt-src/neon-test.sh (new executable file, 4 lines)

@@ -0,0 +1,4 @@
+#!/bin/bash
+set -ex
+cd "$(dirname "${0}")"
+pg_prove test.sql
docker-compose/ext-src/pgjwt-src/test-upgrade.patch (new file, 15 lines)

@@ -0,0 +1,15 @@
diff --git a/test.sql b/test.sql
index d7a0ca8..f15bc76 100644
--- a/test.sql
+++ b/test.sql
@@ -9,9 +9,7 @@
 \set ON_ERROR_STOP true
 \set QUIET 1

-CREATE EXTENSION pgcrypto;
-CREATE EXTENSION pgtap;
-CREATE EXTENSION pgjwt;
+CREATE EXTENSION IF NOT EXISTS pgtap;

 BEGIN;
 SELECT plan(23);
docker-compose/ext-src/pgjwt-src/test-upgrade.sh (new executable file, 5 lines)

@@ -0,0 +1,5 @@
+#!/bin/sh
+set -ex
+cd "$(dirname ${0})"
+patch -p1 <test-upgrade.patch
+pg_prove test.sql
@@ -24,7 +24,7 @@ function wait_for_ready {
 }
 function create_extensions() {
   for ext in ${1}; do
-    docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext}"
+    docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE"
   done
 }
 EXTENSIONS='[

@@ -40,7 +40,8 @@ EXTENSIONS='[
 {"extname": "pg_uuidv7", "extdir": "pg_uuidv7-src"},
 {"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"},
 {"extname": "semver", "extdir": "pg_semver-src"},
-{"extname": "pg_ivm", "extdir": "pg_ivm-src"}
+{"extname": "pg_ivm", "extdir": "pg_ivm-src"},
+{"extname": "pgjwt", "extdir": "pgjwt-src"}
 ]'
 EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
 TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d
@@ -204,14 +204,16 @@ impl RemoteExtSpec {

         // Check if extension is present in public or custom.
         // If not, then it is not allowed to be used by this compute.
-        if let Some(public_extensions) = &self.public_extensions {
-            if !public_extensions.contains(&real_ext_name.to_string()) {
-                if let Some(custom_extensions) = &self.custom_extensions {
-                    if !custom_extensions.contains(&real_ext_name.to_string()) {
-                        return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
-                    }
-                }
-            }
-        }
+        if !self
+            .public_extensions
+            .as_ref()
+            .is_some_and(|exts| exts.iter().any(|e| e == ext_name))
+            && !self
+                .custom_extensions
+                .as_ref()
+                .is_some_and(|exts| exts.iter().any(|e| e == ext_name))
+        {
+            return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
+        }

         match self.extension_data.get(real_ext_name) {

@@ -340,6 +342,96 @@ mod tests {
     use super::*;
     use std::fs::File;

+    #[test]
+    fn allow_installing_remote_extensions() {
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": null,
+            "custom_extensions": null,
+            "library_index": {},
+            "extension_data": {},
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect_err("Extension should not be found");
+
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": [],
+            "custom_extensions": null,
+            "library_index": {},
+            "extension_data": {},
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect_err("Extension should not be found");
+
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": [],
+            "custom_extensions": [],
+            "library_index": {
+                "ext": "ext"
+            },
+            "extension_data": {
+                "ext": {
+                    "control_data": {
+                        "ext.control": ""
+                    },
+                    "archive_path": ""
+                }
+            },
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect_err("Extension should not be found");
+
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": [],
+            "custom_extensions": ["ext"],
+            "library_index": {
+                "ext": "ext"
+            },
+            "extension_data": {
+                "ext": {
+                    "control_data": {
+                        "ext.control": ""
+                    },
+                    "archive_path": ""
+                }
+            },
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect("Extension should be found");
+
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": ["ext"],
+            "custom_extensions": [],
+            "library_index": {
+                "ext": "ext"
+            },
+            "extension_data": {
+                "ext": {
+                    "control_data": {
+                        "ext.control": ""
+                    },
+                    "archive_path": ""
+                }
+            },
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect("Extension should be found");
+    }
+
     #[test]
     fn parse_spec_file() {
         let file = File::open("tests/cluster_spec.json").unwrap();
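
The refactor above leans on Option::is_some_and, which collapses the old four-level if let nesting into one boolean expression: a None allow-list permits nothing, and a Some list is checked element-wise. A standalone sketch of the same pattern, with hypothetical names rather than the real compute types:

```rust
// Hypothetical sketch of an allow-list check built on Option::is_some_and.
fn is_allowed(
    public: &Option<Vec<String>>,
    custom: &Option<Vec<String>>,
    ext_name: &str,
) -> bool {
    // `None` behaves like an empty list: is_some_and returns false for None.
    public
        .as_ref()
        .is_some_and(|exts| exts.iter().any(|e| e == ext_name))
        || custom
            .as_ref()
            .is_some_and(|exts| exts.iter().any(|e| e == ext_name))
}

fn main() {
    assert!(!is_allowed(&None, &None, "pgjwt"));
    assert!(!is_allowed(&Some(vec![]), &None, "pgjwt"));
    assert!(is_allowed(&Some(vec!["pgjwt".into()]), &None, "pgjwt"));
}
```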
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};

 use crate::{
     DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT,
-    DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
+    DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
 };

 /// External backup storage configuration, enough for creating a client for that storage.

@@ -45,11 +45,11 @@ impl RemoteStorageKind {

 impl RemoteStorageConfig {
     /// Helper to fetch the configured concurrency limit.
-    pub fn concurrency_limit(&self) -> Option<usize> {
+    pub fn concurrency_limit(&self) -> usize {
         match &self.storage {
-            RemoteStorageKind::LocalFs { .. } => None,
-            RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
-            RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
+            RemoteStorageKind::LocalFs { .. } => DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT,
+            RemoteStorageKind::AwsS3(c) => c.concurrency_limit.into(),
+            RemoteStorageKind::AzureContainer(c) => c.concurrency_limit.into(),
         }
     }
 }

@@ -65,6 +65,12 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// Here, a limit of max 20k concurrent connections was noted.
 /// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
 pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
+/// Set this limit analogously to the S3 limit.
+///
+/// The local filesystem backend doesn't enforce a concurrency limit itself, but this also bounds
+/// the upload queue concurrency. Some tests create thousands of uploads, which slows down the
+/// quadratic scheduling of the upload queue, and there is no point spawning so many Tokio tasks.
+pub const DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT: usize = 100;
 /// No limits on the client side, which currently means 1000 for AWS S3.
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
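
This change makes concurrency_limit infallible: instead of returning Option<usize> and forcing every caller to pick a fallback, each backend now always reports a concrete limit, with LocalFs getting its own default. A simplified sketch of the before/after shape, using stand-in types rather than the actual remote_storage crate:

```rust
// Simplified sketch of the API change; the real crate has more variants and config.
const DEFAULT_LOCALFS_LIMIT: usize = 100;

enum Storage {
    LocalFs,
    S3 { concurrency_limit: usize },
}

impl Storage {
    // Old shape: LocalFs had no limit, so every caller needed a fallback.
    fn concurrency_limit_old(&self) -> Option<usize> {
        match self {
            Storage::LocalFs => None,
            Storage::S3 { concurrency_limit } => Some(*concurrency_limit),
        }
    }

    // New shape: every backend reports a concrete limit.
    fn concurrency_limit(&self) -> usize {
        match self {
            Storage::LocalFs => DEFAULT_LOCALFS_LIMIT,
            Storage::S3 { concurrency_limit } => *concurrency_limit,
        }
    }
}

fn main() {
    let s = Storage::LocalFs;
    assert_eq!(s.concurrency_limit_old(), None);
    assert_eq!(s.concurrency_limit(), DEFAULT_LOCALFS_LIMIT);
}
```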
@@ -38,12 +38,14 @@ impl Display for SafekeeperId {
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 #[serde(transparent)]
 pub struct MemberSet {
-    pub m: Vec<SafekeeperId>,
+    pub members: Vec<SafekeeperId>,
 }

 impl MemberSet {
     pub fn empty() -> Self {
-        MemberSet { m: Vec::new() }
+        MemberSet {
+            members: Vec::new(),
+        }
     }

     pub fn new(members: Vec<SafekeeperId>) -> anyhow::Result<Self> {

@@ -51,11 +53,11 @@ impl MemberSet {
         if hs.len() != members.len() {
             bail!("duplicate safekeeper id in the set {:?}", members);
         }
-        Ok(MemberSet { m: members })
+        Ok(MemberSet { members })
     }

     pub fn contains(&self, sk: &SafekeeperId) -> bool {
-        self.m.iter().any(|m| m.id == sk.id)
+        self.members.iter().any(|m| m.id == sk.id)
     }

     pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> {

@@ -65,7 +67,7 @@ impl MemberSet {
                 sk.id, self
             ));
         }
-        self.m.push(sk);
+        self.members.push(sk);
         Ok(())
     }
 }

@@ -73,7 +75,11 @@ impl MemberSet {
 impl Display for MemberSet {
     /// Display as a comma separated list of members.
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let sks_str = self.m.iter().map(|sk| sk.to_string()).collect::<Vec<_>>();
+        let sks_str = self
+            .members
+            .iter()
+            .map(|m| m.to_string())
+            .collect::<Vec<_>>();
         write!(f, "({})", sks_str.join(", "))
     }
 }
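
The diff above is a pure rename (m to members) to make the safekeeper membership-set field self-describing. A compilable sketch of the same structure under the rename, with stand-in types (SafekeeperId here is just a u64 wrapper, not the real crate type):

```rust
// Stand-in sketch; the real MemberSet lives in the safekeeper membership code.
use std::collections::HashSet;
use std::fmt::{self, Display};

#[derive(Debug, Clone, PartialEq)]
struct SafekeeperId(u64);

struct MemberSet {
    members: Vec<SafekeeperId>, // formerly the terse `m`
}

impl MemberSet {
    fn new(members: Vec<SafekeeperId>) -> Result<Self, String> {
        // Reject duplicate ids, mirroring the HashSet length check in the diff.
        let unique: HashSet<u64> = members.iter().map(|sk| sk.0).collect();
        if unique.len() != members.len() {
            return Err(format!("duplicate safekeeper id in the set {members:?}"));
        }
        Ok(MemberSet { members })
    }
}

impl Display for MemberSet {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let ids: Vec<String> = self.members.iter().map(|sk| sk.0.to_string()).collect();
        write!(f, "({})", ids.join(", "))
    }
}

fn main() {
    let set = MemberSet::new(vec![SafekeeperId(1), SafekeeperId(2)]).unwrap();
    assert_eq!(set.to_string(), "(1, 2)");
    assert!(MemberSet::new(vec![SafekeeperId(1), SafekeeperId(1)]).is_err());
}
```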
@@ -215,7 +215,6 @@ impl Wrapper {
            syncSafekeepers: config.sync_safekeepers,
            systemId: 0,
            pgTimeline: 1,
-           proto_version: 3,
            callback_data,
        };
        let c_config = Box::into_raw(Box::new(c_config));

@@ -277,7 +276,6 @@ mod tests {
    use core::panic;
    use std::{
        cell::Cell,
        ffi::CString,
        sync::{atomic::AtomicUsize, mpsc::sync_channel},
    };

@@ -498,64 +496,57 @@ mod tests {
        // Messages definitions are at walproposer.h
        // xxx: it would be better to extract them from safekeeper crate and
        // use serialization/deserialization here.
        let greeting_tag = (b'g').to_be_bytes();
        let tenant_id = CString::new(ttid.tenant_id.to_string())
            .unwrap()
            .into_bytes_with_nul();
        let timeline_id = CString::new(ttid.timeline_id.to_string())
            .unwrap()
            .into_bytes_with_nul();
        let mconf_gen = 0_u32.to_be_bytes();
        let mconf_members_len = 0_u32.to_be_bytes();
        let mconf_members_new_len = 0_u32.to_be_bytes();
        let pg_version: [u8; 4] = PG_VERSION_NUM.to_be_bytes();
        let system_id = 0_u64.to_be_bytes();
        let wal_seg_size = 16777216_u32.to_be_bytes();
        let greeting_tag = (b'g' as u64).to_ne_bytes();
        let proto_version = 2_u32.to_ne_bytes();
        let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
        let proposer_id = [0; 16];
        let system_id = 0_u64.to_ne_bytes();
        let tenant_id = ttid.tenant_id.as_arr();
        let timeline_id = ttid.timeline_id.as_arr();
        let pg_tli = 1_u32.to_ne_bytes();
        let wal_seg_size = 16777216_u32.to_ne_bytes();
        let proposer_greeting = [
            greeting_tag.as_slice(),
            proto_version.as_slice(),
            pg_version.as_slice(),
            proposer_id.as_slice(),
            system_id.as_slice(),
            tenant_id.as_slice(),
            timeline_id.as_slice(),
            mconf_gen.as_slice(),
            mconf_members_len.as_slice(),
            mconf_members_new_len.as_slice(),
            pg_version.as_slice(),
            system_id.as_slice(),
            pg_tli.as_slice(),
            wal_seg_size.as_slice(),
        ]
        .concat();

        let voting_tag = (b'v').to_be_bytes();
        let vote_request_term = 3_u64.to_be_bytes();
        let voting_tag = (b'v' as u64).to_ne_bytes();
        let vote_request_term = 3_u64.to_ne_bytes();
        let proposer_id = [0; 16];
        let vote_request = [
            voting_tag.as_slice(),
            mconf_gen.as_slice(),
            vote_request_term.as_slice(),
            proposer_id.as_slice(),
        ]
        .concat();

        let acceptor_greeting_term = 2_u64.to_be_bytes();
        let acceptor_greeting_node_id = 1_u64.to_be_bytes();
        let acceptor_greeting_term = 2_u64.to_ne_bytes();
        let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
        let acceptor_greeting = [
            greeting_tag.as_slice(),
            acceptor_greeting_node_id.as_slice(),
            mconf_gen.as_slice(),
            mconf_members_len.as_slice(),
            mconf_members_new_len.as_slice(),
            acceptor_greeting_term.as_slice(),
            acceptor_greeting_node_id.as_slice(),
        ]
        .concat();

        let vote_response_term = 3_u64.to_be_bytes();
        let vote_given = 1_u8.to_be_bytes();
        let flush_lsn = 0x539_u64.to_be_bytes();
        let truncate_lsn = 0x539_u64.to_be_bytes();
        let th_len = 1_u32.to_be_bytes();
        let th_term = 2_u64.to_be_bytes();
        let th_lsn = 0x539_u64.to_be_bytes();
        let vote_response_term = 3_u64.to_ne_bytes();
        let vote_given = 1_u64.to_ne_bytes();
        let flush_lsn = 0x539_u64.to_ne_bytes();
        let truncate_lsn = 0x539_u64.to_ne_bytes();
        let th_len = 1_u32.to_ne_bytes();
        let th_term = 2_u64.to_ne_bytes();
        let th_lsn = 0x539_u64.to_ne_bytes();
        let timeline_start_lsn = 0x539_u64.to_ne_bytes();
        let vote_response = [
            voting_tag.as_slice(),
            mconf_gen.as_slice(),
            vote_response_term.as_slice(),
            vote_given.as_slice(),
            flush_lsn.as_slice(),

@@ -563,6 +554,7 @@ mod tests {
            th_len.as_slice(),
            th_term.as_slice(),
            th_lsn.as_slice(),
+            timeline_start_lsn.as_slice(),
        ]
        .concat();
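
The large hunk above interleaves two encodings of the handshake messages: one built with to_be_bytes (big-endian), the other with to_ne_bytes (native byte order), the latter matching how the C walproposer lays out its structs in memory on the test host. The distinction in a standalone sketch:

```rust
// to_be_bytes always yields big-endian; to_ne_bytes follows the host CPU.
fn main() {
    let tag = b'g' as u64; // 0x67
    assert_eq!(tag.to_be_bytes(), [0, 0, 0, 0, 0, 0, 0, 0x67]);
    // On a little-endian host (x86_64, and aarch64 in common configurations)
    // the native-endian encoding is the reverse of big-endian:
    if cfg!(target_endian = "little") {
        assert_eq!(tag.to_ne_bytes(), [0x67, 0, 0, 0, 0, 0, 0, 0]);
    } else {
        assert_eq!(tag.to_ne_bytes(), tag.to_be_bytes());
    }
}
```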
@@ -32,6 +32,7 @@ use utils::id::TimelineId;

 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext};
+use crate::pgdatadir_mapping::DatadirModificationStats;
 use crate::task_mgr::TaskKind;
 use crate::tenant::layer_map::LayerMap;
 use crate::tenant::mgr::TenantSlot;

@@ -2378,11 +2379,40 @@ pub(crate) struct WalIngestMetrics {
     pub(crate) records_observed: IntCounter,
     pub(crate) records_committed: IntCounter,
     pub(crate) records_filtered: IntCounter,
+    pub(crate) values_committed_metadata_images: IntCounter,
+    pub(crate) values_committed_metadata_deltas: IntCounter,
+    pub(crate) values_committed_data_images: IntCounter,
+    pub(crate) values_committed_data_deltas: IntCounter,
     pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
+    pub(crate) clear_vm_bits_unknown: IntCounterVec,
 }

+impl WalIngestMetrics {
+    pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) {
+        if stats.metadata_images > 0 {
+            self.values_committed_metadata_images
+                .inc_by(stats.metadata_images);
+        }
+        if stats.metadata_deltas > 0 {
+            self.values_committed_metadata_deltas
+                .inc_by(stats.metadata_deltas);
+        }
+        if stats.data_images > 0 {
+            self.values_committed_data_images.inc_by(stats.data_images);
+        }
+        if stats.data_deltas > 0 {
+            self.values_committed_data_deltas.inc_by(stats.data_deltas);
+        }
+    }
+}
+
 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
+    let values_committed = register_int_counter_vec!(
+        "pageserver_wal_ingest_values_committed",
+        "Number of values committed to pageserver storage from WAL records",
+        &["class", "kind"],
+    )
+    .expect("failed to define a metric");
+
     WalIngestMetrics {
         bytes_received: register_int_counter!(
             "pageserver_wal_ingest_bytes_received",

@@ -2409,17 +2439,15 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
             "Number of WAL records filtered out due to sharding"
         )
         .expect("failed to define a metric"),
+        values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
+        values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
+        values_committed_data_images: values_committed.with_label_values(&["data", "image"]),
+        values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]),
         gap_blocks_zeroed_on_rel_extend: register_int_counter!(
             "pageserver_gap_blocks_zeroed_on_rel_extend",
             "Total number of zero gap blocks written on relation extends"
         )
         .expect("failed to define a metric"),
+        clear_vm_bits_unknown: register_int_counter_vec!(
+            "pageserver_wal_ingest_clear_vm_bits_unknown",
+            "Number of ignored ClearVmBits operations due to unknown pages/relations",
+            &["entity"],
+        )
+        .expect("failed to define a metric"),
     }
 });
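
The pattern here is to register one labeled counter vector and then resolve each label combination once, up front, so the hot ingest path increments plain IntCounters without any label lookup. A minimal sketch using the prometheus crate directly (the pageserver goes through its own metrics wrappers, so names and macros differ):

```rust
// Sketch with the `prometheus` crate; label combinations are resolved once.
use prometheus::{IntCounter, IntCounterVec, Opts};

struct ValuesCommitted {
    metadata_images: IntCounter,
    data_deltas: IntCounter,
}

fn main() {
    let vec = IntCounterVec::new(
        Opts::new(
            "wal_ingest_values_committed",
            "Values committed from WAL records",
        ),
        &["class", "kind"],
    )
    .expect("failed to define a metric");

    // Pre-resolve the label values; incrementing these is label-lookup free.
    let m = ValuesCommitted {
        metadata_images: vec.with_label_values(&["metadata", "image"]),
        data_deltas: vec.with_label_values(&["data", "delta"]),
    };

    m.metadata_images.inc_by(3);
    m.data_deltas.inc();
    assert_eq!(m.metadata_images.get(), 3);
}
```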
@@ -1280,8 +1280,6 @@ impl PageServerHandler {
                 }
                 Ok(())
             }
             // and log the info! line inside the request span
             .instrument(span.clone())
             .await?;
         }
         Ok(())

@@ -2037,6 +2035,12 @@ impl PageServerHandler {
             .get(tenant_id, timeline_id, ShardSelector::Zero)
             .await?;

+        if timeline.is_archived() == Some(true) {
+            // TODO after a grace period, turn this log line into a hard error
+            tracing::warn!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it.");
+            //return Err(QueryError::NotFound("timeline is archived".into()))
+        }
+
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         if let Some(lsn) = lsn {
             // Backup was requested at a particular LSN. Wait for it to arrive.
@@ -48,7 +48,7 @@ use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::pausable_failpoint;
 use utils::{bin_ser::BeSer, lsn::Lsn};
-use wal_decoder::serialized_batch::SerializedValueBatch;
+use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};

 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
 pub const MAX_AUX_FILE_DELTAS: usize = 1024;

@@ -1297,6 +1297,26 @@ impl DatadirModification<'_> {
             .is_some_and(|b| b.has_data())
     }

+    /// Returns statistics about the currently pending modifications.
+    pub(crate) fn stats(&self) -> DatadirModificationStats {
+        let mut stats = DatadirModificationStats::default();
+        for (_, _, value) in self.pending_metadata_pages.values().flatten() {
+            match value {
+                Value::Image(_) => stats.metadata_images += 1,
+                Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1,
+                Value::WalRecord(_) => stats.metadata_deltas += 1,
+            }
+        }
+        for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) {
+            match valuemeta {
+                ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1,
+                ValueMeta::Serialized(_) => stats.data_deltas += 1,
+                ValueMeta::Observed(_) => {}
+            }
+        }
+        stats
+    }
+
     /// Set the current lsn
     pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
         ensure!(

@@ -2317,6 +2337,15 @@ impl DatadirModification<'_> {
     }
 }

+/// Statistics for a DatadirModification.
+#[derive(Default)]
+pub struct DatadirModificationStats {
+    pub metadata_images: u64,
+    pub metadata_deltas: u64,
+    pub data_images: u64,
+    pub data_deltas: u64,
+}
+
 /// This struct facilitates accessing either a committed key from the timeline at a
 /// specific LSN, or the latest uncommitted key from a pending modification.
 ///
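
The classification rule in stats() is worth spelling out: anything that can initialize a page from scratch (a full image, or a WAL record whose will_init() is true) counts as an image; everything else is a delta that must be applied against an older page version. A reduced sketch of that rule, with stand-in types:

```rust
// Reduced sketch of the image-vs-delta classification used by stats().
enum Value {
    Image(Vec<u8>),
    WalRecord { will_init: bool },
}

fn is_image(v: &Value) -> bool {
    match v {
        Value::Image(_) => true,
        // A record that reconstructs the page without a predecessor also counts.
        Value::WalRecord { will_init } => *will_init,
    }
}

fn main() {
    assert!(is_image(&Value::Image(vec![0u8; 8192])));
    assert!(is_image(&Value::WalRecord { will_init: true }));
    assert!(!is_image(&Value::WalRecord { will_init: false }));
}
```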
@@ -437,8 +437,7 @@ impl RemoteTimelineClient {
             .conf
             .remote_storage_config
             .as_ref()
-            .and_then(|r| r.concurrency_limit())
-            .unwrap_or(0);
+            .map_or(0, |r| r.concurrency_limit());
         let mut upload_queue = self.upload_queue.lock().unwrap();
         upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
         self.update_remote_physical_size_gauge(Some(index_part));

@@ -461,8 +460,7 @@ impl RemoteTimelineClient {
             .conf
             .remote_storage_config
             .as_ref()
-            .and_then(|r| r.concurrency_limit())
-            .unwrap_or(0);
+            .map_or(0, |r| r.concurrency_limit());
         let mut upload_queue = self.upload_queue.lock().unwrap();
         upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
         self.update_remote_physical_size_gauge(None);

@@ -484,8 +482,7 @@ impl RemoteTimelineClient {
             .conf
             .remote_storage_config
             .as_ref()
-            .and_then(|r| r.concurrency_limit())
-            .unwrap_or(0);
+            .map_or(0, |r| r.concurrency_limit());

         let mut upload_queue = self.upload_queue.lock().unwrap();
         upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
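
Since concurrency_limit() now returns a plain usize, these callers collapse Option::and_then plus unwrap_or into a single map_or; only the outer Option (is remote storage configured at all?) remains, with 0 still meaning "no limit". The equivalence in isolation:

```rust
// The two caller shapes are equivalent once the inner Option is gone.
struct Cfg;

impl Cfg {
    fn concurrency_limit(&self) -> usize {
        100
    }
}

fn main() {
    let some_cfg: Option<Cfg> = Some(Cfg);
    let no_cfg: Option<Cfg> = None;

    // New single-step form: default to 0 when remote storage is not configured.
    assert_eq!(some_cfg.as_ref().map_or(0, |c| c.concurrency_limit()), 100);
    assert_eq!(no_cfg.as_ref().map_or(0, |c| c.concurrency_limit()), 0);
}
```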
@@ -9,13 +9,14 @@ use crate::{
     metrics::SECONDARY_MODE,
     tenant::{
         config::AttachmentMode,
-        mgr::GetTenantError,
-        mgr::TenantManager,
+        mgr::{GetTenantError, TenantManager},
         remote_timeline_client::remote_heatmap_path,
         span::debug_assert_current_span_has_tenant_id,
         tasks::{warn_when_period_overrun, BackgroundLoopKind},
         Tenant,
     },
+    virtual_file::VirtualFile,
+    TEMP_FILE_SUFFIX,
 };

 use futures::Future;

@@ -32,7 +33,10 @@ use super::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, Instrument};
-use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
+use utils::{
+    backoff, completion::Barrier, crashsafe::path_with_suffix_extension,
+    yielding_loop::yielding_loop,
+};

 pub(super) async fn heatmap_uploader_task(
     tenant_manager: Arc<TenantManager>,

@@ -461,6 +465,18 @@ async fn upload_tenant_heatmap(
         }
     }

+    // After a successful upload persist the fresh heatmap to disk.
+    // When restarting, the tenant will read the heatmap from disk
+    // and additively generate a new heatmap (see [`Timeline::generate_heatmap`]).
+    // If the heatmap is stale, the additive generation can lead to keeping previously
+    // evicted timelines on the secondary's disk.
+    let tenant_shard_id = tenant.get_tenant_shard_id();
+    let heatmap_path = tenant.conf.tenant_heatmap_path(tenant_shard_id);
+    let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
+    if let Err(err) = VirtualFile::crashsafe_overwrite(heatmap_path, temp_path, bytes).await {
+        tracing::warn!("Non fatal IO error writing to disk after heatmap upload: {err}");
+    }
+
     tracing::info!("Successfully uploaded {size} byte heatmap to {path}");

     Ok(UploadHeatmapOutcome::Uploaded(LastUploadState {
@@ -211,7 +211,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 error_run_count = 0;
                 // schedule the next compaction immediately in case there is a pending compaction task
                 sleep_duration = if let CompactionOutcome::Pending = outcome {
-                    Duration::ZERO
+                    Duration::from_secs(1)
                 } else {
                     period
                 };
@@ -192,7 +192,12 @@ pub enum ImageLayerCreationMode {

 #[derive(Clone, Debug, Default)]
 pub enum LastImageLayerCreationStatus {
-    Incomplete, // TODO: record the last key being processed
+    Incomplete {
+        /// The last key of the partition (exclusive) that was processed in the last
+        /// image layer creation attempt. We will continue from this key in the next
+        /// attempt.
+        last_key: Key,
+    },
     Complete,
     #[default]
     Initial,

@@ -4346,7 +4351,7 @@ impl Timeline {
         Ok(result)
     }

-    // Is it time to create a new image layer for the given partition?
+    // Is it time to create a new image layer for the given partition? True if we want to generate.
     async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
         let threshold = self.get_image_creation_threshold();

@@ -4658,6 +4663,11 @@ impl Timeline {
     ) -> Result<(Vec<ResidentLayer>, LastImageLayerCreationStatus), CreateImageLayersError> {
         let timer = self.metrics.create_images_time_histo.start_timer();

+        if partitioning.parts.is_empty() {
+            warn!("no partitions to create image layers for");
+            return Ok((vec![], LastImageLayerCreationStatus::Complete));
+        }
+
         // We need to avoid holes between generated image layers.
         // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one
         // image layer with hole between them. In this case such layer can not be utilized by GC.

@@ -4669,28 +4679,65 @@ impl Timeline {
         // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it.
         let mut start = Key::MIN;

-        let check_for_image_layers = if let LastImageLayerCreationStatus::Incomplete = last_status {
-            info!(
-                "resuming image layer creation: last_status={:?}",
-                last_status
-            );
-            true
-        } else {
-            self.should_check_if_image_layers_required(lsn)
-        };
+        let check_for_image_layers =
+            if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status {
+                info!(
+                    "resuming image layer creation: last_status=incomplete, continue from {}",
+                    last_key
+                );
+                true
+            } else {
+                self.should_check_if_image_layers_required(lsn)
+            };

         let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?;

         let mut all_generated = true;

         let mut partition_processed = 0;
-        let total_partitions = partitioning.parts.len();
+        let mut total_partitions = partitioning.parts.len();
         let mut last_partition_processed = None;
+        let mut partition_parts = partitioning.parts.clone();

-        for partition in partitioning.parts.iter() {
+        if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status {
+            // We need to skip the partitions that have already been processed.
+            let mut found = false;
+            for (i, partition) in partition_parts.iter().enumerate() {
+                if last_key <= partition.end().unwrap() {
+                    // ```plain
+                    // |------|--------|----------|------|
+                    //           ^last_key
+                    //        ^start from this partition
+                    // ```
+                    // Why `i+1` instead of `i`?
+                    // It is possible that the user did some writes after the previous image layer creation attempt so that
+                    // a relation grows in size, and the last_key is now in the middle of the partition. In this case, we
+                    // still want to skip this partition, so that we can make progress and avoid generating image layers over
+                    // the same partition. Doing a mod to ensure we don't end up with an empty vec.
+                    if i + 1 >= total_partitions {
+                        // In general, this case should not happen -- if last_key is on the last partition, the previous
+                        // iteration of image layer creation should return a complete status.
+                        break; // with found=false
+                    }
+                    partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements
+                    total_partitions = partition_parts.len();
+                    // Update the start key to the partition start.
+                    start = partition_parts[0].start().unwrap();
+                    found = true;
+                    break;
+                }
+            }
+            if !found {
+                // Last key is within the last partition, or larger than all partitions.
+                return Ok((vec![], LastImageLayerCreationStatus::Complete));
+            }
+        }
+
+        for partition in partition_parts.iter() {
             if self.cancel.is_cancelled() {
                 return Err(CreateImageLayersError::Cancelled);
             }

+            partition_processed += 1;
             let img_range = start..partition.ranges.last().unwrap().end;
             let compact_metadata = partition.overlaps(&Key::metadata_key_range());
             if compact_metadata {

@@ -4725,6 +4772,8 @@ impl Timeline {
                 lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
                 is_delta: false,
             }) {
+                // TODO: this can be processed with the BatchLayerWriter::finish_with_discard
+                // in the future.
                 tracing::info!(
                     "Skipping image layer at {lsn} {}..{}, already exists",
                     img_range.start,

@@ -4805,8 +4854,6 @@ impl Timeline {
                 }
             }

-            partition_processed += 1;
-
             if let ImageLayerCreationMode::Try = mode {
                 // We have at least made some progress
                 if batch_image_writer.pending_layer_num() >= 1 {

@@ -4822,8 +4869,10 @@ impl Timeline {
                         * self.get_compaction_threshold();
                     if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold {
                         tracing::info!(
-                            "preempt image layer generation at {start} at {lsn}: too many L0 layers {num_of_l0_layers}",
+                            "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}",
+                            partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers
                         );
+                        last_partition_processed = Some(partition.clone());
                         all_generated = false;
                         break;
                     }

@@ -4868,7 +4917,14 @@ impl Timeline {
             if all_generated {
                 LastImageLayerCreationStatus::Complete
             } else {
-                LastImageLayerCreationStatus::Incomplete
+                LastImageLayerCreationStatus::Incomplete {
+                    last_key: if let Some(last_partition_processed) = last_partition_processed {
+                        last_partition_processed.end().unwrap_or(Key::MIN)
+                    } else {
+                        // This branch should be unreachable, but in case it happens, we can just return the start key.
+                        Key::MIN
+                    },
+                }
             },
         ))
     }
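
The skip-ahead logic above re-derives, from last_key alone, which partitions a resumed image-layer pass may ignore: find the first partition whose end is at or past last_key, then resume from the one after it, so that a partition which merely grew since the previous attempt is not re-imaged. A self-contained sketch of that selection rule, using plain integer ranges instead of pageserver keys:

```rust
// Sketch: choose where to resume, given the last processed key.
// Returns the suffix of partitions still to process, or None if done.
fn resume_suffix(parts: &[std::ops::Range<u64>], last_key: u64) -> Option<&[std::ops::Range<u64>]> {
    for (i, p) in parts.iter().enumerate() {
        if last_key <= p.end {
            // Skip partition i itself (it may have grown past last_key);
            // resume from i + 1 so we always make forward progress.
            return if i + 1 < parts.len() {
                Some(&parts[i + 1..])
            } else {
                None // last_key fell in the final partition: nothing left
            };
        }
    }
    None // last_key is beyond every partition
}

fn main() {
    let parts = [0..10, 10..20, 20..30];
    assert_eq!(resume_suffix(&parts, 5), Some(&parts[1..]));
    assert_eq!(resume_suffix(&parts, 15), Some(&parts[2..]));
    assert_eq!(resume_suffix(&parts, 25), None);
}
```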
@@ -33,6 +33,7 @@ use crate::page_cache;
 use crate::statvfs::Statvfs;
 use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::gc_block::GcBlock;
+use crate::tenant::layer_map::LayerMap;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::batch_split_writer::{
     BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,

@@ -438,6 +439,11 @@ impl KeyHistoryRetention {
         if dry_run {
             return true;
         }
+        if LayerMap::is_l0(&key.key_range, key.is_delta) {
+            // gc-compaction should not produce L0 deltas, otherwise it will break the layer order.
+            // We should ignore such layers.
+            return true;
+        }
         let layer_generation;
         {
             let guard = tline.layers.read().await;

@@ -748,7 +754,7 @@ impl Timeline {
             .store(Arc::new(outcome.clone()));

         self.upload_new_image_layers(image_layers)?;
-        if let LastImageLayerCreationStatus::Incomplete = outcome {
+        if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
             // Yield and do not do any other kind of compaction.
             info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
             return Ok(CompactionOutcome::Pending);
@@ -355,6 +355,19 @@ pub(super) async fn handle_walreceiver_connection(
     // advances it to its end LSN. 0 is just an initialization placeholder.
     let mut modification = timeline.begin_modification(Lsn(0));

+    async fn commit(
+        modification: &mut DatadirModification<'_>,
+        ctx: &RequestContext,
+        uncommitted: &mut u64,
+    ) -> anyhow::Result<()> {
+        let stats = modification.stats();
+        modification.commit(ctx).await?;
+        WAL_INGEST.records_committed.inc_by(*uncommitted);
+        WAL_INGEST.inc_values_committed(&stats);
+        *uncommitted = 0;
+        Ok(())
+    }
+
     if !records.is_empty() {
         timeline
             .metrics

@@ -366,8 +379,7 @@ pub(super) async fn handle_walreceiver_connection(
         if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
             && uncommitted_records > 0
         {
-            modification.commit(&ctx).await?;
-            uncommitted_records = 0;
+            commit(&mut modification, &ctx, &mut uncommitted_records).await?;
         }

         let local_next_record_lsn = interpreted.next_record_lsn;

@@ -396,8 +408,7 @@ pub(super) async fn handle_walreceiver_connection(
             || modification.approx_pending_bytes()
                 > DatadirModification::MAX_PENDING_BYTES
         {
-            modification.commit(&ctx).await?;
-            uncommitted_records = 0;
+            commit(&mut modification, &ctx, &mut uncommitted_records).await?;
         }
     }

@@ -415,7 +426,7 @@ pub(super) async fn handle_walreceiver_connection(

     if uncommitted_records > 0 || needs_last_record_lsn_advance {
         // Commit any uncommitted records
-        modification.commit(&ctx).await?;
+        commit(&mut modification, &ctx, &mut uncommitted_records).await?;
     }

     if !caught_up && streaming_lsn >= end_of_wal {

@@ -442,10 +453,12 @@ pub(super) async fn handle_walreceiver_connection(
         filtered: &mut u64,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        modification.commit(ctx).await?;
+        let stats = modification.stats();
         WAL_INGEST
             .records_committed
             .inc_by(*uncommitted - *filtered);
+        modification.commit(ctx).await?;
+        WAL_INGEST.inc_values_committed(&stats);
         *uncommitted = 0;
         *filtered = 0;
         Ok(())
@@ -28,17 +28,9 @@ use std::time::Duration;
 use std::time::Instant;
 use std::time::SystemTime;

-use pageserver_api::shard::ShardIdentity;
-use postgres_ffi::fsm_logical_to_physical;
-use postgres_ffi::walrecord::*;
-use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
-use wal_decoder::models::*;
-
 use anyhow::{bail, Result};
 use bytes::{Buf, Bytes};
 use tracing::*;
-use utils::failpoint_support;
-use utils::rate_limit::RateLimit;

 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;

@@ -50,11 +42,18 @@ use crate::ZERO_PAGE;
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
+use pageserver_api::shard::ShardIdentity;
+use postgres_ffi::fsm_logical_to_physical;
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
+use postgres_ffi::walrecord::*;
 use postgres_ffi::TransactionId;
+use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
 use utils::bin_ser::SerializeError;
 use utils::lsn::Lsn;
+use utils::rate_limit::RateLimit;
+use utils::{critical, failpoint_support};
+use wal_decoder::models::*;

 enum_pgversion! {CheckPoint, pgv::CheckPoint}

@@ -327,93 +326,75 @@ impl WalIngest {
        let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
        let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);

        // Sometimes, Postgres seems to create heap WAL records with the
        // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is
        // not set. In fact, it's possible that the VM page does not exist at all.
        // In that case, we don't want to store a record to clear the VM bit;
        // replaying it would fail to find the previous image of the page, because
        // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
        // record if it doesn't.
        //
        // TODO: analyze the metrics and tighten this up accordingly. This logic
        // implicitly assumes that VM pages see explicit WAL writes before
        // implicit ClearVmBits, and will otherwise silently drop updates.
        // VM bits can only be cleared on the shard(s) owning the VM relation, and must be within
        // its view of the VM relation size. Out of caution, error instead of failing WAL ingestion,
        // as there has historically been cases where PostgreSQL has cleared spurious VM pages. See:
        // https://github.com/neondatabase/neon/pull/10634.
        let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
            WAL_INGEST
                .clear_vm_bits_unknown
                .with_label_values(&["relation"])
                .inc();
            critical!("clear_vm_bits for unknown VM relation {vm_rel}");
            return Ok(());
        };
        if let Some(blknum) = new_vm_blk {
            if blknum >= vm_size {
                WAL_INGEST
                    .clear_vm_bits_unknown
                    .with_label_values(&["new_page"])
                    .inc();
                critical!("new_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
                new_vm_blk = None;
            }
        }
        if let Some(blknum) = old_vm_blk {
            if blknum >= vm_size {
                WAL_INGEST
                    .clear_vm_bits_unknown
                    .with_label_values(&["old_page"])
                    .inc();
                critical!("old_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
                old_vm_blk = None;
            }
        }

        if new_vm_blk.is_some() || old_vm_blk.is_some() {
            if new_vm_blk == old_vm_blk {
                // An UPDATE record that needs to clear the bits for both old and the
                // new page, both of which reside on the same VM page.
        if new_vm_blk.is_none() && old_vm_blk.is_none() {
            return Ok(());
        } else if new_vm_blk == old_vm_blk {
            // An UPDATE record that needs to clear the bits for both old and the new page, both of
            // which reside on the same VM page.
            self.put_rel_wal_record(
                modification,
                vm_rel,
                new_vm_blk.unwrap(),
                NeonWalRecord::ClearVisibilityMapFlags {
                    new_heap_blkno,
                    old_heap_blkno,
                    flags,
                },
                ctx,
            )
            .await?;
        } else {
            // Clear VM bits for one heap page, or for two pages that reside on different VM pages.
            if let Some(new_vm_blk) = new_vm_blk {
                self.put_rel_wal_record(
                    modification,
                    vm_rel,
                    new_vm_blk.unwrap(),
                    new_vm_blk,
                    NeonWalRecord::ClearVisibilityMapFlags {
                        new_heap_blkno,
                        old_heap_blkno: None,
                        flags,
                    },
                    ctx,
                )
                .await?;
            }
            if let Some(old_vm_blk) = old_vm_blk {
                self.put_rel_wal_record(
                    modification,
                    vm_rel,
                    old_vm_blk,
                    NeonWalRecord::ClearVisibilityMapFlags {
                        new_heap_blkno: None,
                        old_heap_blkno,
                        flags,
                    },
                    ctx,
                )
                .await?;
            } else {
                // Clear VM bits for one heap page, or for two pages that reside on
                // different VM pages.
                if let Some(new_vm_blk) = new_vm_blk {
                    self.put_rel_wal_record(
                        modification,
                        vm_rel,
                        new_vm_blk,
                        NeonWalRecord::ClearVisibilityMapFlags {
                            new_heap_blkno,
                            old_heap_blkno: None,
                            flags,
                        },
                        ctx,
                    )
                    .await?;
                }
                if let Some(old_vm_blk) = old_vm_blk {
                    self.put_rel_wal_record(
                        modification,
                        vm_rel,
                        old_vm_blk,
                        NeonWalRecord::ClearVisibilityMapFlags {
                            new_heap_blkno: None,
                            old_heap_blkno,
                            flags,
                        },
                        ctx,
                    )
                    .await?;
                }
            }
        }

        Ok(())
    }
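
Whatever the exact old/new split in the interleaved hunk above, the guard it implements is: resolve the VM relation's size first, and drop (with a metric bump and a critical-level log) any ClearVmBits reference that falls outside it, rather than failing WAL ingestion. The clamping rule in isolation, as a hypothetical sketch:

```rust
// Sketch of the clamp: out-of-range VM block references are dropped, not fatal.
fn clamp_vm_blk(blk: Option<u32>, vm_size: u32) -> Option<u32> {
    match blk {
        Some(b) if b >= vm_size => {
            // The real code increments clear_vm_bits_unknown and logs via critical!().
            eprintln!("vm blk {b} not in relation of size {vm_size}");
            None
        }
        other => other,
    }
}

fn main() {
    assert_eq!(clamp_vm_blk(Some(3), 10), Some(3));
    assert_eq!(clamp_vm_blk(Some(12), 10), None); // dropped, ingestion continues
    assert_eq!(clamp_vm_blk(None, 10), None);
}
```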
@@ -509,47 +509,44 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);

-	tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1);
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 	hash = get_hash_value(lfc_hash, &tag);
-	chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1);
+	chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);

 	LWLockAcquire(lfc_lock, LW_SHARED);

 	if (!LFC_ENABLED())
 	{
 		LWLockRelease(lfc_lock);
 		return 0;
 	}
 	while (true)
 	{
 		int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
 		if (LFC_ENABLED())
 		{
 			entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
 		int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs);
 		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);

 			if (entry != NULL)
 		if (entry != NULL)
 		{
 			for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
 			{
 				for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
 				if ((entry->bitmap[chunk_offs >> 5] &
 					 ((uint32)1 << (chunk_offs & 31))) != 0)
 				{
 					if ((entry->bitmap[chunk_offs >> 5] &
 						((uint32)1 << (chunk_offs & 31))) != 0)
 					{
 						BITMAP_SET(bitmap, i);
 						found++;
 					}
 					BITMAP_SET(bitmap, i);
 					found++;
 				}
 			}
 			else
 			{
 				i += this_chunk;
 			}
 		}
 		else
 		{
 			LWLockRelease(lfc_lock);
 			return found;
 			i += this_chunk;
 		}

 		/*
 		 * Break out of the iteration before doing expensive stuff for
 		 * a next iteration
 		 */
-		if (i + 1 >= nblocks)
+		if (i >= nblocks)
 			break;

 		/*

@@ -563,8 +560,8 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	LWLockRelease(lfc_lock);

-#if USE_ASSERT_CHECKING
-	do {
+#ifdef USE_ASSERT_CHECKING
+	{
 		int count = 0;

 		for (int j = 0; j < nblocks; j++)

@@ -574,7 +571,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		}

 		Assert(count == found);
-	} while (false);
+	}
 #endif

 	return found;
@@ -36,6 +36,11 @@
#include "pagestore_client.h"
#include "walproposer.h"

#ifdef __linux__
#include <sys/ioctl.h>
#include <linux/sockios.h>
#endif

#define PageStoreTrace DEBUG5

#define MIN_RECONNECT_INTERVAL_USEC 1000
@@ -728,11 +733,36 @@ retry:
INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS)
{
int sndbuf = -1;
int recvbuf = -1;
#ifdef __linux__
int socketfd;
#endif

since_start = now;
INSTR_TIME_SUBTRACT(since_start, start_ts);
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",

#ifdef __linux__
/*
* get kernel's send and recv queue size via ioctl
* https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27
*/
socketfd = PQsocket(pageserver_conn);
if (socketfd != -1) {
int ioctl_err;
ioctl_err = ioctl(socketfd, SIOCOUTQ, &sndbuf);
if (ioctl_err != 0) {
sndbuf = -errno;
}
ioctl_err = ioctl(socketfd, FIONREAD, &recvbuf);
if (ioctl_err != 0) {
recvbuf = -errno;
}
}
#endif
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)",
INSTR_TIME_GET_DOUBLE(since_start),
shard->nrequests_sent, shard->nresponses_received);
shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf);
last_log_ts = now;
logged = true;
}

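The queue-depth probe added above can be exercised on its own; a minimal standalone sketch (assumes a connected TCP socket descriptor, not the extension's pageserver connection):

#include <errno.h>
#include <stdio.h>
#ifdef __linux__
#include <sys/ioctl.h>
#include <linux/sockios.h>
#endif

/* Report kernel send/recv queue depths for socket `fd`; negative values
 * carry -errno, following the convention used in the patch above. */
static void
report_socket_queues(int fd)
{
	int			sndbuf = -1;
	int			recvbuf = -1;

#ifdef __linux__
	if (ioctl(fd, SIOCOUTQ, &sndbuf) != 0)	/* bytes sent but not yet acked */
		sndbuf = -errno;
	if (ioctl(fd, FIONREAD, &recvbuf) != 0) /* bytes waiting to be read */
		recvbuf = -errno;
#endif
	fprintf(stderr, "socket sndbuf=%d recvbuf=%d\n", sndbuf, recvbuf);
}
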
@@ -51,26 +51,6 @@ HexDecodeString(uint8 *result, char *input, int nbytes)
return true;
}

/* --------------------------------
* pq_getmsgint16 - get a binary 2-byte int from a message buffer
* --------------------------------
*/
uint16
pq_getmsgint16(StringInfo msg)
{
return pq_getmsgint(msg, 2);
}

/* --------------------------------
* pq_getmsgint32 - get a binary 4-byte int from a message buffer
* --------------------------------
*/
uint32
pq_getmsgint32(StringInfo msg)
{
return pq_getmsgint(msg, 4);
}

/* --------------------------------
* pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order
* --------------------------------

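With the pq_getmsgint16/pq_getmsgint32 wrappers gone, call sites can use Postgres' pq_getmsgint directly; a hypothetical example (variable names invented for illustration):

/* pq_getmsgint's second argument is the field width in bytes. */
uint16		port = (uint16) pq_getmsgint(msg, 2);	/* was pq_getmsgint16(msg) */
uint32		value_len = pq_getmsgint(msg, 4);		/* was pq_getmsgint32(msg) */
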
@@ -8,8 +8,6 @@
#endif

bool HexDecodeString(uint8 *result, char *input, int nbytes);
uint16 pq_getmsgint16(StringInfo msg);
uint32 pq_getmsgint32(StringInfo msg);
uint32 pq_getmsgint32_le(StringInfo msg);
uint64 pq_getmsgint64_le(StringInfo msg);
void pq_sendint32_le(StringInfo buf, uint32 i);

@@ -916,7 +916,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
{
uint64 min_ring_index;
PrefetchRequest hashkey;
#if USE_ASSERT_CHECKING
#ifdef USE_ASSERT_CHECKING
bool any_hits = false;
#endif
/* We will never read further ahead than our buffer can store. */
@@ -955,7 +955,7 @@ Retry:
else
lsns = NULL;

#if USE_ASSERT_CHECKING
#ifdef USE_ASSERT_CHECKING
any_hits = true;
#endif


@@ -70,7 +70,6 @@ static bool SendAppendRequests(Safekeeper *sk);
static bool RecvAppendResponses(Safekeeper *sk);
static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp);
static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp);
static void PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version);
static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk);
static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size);
static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg);
@@ -82,8 +81,6 @@ static char *FormatSafekeeperState(Safekeeper *sk);
static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
static char *FormatEvents(WalProposer *wp, uint32 events);
static void UpdateDonorShmem(WalProposer *wp);
static char *MembershipConfigurationToString(MembershipConfiguration *mconf);
static void MembershipConfigurationFree(MembershipConfiguration *mconf);

WalProposer *
WalProposerCreate(WalProposerConfig *config, walproposer_api api)
@@ -140,21 +137,25 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
}
wp->quorum = wp->n_safekeepers / 2 + 1;

if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);

/* Fill the greeting package */
wp->greetRequest.pam.tag = 'g';
if (!wp->config->neon_tenant)
wp_log(FATAL, "neon.tenant_id is not provided");
wp->greetRequest.tenant_id = wp->config->neon_tenant;
wp->greetRequest.tag = 'g';
wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION;
wp->greetRequest.pgVersion = PG_VERSION_NUM;
wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
wp->greetRequest.systemId = wp->config->systemId;
if (!wp->config->neon_timeline)
wp_log(FATAL, "neon.timeline_id is not provided");
wp->greetRequest.timeline_id = wp->config->neon_timeline;
wp->greetRequest.pg_version = PG_VERSION_NUM;
wp->greetRequest.system_id = wp->config->systemId;
wp->greetRequest.wal_seg_size = wp->config->wal_segment_size;
if (*wp->config->neon_timeline != '\0' &&
!HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16))
wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline);
if (!wp->config->neon_tenant)
wp_log(FATAL, "neon.tenant_id is not provided");
if (*wp->config->neon_tenant != '\0' &&
!HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16))
wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant);

wp->greetRequest.timeline = wp->config->pgTimeline;
wp->greetRequest.walSegSize = wp->config->wal_segment_size;

wp->api.init_event_set(wp);

@@ -164,14 +165,12 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
void
WalProposerFree(WalProposer *wp)
{
MembershipConfigurationFree(&wp->mconf);
for (int i = 0; i < wp->n_safekeepers; i++)
{
Safekeeper *sk = &wp->safekeeper[i];

Assert(sk->outbuf.data != NULL);
pfree(sk->outbuf.data);
MembershipConfigurationFree(&sk->greetResponse.mconf);
if (sk->voteResponse.termHistory.entries)
pfree(sk->voteResponse.termHistory.entries);
sk->voteResponse.termHistory.entries = NULL;
@@ -309,7 +308,6 @@ ShutdownConnection(Safekeeper *sk)
sk->state = SS_OFFLINE;
sk->streamingAt = InvalidXLogRecPtr;

MembershipConfigurationFree(&sk->greetResponse.mconf);
if (sk->voteResponse.termHistory.entries)
pfree(sk->voteResponse.termHistory.entries);
sk->voteResponse.termHistory.entries = NULL;
@@ -600,14 +598,11 @@ static void
SendStartWALPush(Safekeeper *sk)
{
WalProposer *wp = sk->wp;
#define CMD_LEN 512
char cmd[CMD_LEN];

snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version);
if (!wp->api.conn_send_query(sk, cmd))
if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
{
wp_log(WARNING, "failed to send %s query to safekeeper %s:%s: %s",
cmd, sk->host, sk->port, wp->api.conn_error_message(sk));
wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return;
}
@@ -663,33 +658,23 @@ RecvStartWALPushResult(Safekeeper *sk)

/*
* Start handshake: first of all send information about the
* walproposer. After sending, we wait on SS_HANDSHAKE_RECV for
* safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for
* a response to finish the handshake.
*/
static void
SendProposerGreeting(Safekeeper *sk)
{
WalProposer *wp = sk->wp;
char *mconf_toml = MembershipConfigurationToString(&wp->greetRequest.mconf);

wp_log(LOG, "sending ProposerGreeting to safekeeper %s:%s with mconf = %s", sk->host, sk->port, mconf_toml);
pfree(mconf_toml);

PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->greetRequest,
&sk->outbuf, wp->config->proto_version);

/*
* On failure, logging & resetting the connection is handled. We just need
* to handle the control flow.
*/
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
BlockingWrite(sk, &sk->wp->greetRequest, sizeof(sk->wp->greetRequest), SS_HANDSHAKE_RECV);
}

static void
RecvAcceptorGreeting(Safekeeper *sk)
{
WalProposer *wp = sk->wp;
char *mconf_toml;

/*
* If our reading doesn't immediately succeed, any necessary error
@@ -700,10 +685,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
return;

mconf_toml = MembershipConfigurationToString(&sk->greetResponse.mconf);
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, node_id = %lu, mconf = %s, term=" UINT64_FORMAT,
sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term);
pfree(mconf_toml);
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term);

/* Protocol is all good, move to voting. */
sk->state = SS_VOTING;
@@ -725,9 +707,12 @@ RecvAcceptorGreeting(Safekeeper *sk)
wp->propTerm++;
wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);

wp->voteRequest.pam.tag = 'v';
wp->voteRequest.generation = wp->mconf.generation;
wp->voteRequest.term = wp->propTerm;
wp->voteRequest = (VoteRequest)
{
.tag = 'v',
.term = wp->propTerm
};
memcpy(wp->voteRequest.proposerId.data, wp->greetRequest.proposerId.data, UUID_LEN);
}
}
else if (sk->greetResponse.term > wp->propTerm)
@@ -774,14 +759,12 @@ SendVoteRequest(Safekeeper *sk)
{
WalProposer *wp = sk->wp;

PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->voteRequest,
&sk->outbuf, wp->config->proto_version);

/* We have quorum for voting, send our vote request */
wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
wp->voteRequest.generation, wp->voteRequest.term);
wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
/* On failure, logging & resetting is handled */
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT))
return;

/* If successful, wait for read-ready with SS_WAIT_VERDICT */
}

@@ -795,12 +778,11 @@ RecvVoteResponse(Safekeeper *sk)
return;

wp_log(LOG,
"got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
sk->voteResponse.voteGiven,
GetHighestTerm(&sk->voteResponse.termHistory),
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn));
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));

/*
* In case of acceptor rejecting our vote, bail out, but only if either it
@@ -865,9 +847,9 @@ HandleElectedProposer(WalProposer *wp)
* otherwise we must be sync-safekeepers and we have nothing to do then.
*
* Proceeding is not only pointless but harmful, because we'd give
* safekeepers term history starting with 0/0. These hacks will go away
* once we disable implicit timeline creation on safekeepers and create it
* with non zero LSN from the start.
* safekeepers term history starting with 0/0. These hacks will go away once
* we disable implicit timeline creation on safekeepers and create it with
* non zero LSN from the start.
*/
if (wp->propEpochStartLsn == InvalidXLogRecPtr)
{
@@ -960,6 +942,7 @@ DetermineEpochStartLsn(WalProposer *wp)
wp->propEpochStartLsn = InvalidXLogRecPtr;
wp->donorEpoch = 0;
wp->truncateLsn = InvalidXLogRecPtr;
wp->timelineStartLsn = InvalidXLogRecPtr;

for (int i = 0; i < wp->n_safekeepers; i++)
{
@@ -976,6 +959,20 @@ DetermineEpochStartLsn(WalProposer *wp)
wp->donor = i;
}
wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);

if (wp->safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr)
{
/* timelineStartLsn should be the same everywhere or unknown */
if (wp->timelineStartLsn != InvalidXLogRecPtr &&
wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
{
wp_log(WARNING,
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
LSN_FORMAT_ARGS(wp->timelineStartLsn),
LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
}
wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
}
}
}

@@ -998,11 +995,22 @@ DetermineEpochStartLsn(WalProposer *wp)
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
{
wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp);
if (wp->timelineStartLsn == InvalidXLogRecPtr)
{
wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp);
}
wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
}
pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn);

Assert(wp->truncateLsn != InvalidXLogRecPtr || wp->config->syncSafekeepers);
/*
* Safekeepers are setting truncateLsn after timelineStartLsn is known, so
* it should never be zero at this point, if we know timelineStartLsn.
*
* timelineStartLsn can be zero only on the first syncSafekeepers run.
*/
Assert((wp->truncateLsn != InvalidXLogRecPtr) ||
(wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn));

/*
* We will be generating WAL since propEpochStartLsn, so we should set
@@ -1045,11 +1053,10 @@ DetermineEpochStartLsn(WalProposer *wp)
if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
{
/*
* However, allow to proceed if last_log_term on the node which
* gave the highest vote (i.e. point where we are going to start
* writing) actually had been won by me; plain restart of
* walproposer not intervened by concurrent compute which wrote
* WAL is ok.
* However, allow to proceed if last_log_term on the node which gave
* the highest vote (i.e. point where we are going to start writing)
* actually had been won by me; plain restart of walproposer not
* intervened by concurrent compute which wrote WAL is ok.
*
* This avoids compute crash after manual term_bump.
*/
@@ -1119,8 +1126,14 @@ SendProposerElected(Safekeeper *sk)
{
/* safekeeper is empty or no common point, start from the beginning */
sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, termHistory.n_entries=%u",
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), wp->propTermHistory.n_entries);
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u",
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);

/*
* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline
* is created manually (test_s3_wal_replay)
*/
Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
}
else
{
@@ -1145,19 +1158,29 @@ SendProposerElected(Safekeeper *sk)

Assert(sk->startStreamingAt <= wp->availableLsn);

msg.apm.tag = 'e';
msg.generation = wp->mconf.generation;
msg.tag = 'e';
msg.term = wp->propTerm;
msg.startStreamingAt = sk->startStreamingAt;
msg.termHistory = &wp->propTermHistory;
msg.timelineStartLsn = wp->timelineStartLsn;

lastCommonTerm = idx >= 0 ? wp->propTermHistory.entries[idx].term : 0;
wp_log(LOG,
"sending elected msg to node " UINT64_FORMAT " generation=%u term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s",
sk->greetResponse.nodeId, msg.generation, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt),
lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port);
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));

resetStringInfo(&sk->outbuf);
pq_sendint64_le(&sk->outbuf, msg.tag);
pq_sendint64_le(&sk->outbuf, msg.term);
pq_sendint64_le(&sk->outbuf, msg.startStreamingAt);
pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries);
for (int i = 0; i < msg.termHistory->n_entries; i++)
{
pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term);
pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn);
}
pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn);

PAMessageSerialize(wp, (ProposerAcceptorMessage *) &msg, &sk->outbuf, wp->config->proto_version);
if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH))
return;

@@ -1223,13 +1246,14 @@ static void
PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn)
{
Assert(endLsn >= beginLsn);
req->apm.tag = 'a';
req->generation = wp->mconf.generation;
req->tag = 'a';
req->term = wp->propTerm;
req->epochStartLsn = wp->propEpochStartLsn;
req->beginLsn = beginLsn;
req->endLsn = endLsn;
req->commitLsn = wp->commitLsn;
req->truncateLsn = wp->truncateLsn;
req->proposerId = wp->greetRequest.proposerId;
}

/*
@@ -1330,8 +1354,7 @@ SendAppendRequests(Safekeeper *sk)
resetStringInfo(&sk->outbuf);

/* write AppendRequest header */
PAMessageSerialize(wp, (ProposerAcceptorMessage *) req, &sk->outbuf, wp->config->proto_version);
/* prepare for reading WAL into the outbuf */
appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
sk->active_state = SS_ACTIVE_READ_WAL;
}
@@ -1344,17 +1367,14 @@ SendAppendRequests(Safekeeper *sk)
req = &sk->appendRequest;
req_len = req->endLsn - req->beginLsn;

/*
* We send zero sized AppendRequests as heartbeats; don't wal_read
* for these.
*/
/* We send zero sized AppendRequests as heartbeats; don't wal_read for these. */
if (req_len > 0)
{
switch (wp->api.wal_read(sk,
&sk->outbuf.data[sk->outbuf.len],
req->beginLsn,
req_len,
&errmsg))
&sk->outbuf.data[sk->outbuf.len],
req->beginLsn,
req_len,
&errmsg))
{
case NEON_WALREAD_SUCCESS:
break;
@@ -1362,7 +1382,7 @@ SendAppendRequests(Safekeeper *sk)
return true;
case NEON_WALREAD_ERROR:
wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
sk->host, sk->port, errmsg);
sk->host, sk->port, errmsg);
ShutdownConnection(sk);
return false;
default:
@@ -1450,11 +1470,11 @@ RecvAppendResponses(Safekeeper *sk)
* Term has changed to higher one, probably another compute is
* running. If this is the case we could PANIC as well because
* likely it inserted some data and our basebackup is unsuitable
* anymore. However, we also bump term manually (term_bump
* endpoint) on safekeepers for migration purposes, in this case
* we do want compute to stay alive. So restart walproposer with
* FATAL instead of panicking; if basebackup is spoiled next
* election will notice this.
* anymore. However, we also bump term manually (term_bump endpoint)
* on safekeepers for migration purposes, in this case we do want
* compute to stay alive. So restart walproposer with FATAL instead
* of panicking; if basebackup is spoiled next election will notice
* this.
*/
wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
sk->host, sk->port,
@@ -1489,7 +1509,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese

for (i = 0; i < nkeys; i++)
{
const char *key = pq_getmsgrawstring(reply_message);
const char *key = pq_getmsgstring(reply_message);
unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32));

if (strcmp(key, "current_timeline_size") == 0)
@@ -1730,208 +1750,6 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
}
}

/* Serialize MembershipConfiguration into buf. */
static void
MembershipConfigurationSerialize(MembershipConfiguration *mconf, StringInfo buf)
{
uint32 i;

pq_sendint32(buf, mconf->generation);

pq_sendint32(buf, mconf->members.len);
for (i = 0; i < mconf->members.len; i++)
{
pq_sendint64(buf, mconf->members.m[i].node_id);
pq_send_ascii_string(buf, mconf->members.m[i].host);
pq_sendint16(buf, mconf->members.m[i].port);
}

/*
* There is no special mark for absent new_members; zero members is
* invalid, so zero len means absent.
*/
pq_sendint32(buf, mconf->new_members.len);
for (i = 0; i < mconf->new_members.len; i++)
{
pq_sendint64(buf, mconf->new_members.m[i].node_id);
pq_send_ascii_string(buf, mconf->new_members.m[i].host);
pq_sendint16(buf, mconf->new_members.m[i].port);
}
}

/* Serialize proposer -> acceptor message into buf using specified version */
static void
PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version)
{
/* both versions are supported currently until we fully migrate to 3 */
Assert(proto_version == 3 || proto_version == 2);

resetStringInfo(buf);

if (proto_version == 3)
{
/*
* v2 sends structs for some messages as is, so commonly send tag only
* for v3
*/
pq_sendint8(buf, msg->tag);

switch (msg->tag)
{
case 'g':
{
ProposerGreeting *m = (ProposerGreeting *) msg;

pq_send_ascii_string(buf, m->tenant_id);
pq_send_ascii_string(buf, m->timeline_id);
MembershipConfigurationSerialize(&m->mconf, buf);
pq_sendint32(buf, m->pg_version);
pq_sendint64(buf, m->system_id);
pq_sendint32(buf, m->wal_seg_size);
break;
}
case 'v':
{
VoteRequest *m = (VoteRequest *) msg;

pq_sendint32(buf, m->generation);
pq_sendint64(buf, m->term);
break;

}
case 'e':
{
ProposerElected *m = (ProposerElected *) msg;

pq_sendint32(buf, m->generation);
pq_sendint64(buf, m->term);
pq_sendint64(buf, m->startStreamingAt);
pq_sendint32(buf, m->termHistory->n_entries);
for (uint32 i = 0; i < m->termHistory->n_entries; i++)
{
pq_sendint64(buf, m->termHistory->entries[i].term);
pq_sendint64(buf, m->termHistory->entries[i].lsn);
}
break;
}
case 'a':
{
/*
* Note: this serializes only AppendRequestHeader, caller
* is expected to append WAL data later.
*/
AppendRequestHeader *m = (AppendRequestHeader *) msg;

pq_sendint32(buf, m->generation);
pq_sendint64(buf, m->term);
pq_sendint64(buf, m->beginLsn);
pq_sendint64(buf, m->endLsn);
pq_sendint64(buf, m->commitLsn);
pq_sendint64(buf, m->truncateLsn);
break;
}
default:
wp_log(FATAL, "unexpected message type %c to serialize", msg->tag);
}
return;
}

if (proto_version == 2)
{
switch (msg->tag)
{
case 'g':
{
/* v2 sent struct as is */
ProposerGreeting *m = (ProposerGreeting *) msg;
ProposerGreetingV2 greetRequestV2;

/* Fill also v2 struct. */
greetRequestV2.tag = 'g';
greetRequestV2.protocolVersion = proto_version;
greetRequestV2.pgVersion = m->pg_version;

/*
* v3 removed this field because it's easier to pass as
* libpq or START_WAL_PUSH options
*/
memset(&greetRequestV2.proposerId, 0, sizeof(greetRequestV2.proposerId));
greetRequestV2.systemId = wp->config->systemId;
if (*m->timeline_id != '\0' &&
!HexDecodeString(greetRequestV2.timeline_id, m->timeline_id, 16))
wp_log(FATAL, "could not parse neon.timeline_id, %s", m->timeline_id);
if (*m->tenant_id != '\0' &&
!HexDecodeString(greetRequestV2.tenant_id, m->tenant_id, 16))
wp_log(FATAL, "could not parse neon.tenant_id, %s", m->tenant_id);

greetRequestV2.timeline = wp->config->pgTimeline;
greetRequestV2.walSegSize = wp->config->wal_segment_size;

pq_sendbytes(buf, (char *) &greetRequestV2, sizeof(greetRequestV2));
break;
}
case 'v':
{
/* v2 sent struct as is */
VoteRequest *m = (VoteRequest *) msg;
VoteRequestV2 voteRequestV2;

voteRequestV2.tag = m->pam.tag;
voteRequestV2.term = m->term;
/* removed field */
memset(&voteRequestV2.proposerId, 0, sizeof(voteRequestV2.proposerId));
pq_sendbytes(buf, (char *) &voteRequestV2, sizeof(voteRequestV2));
break;
}
case 'e':
{
ProposerElected *m = (ProposerElected *) msg;

pq_sendint64_le(buf, m->apm.tag);
pq_sendint64_le(buf, m->term);
pq_sendint64_le(buf, m->startStreamingAt);
pq_sendint32_le(buf, m->termHistory->n_entries);
for (int i = 0; i < m->termHistory->n_entries; i++)
{
pq_sendint64_le(buf, m->termHistory->entries[i].term);
pq_sendint64_le(buf, m->termHistory->entries[i].lsn);
}
pq_sendint64_le(buf, 0); /* removed timeline_start_lsn */
break;
}
case 'a':

/*
* Note: this serializes only AppendRequestHeader, caller is
* expected to append WAL data later.
*/
{
/* v2 sent struct as is */
AppendRequestHeader *m = (AppendRequestHeader *) msg;
AppendRequestHeaderV2 appendRequestHeaderV2;

appendRequestHeaderV2.tag = m->apm.tag;
appendRequestHeaderV2.term = m->term;
appendRequestHeaderV2.epochStartLsn = 0; /* removed field */
appendRequestHeaderV2.beginLsn = m->beginLsn;
appendRequestHeaderV2.endLsn = m->endLsn;
appendRequestHeaderV2.commitLsn = m->commitLsn;
appendRequestHeaderV2.truncateLsn = m->truncateLsn;
/* removed field */
memset(&appendRequestHeaderV2.proposerId, 0, sizeof(appendRequestHeaderV2.proposerId));

pq_sendbytes(buf, (char *) &appendRequestHeaderV2, sizeof(appendRequestHeaderV2));
break;
}

default:
wp_log(FATAL, "unexpected message type %c to serialize", msg->tag);
}
return;
}
wp_log(FATAL, "unexpected proto_version %d", proto_version);
}

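The v3 framing above is tag-then-fields in network byte order (pq_sendint8 / pq_sendint32 / pq_sendint64), whereas v2 shipped the packed structs as-is. A standalone sketch of the resulting VoteRequest ('v') frame, offered as an illustration of the layout implied by those calls rather than code from this tree:

#include <stddef.h>
#include <stdint.h>

/* Big-endian helpers standing in for pq_sendint32/pq_sendint64. */
static void
put_u32_be(uint8_t *p, uint32_t v)
{
	for (int i = 0; i < 4; i++)
		p[i] = (uint8_t) (v >> (24 - 8 * i));
}

static void
put_u64_be(uint8_t *p, uint64_t v)
{
	for (int i = 0; i < 8; i++)
		p[i] = (uint8_t) (v >> (56 - 8 * i));
}

/* Encode a v3 VoteRequest: 'v' tag, then generation and term.
 * Returns the frame length (13 bytes). */
static size_t
encode_vote_request_v3(uint8_t out[13], uint32_t generation, uint64_t term)
{
	size_t		off = 0;

	out[off++] = 'v';					/* pq_sendint8(buf, msg->tag) */
	put_u32_be(out + off, generation);	/* pq_sendint32(buf, m->generation) */
	off += 4;
	put_u64_be(out + off, term);		/* pq_sendint64(buf, m->term) */
	off += 8;
	return off;
}
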
/*
* Try to read CopyData message from i'th safekeeper, resetting connection on
* failure.
@@ -1961,37 +1779,6 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
return false;
}

/* Deserialize membership configuration from buf to mconf. */
static void
MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo buf)
{
uint32 i;

mconf->generation = pq_getmsgint32(buf);
mconf->members.len = pq_getmsgint32(buf);
mconf->members.m = palloc0(sizeof(SafekeeperId) * mconf->members.len);
for (i = 0; i < mconf->members.len; i++)
{
const char *buf_host;

mconf->members.m[i].node_id = pq_getmsgint64(buf);
buf_host = pq_getmsgrawstring(buf);
strlcpy(mconf->members.m[i].host, buf_host, sizeof(mconf->members.m[i].host));
mconf->members.m[i].port = pq_getmsgint16(buf);
}
mconf->new_members.len = pq_getmsgint32(buf);
mconf->new_members.m = palloc0(sizeof(SafekeeperId) * mconf->new_members.len);
for (i = 0; i < mconf->new_members.len; i++)
{
const char *buf_host;

mconf->new_members.m[i].node_id = pq_getmsgint64(buf);
buf_host = pq_getmsgrawstring(buf);
strlcpy(mconf->new_members.m[i].host, buf_host, sizeof(mconf->new_members.m[i].host));
mconf->new_members.m[i].port = pq_getmsgint16(buf);
}
}

/*
* Read next message with known type into provided struct, by reading a CopyData
* block from the safekeeper's postgres connection, returning whether the read
@@ -2000,8 +1787,6 @@ MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo bu
* If the read needs more polling, we return 'false' and keep the state
* unmodified, waiting until it becomes read-ready to try again. If it fully
* failed, a warning is emitted and the connection is reset.
*
* Note: it pallocs if needed, i.e. for AcceptorGreeting and VoteResponse fields.
*/
static bool
AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
@@ -2010,154 +1795,82 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)

char *buf;
int buf_size;
uint8 tag;
uint64 tag;
StringInfoData s;

if (!(AsyncRead(sk, &buf, &buf_size)))
return false;
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);

/* parse it */
s.data = buf;
s.len = buf_size;
s.maxlen = buf_size;
s.cursor = 0;

if (wp->config->proto_version == 3)
tag = pq_getmsgint64_le(&s);
if (tag != anymsg->tag)
{
tag = pq_getmsgbyte(&s);
if (tag != anymsg->tag)
{
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
return false;
}
switch (tag)
{
case 'g':
{
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;

msg->nodeId = pq_getmsgint64(&s);
MembershipConfigurationDeserialize(&msg->mconf, &s);
msg->term = pq_getmsgint64(&s);
pq_getmsgend(&s);
return true;
}
case 'v':
{
VoteResponse *msg = (VoteResponse *) anymsg;

msg->generation = pq_getmsgint32(&s);
msg->term = pq_getmsgint64(&s);
msg->voteGiven = pq_getmsgbyte(&s);
msg->flushLsn = pq_getmsgint64(&s);
msg->truncateLsn = pq_getmsgint64(&s);
msg->termHistory.n_entries = pq_getmsgint32(&s);
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
for (uint32 i = 0; i < msg->termHistory.n_entries; i++)
{
msg->termHistory.entries[i].term = pq_getmsgint64(&s);
msg->termHistory.entries[i].lsn = pq_getmsgint64(&s);
}
pq_getmsgend(&s);
return true;
}
case 'a':
{
AppendResponse *msg = (AppendResponse *) anymsg;

msg->generation = pq_getmsgint32(&s);
msg->term = pq_getmsgint64(&s);
msg->flushLsn = pq_getmsgint64(&s);
msg->commitLsn = pq_getmsgint64(&s);
msg->hs.ts = pq_getmsgint64(&s);
msg->hs.xmin.value = pq_getmsgint64(&s);
msg->hs.catalog_xmin.value = pq_getmsgint64(&s);
if (s.len > s.cursor)
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
else
msg->ps_feedback.present = false;
pq_getmsgend(&s);
return true;
}
default:
{
wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
return false;
}
}
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
return false;
}
else if (wp->config->proto_version == 2)
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
switch (tag)
{
tag = pq_getmsgint64_le(&s);
if (tag != anymsg->tag)
{
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
return false;
}
switch (tag)
{
case 'g':
case 'g':
{
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;

msg->term = pq_getmsgint64_le(&s);
msg->nodeId = pq_getmsgint64_le(&s);
pq_getmsgend(&s);
return true;
}

case 'v':
{
VoteResponse *msg = (VoteResponse *) anymsg;

msg->term = pq_getmsgint64_le(&s);
msg->voteGiven = pq_getmsgint64_le(&s);
msg->flushLsn = pq_getmsgint64_le(&s);
msg->truncateLsn = pq_getmsgint64_le(&s);
msg->termHistory.n_entries = pq_getmsgint32_le(&s);
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
for (int i = 0; i < msg->termHistory.n_entries; i++)
{
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;

msg->term = pq_getmsgint64_le(&s);
msg->nodeId = pq_getmsgint64_le(&s);
pq_getmsgend(&s);
return true;
msg->termHistory.entries[i].term = pq_getmsgint64_le(&s);
msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s);
}
msg->timelineStartLsn = pq_getmsgint64_le(&s);
pq_getmsgend(&s);
return true;
}

case 'v':
{
VoteResponse *msg = (VoteResponse *) anymsg;
case 'a':
{
AppendResponse *msg = (AppendResponse *) anymsg;

msg->term = pq_getmsgint64_le(&s);
msg->voteGiven = pq_getmsgint64_le(&s);
msg->flushLsn = pq_getmsgint64_le(&s);
msg->truncateLsn = pq_getmsgint64_le(&s);
msg->termHistory.n_entries = pq_getmsgint32_le(&s);
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
for (int i = 0; i < msg->termHistory.n_entries; i++)
{
msg->termHistory.entries[i].term = pq_getmsgint64_le(&s);
msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s);
}
pq_getmsgint64_le(&s); /* timelineStartLsn */
pq_getmsgend(&s);
return true;
}
msg->term = pq_getmsgint64_le(&s);
msg->flushLsn = pq_getmsgint64_le(&s);
msg->commitLsn = pq_getmsgint64_le(&s);
msg->hs.ts = pq_getmsgint64_le(&s);
msg->hs.xmin.value = pq_getmsgint64_le(&s);
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
if (s.len > s.cursor)
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
else
msg->ps_feedback.present = false;
pq_getmsgend(&s);
return true;
}

case 'a':
{
AppendResponse *msg = (AppendResponse *) anymsg;

msg->term = pq_getmsgint64_le(&s);
msg->flushLsn = pq_getmsgint64_le(&s);
msg->commitLsn = pq_getmsgint64_le(&s);
msg->hs.ts = pq_getmsgint64_le(&s);
msg->hs.xmin.value = pq_getmsgint64_le(&s);
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
if (s.len > s.cursor)
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
else
msg->ps_feedback.present = false;
pq_getmsgend(&s);
return true;
}

default:
{
wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
return false;
}
}
default:
{
Assert(false);
return false;
}
}
wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version);
return false; /* keep the compiler quiet */
}

/*
@@ -2533,45 +2246,3 @@ FormatEvents(WalProposer *wp, uint32 events)

return (char *) &return_str;
}

/* Dump mconf as toml for observability / debugging. Result is palloc'ed. */
static char *
MembershipConfigurationToString(MembershipConfiguration *mconf)
{
StringInfoData s;
uint32 i;

initStringInfo(&s);
appendStringInfo(&s, "{gen = %u", mconf->generation);
appendStringInfoString(&s, ", members = [");
for (i = 0; i < mconf->members.len; i++)
{
if (i > 0)
appendStringInfoString(&s, ", ");
appendStringInfo(&s, "{node_id = %lu", mconf->members.m[i].node_id);
appendStringInfo(&s, ", host = %s", mconf->members.m[i].host);
appendStringInfo(&s, ", port = %u }", mconf->members.m[i].port);
}
appendStringInfo(&s, "], new_members = [");
for (i = 0; i < mconf->new_members.len; i++)
{
if (i > 0)
appendStringInfoString(&s, ", ");
appendStringInfo(&s, "{node_id = %lu", mconf->new_members.m[i].node_id);
appendStringInfo(&s, ", host = %s", mconf->new_members.m[i].host);
appendStringInfo(&s, ", port = %u }", mconf->new_members.m[i].port);
}
appendStringInfoString(&s, "]}");
return s.data;
}

static void
MembershipConfigurationFree(MembershipConfiguration *mconf)
{
if (mconf->members.m)
pfree(mconf->members.m);
mconf->members.m = NULL;
if (mconf->new_members.m)
pfree(mconf->new_members.m);
mconf->new_members.m = NULL;
}

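For a sense of the output, a hypothetical call to the dump helper above (values invented; the shape follows its format strings):

/* e.g. {gen = 7, members = [{node_id = 1, host = sk-1, port = 5454 }], new_members = []} */
char	   *mconf_toml = MembershipConfigurationToString(&wp->mconf);

wp_log(LOG, "current mconf = %s", mconf_toml);
pfree(mconf_toml);		/* result is palloc'ed */
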
@@ -12,6 +12,9 @@
|
||||
#include "neon_walreader.h"
|
||||
#include "pagestore_client.h"
|
||||
|
||||
#define SK_MAGIC 0xCafeCeefu
|
||||
#define SK_PROTOCOL_VERSION 2
|
||||
|
||||
#define MAX_SAFEKEEPERS 32
|
||||
#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL
|
||||
* message */
|
||||
@@ -140,71 +143,12 @@ typedef uint64 term_t;
|
||||
/* neon storage node id */
|
||||
typedef uint64 NNodeId;
|
||||
|
||||
/*
|
||||
* Number uniquely identifying safekeeper membership configuration.
|
||||
* This and following structs pair ones in membership.rs.
|
||||
*/
|
||||
typedef uint32 Generation;
|
||||
|
||||
typedef struct SafekeeperId
|
||||
{
|
||||
NNodeId node_id;
|
||||
char host[MAXCONNINFO];
|
||||
uint16 port;
|
||||
} SafekeeperId;
|
||||
|
||||
/* Set of safekeepers. */
|
||||
typedef struct MemberSet
|
||||
{
|
||||
uint32 len; /* number of members */
|
||||
SafekeeperId *m; /* ids themselves */
|
||||
} MemberSet;
|
||||
|
||||
/* Timeline safekeeper membership configuration. */
|
||||
typedef struct MembershipConfiguration
|
||||
{
|
||||
Generation generation;
|
||||
MemberSet members;
|
||||
/* Has 0 n_members in non joint conf. */
|
||||
MemberSet new_members;
|
||||
} MembershipConfiguration;
|
||||
|
||||
/*
|
||||
* Proposer <-> Acceptor messaging.
|
||||
*/
|
||||
|
||||
typedef struct ProposerAcceptorMessage
|
||||
{
|
||||
uint8 tag;
|
||||
} ProposerAcceptorMessage;
|
||||
|
||||
/* Initial Proposer -> Acceptor message */
|
||||
typedef struct ProposerGreeting
|
||||
{
|
||||
ProposerAcceptorMessage pam; /* message tag */
|
||||
|
||||
/*
|
||||
* tenant/timeline ids as C strings with standard hex notation for ease of
|
||||
* printing. In principle they are not strictly needed as ttid is also
|
||||
* passed as libpq options.
|
||||
*/
|
||||
char *tenant_id;
|
||||
char *timeline_id;
|
||||
/* Full conf is carried to allow safekeeper switch */
|
||||
MembershipConfiguration mconf;
|
||||
|
||||
/*
|
||||
* pg_version and wal_seg_size are used for timeline creation until we
|
||||
* fully migrate to doing externally. systemId is only used as a sanity
|
||||
* cross check.
|
||||
*/
|
||||
uint32 pg_version; /* in PG_VERSION_NUM format */
|
||||
uint64 system_id; /* Postgres system identifier. */
|
||||
uint32 wal_seg_size;
|
||||
} ProposerGreeting;
|
||||
|
||||
/* protocol v2 variant, kept while wp supports it */
|
||||
typedef struct ProposerGreetingV2
|
||||
{
|
||||
uint64 tag; /* message tag */
|
||||
uint32 protocolVersion; /* proposer-safekeeper protocol version */
|
||||
@@ -215,42 +159,32 @@ typedef struct ProposerGreetingV2
|
||||
uint8 tenant_id[16];
|
||||
TimeLineID timeline;
|
||||
uint32 walSegSize;
|
||||
} ProposerGreetingV2;
|
||||
} ProposerGreeting;
|
||||
|
||||
typedef struct AcceptorProposerMessage
|
||||
{
|
||||
uint8 tag;
|
||||
uint64 tag;
|
||||
} AcceptorProposerMessage;
|
||||
|
||||
/*
|
||||
* Acceptor -> Proposer initial response: the highest term acceptor voted for,
|
||||
* its node id and configuration.
|
||||
* Acceptor -> Proposer initial response: the highest term acceptor voted for.
|
||||
*/
|
||||
typedef struct AcceptorGreeting
|
||||
{
|
||||
AcceptorProposerMessage apm;
|
||||
NNodeId nodeId;
|
||||
MembershipConfiguration mconf;
|
||||
term_t term;
|
||||
NNodeId nodeId;
|
||||
} AcceptorGreeting;
|
||||
|
||||
/*
|
||||
* Proposer -> Acceptor vote request.
|
||||
*/
|
||||
typedef struct VoteRequest
|
||||
{
|
||||
ProposerAcceptorMessage pam; /* message tag */
|
||||
Generation generation; /* membership conf generation */
|
||||
term_t term;
|
||||
} VoteRequest;
|
||||
|
||||
/* protocol v2 variant, kept while wp supports it */
|
||||
typedef struct VoteRequestV2
|
||||
{
|
||||
uint64 tag;
|
||||
term_t term;
|
||||
pg_uuid_t proposerId; /* for monitoring/debugging */
|
||||
} VoteRequestV2;
|
||||
} VoteRequest;
|
||||
|
||||
/* Element of term switching chain. */
|
||||
typedef struct TermSwitchEntry
|
||||
@@ -269,15 +203,8 @@ typedef struct TermHistory
|
||||
typedef struct VoteResponse
|
||||
{
|
||||
AcceptorProposerMessage apm;
|
||||
|
||||
/*
|
||||
* Membership conf generation. It's not strictly required because on
|
||||
* mismatch safekeeper is expected to ERROR the connection, but let's
|
||||
* sanity check it.
|
||||
*/
|
||||
Generation generation;
|
||||
term_t term;
|
||||
uint8 voteGiven;
|
||||
uint64 voteGiven;
|
||||
|
||||
/*
|
||||
* Safekeeper flush_lsn (end of WAL) + history of term switches allow
|
||||
@@ -287,6 +214,7 @@ typedef struct VoteResponse
|
||||
XLogRecPtr truncateLsn; /* minimal LSN which may be needed for*
|
||||
* recovery of some safekeeper */
|
||||
TermHistory termHistory;
|
||||
XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */
|
||||
} VoteResponse;
|
||||
|
||||
/*
|
||||
@@ -295,37 +223,20 @@ typedef struct VoteResponse
|
||||
*/
|
||||
typedef struct ProposerElected
|
||||
{
|
||||
AcceptorProposerMessage apm;
|
||||
Generation generation; /* membership conf generation */
|
||||
uint64 tag;
|
||||
term_t term;
|
||||
/* proposer will send since this point */
|
||||
XLogRecPtr startStreamingAt;
|
||||
/* history of term switches up to this proposer */
|
||||
TermHistory *termHistory;
|
||||
/* timeline globally starts at this LSN */
|
||||
XLogRecPtr timelineStartLsn;
|
||||
} ProposerElected;
|
||||
|
||||
/*
|
||||
* Header of request with WAL message sent from proposer to safekeeper.
|
||||
*/
|
||||
typedef struct AppendRequestHeader
|
||||
{
|
||||
AcceptorProposerMessage apm;
|
||||
Generation generation; /* membership conf generation */
|
||||
term_t term; /* term of the proposer */
|
||||
XLogRecPtr beginLsn; /* start position of message in WAL */
|
||||
XLogRecPtr endLsn; /* end position of message in WAL */
|
||||
XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */
|
||||
|
||||
/*
|
||||
* minimal LSN which may be needed for recovery of some safekeeper (end
|
||||
* lsn + 1 of last chunk streamed to everyone)
|
||||
*/
|
||||
XLogRecPtr truncateLsn;
|
||||
/* in the AppendRequest message, WAL data follows */
|
||||
} AppendRequestHeader;
|
||||
|
||||
/* protocol v2 variant, kept while wp supports it */
|
||||
typedef struct AppendRequestHeaderV2
|
||||
{
|
||||
uint64 tag;
|
||||
term_t term; /* term of the proposer */
|
||||
@@ -345,8 +256,7 @@ typedef struct AppendRequestHeaderV2
|
||||
*/
|
||||
XLogRecPtr truncateLsn;
|
||||
pg_uuid_t proposerId; /* for monitoring/debugging */
|
||||
/* in the AppendRequest message, WAL data follows */
|
||||
} AppendRequestHeaderV2;
|
||||
} AppendRequestHeader;
|
||||
|
||||
/*
|
||||
* Hot standby feedback received from replica
|
||||
@@ -399,13 +309,6 @@ typedef struct AppendResponse
|
||||
{
|
||||
AcceptorProposerMessage apm;
|
||||
|
||||
/*
|
||||
* Membership conf generation. It's not strictly required because on
|
||||
* mismatch safekeeper is expected to ERROR the connection, but let's
|
||||
* sanity check it.
|
||||
*/
|
||||
Generation generation;
|
||||
|
||||
/*
|
||||
* Current term of the safekeeper; if it is higher than proposer's, the
|
||||
* compute is out of date.
|
||||
@@ -741,8 +644,6 @@ typedef struct WalProposerConfig
|
||||
/* Will be passed to safekeepers in greet request. */
|
||||
TimeLineID pgTimeline;
|
||||
|
||||
int proto_version;
|
||||
|
||||
#ifdef WALPROPOSER_LIB
|
||||
void *callback_data;
|
||||
#endif
|
||||
@@ -755,14 +656,11 @@ typedef struct WalProposerConfig
|
||||
typedef struct WalProposer
|
||||
{
|
||||
WalProposerConfig *config;
|
||||
/* Current walproposer membership configuration */
|
||||
MembershipConfiguration mconf;
|
||||
int n_safekeepers;
|
||||
|
||||
/* (n_safekeepers / 2) + 1 */
|
||||
int quorum;
|
||||
|
||||
/* Number of occupied slots in safekeepers[] */
|
||||
int n_safekeepers;
|
||||
Safekeeper safekeeper[MAX_SAFEKEEPERS];
|
||||
|
||||
/* WAL has been generated up to this point */
|
||||
@@ -772,7 +670,6 @@ typedef struct WalProposer
|
||||
XLogRecPtr commitLsn;
|
||||
|
||||
ProposerGreeting greetRequest;
|
||||
ProposerGreetingV2 greetRequestV2;
|
||||
|
||||
/* Vote request for safekeeper */
|
||||
VoteRequest voteRequest;
|
||||
|
||||
@@ -117,13 +117,14 @@ pq_getmsgbytes(StringInfo msg, int datalen)
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgrawstring - get a null-terminated text string - NO conversion
|
||||
* pq_getmsgstring - get a null-terminated text string (with conversion)
|
||||
*
|
||||
* Returns a pointer directly into the message buffer.
|
||||
* May return a pointer directly into the message buffer, or a pointer
|
||||
* to a palloc'd conversion result.
|
||||
* --------------------------------
|
||||
*/
|
||||
const char *
|
||||
pq_getmsgrawstring(StringInfo msg)
|
||||
pq_getmsgstring(StringInfo msg)
|
||||
{
|
||||
char *str;
|
||||
int slen;
|
||||
@@ -154,45 +155,6 @@ pq_getmsgend(StringInfo msg)
|
||||
ExceptionalCondition("invalid msg format", __FILE__, __LINE__);
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_sendbytes - append raw data to a StringInfo buffer
|
||||
* --------------------------------
|
||||
*/
|
||||
void
|
||||
pq_sendbytes(StringInfo buf, const void *data, int datalen)
|
||||
{
|
||||
/* use variant that maintains a trailing null-byte, out of caution */
|
||||
appendBinaryStringInfo(buf, data, datalen);
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_send_ascii_string - append a null-terminated text string (without conversion)
|
||||
*
|
||||
* This function intentionally bypasses encoding conversion, instead just
|
||||
* silently replacing any non-7-bit-ASCII characters with question marks.
|
||||
* It is used only when we are having trouble sending an error message to
|
||||
* the client with normal localization and encoding conversion. The caller
|
||||
* should already have taken measures to ensure the string is just ASCII;
|
||||
* the extra work here is just to make certain we don't send a badly encoded
|
||||
* string to the client (which might or might not be robust about that).
|
||||
*
|
||||
* NB: passed text string must be null-terminated, and so is the data
|
||||
* sent to the frontend.
|
||||
* --------------------------------
|
||||
*/
|
||||
void
|
||||
pq_send_ascii_string(StringInfo buf, const char *str)
|
||||
{
|
||||
while (*str)
|
||||
{
|
||||
char ch = *str++;
|
||||
|
||||
if (IS_HIGHBIT_SET(ch))
|
||||
ch = '?';
|
||||
appendStringInfoCharMacro(buf, ch);
|
||||
}
|
||||
appendStringInfoChar(buf, '\0');
|
||||
}
|
||||
|
||||
/*
|
||||
* Produce a C-string representation of a TimestampTz.
|
||||
|
||||
@@ -59,11 +59,9 @@
|
||||
|
||||
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
|
||||
|
||||
/* GUCs */
|
||||
char *wal_acceptors_list = "";
|
||||
int wal_acceptor_reconnect_timeout = 1000;
|
||||
int wal_acceptor_connection_timeout = 10000;
|
||||
int safekeeper_proto_version = 2;
|
||||
|
||||
/* Set to true in the walproposer bgw. */
|
||||
static bool am_walproposer;
|
||||
@@ -128,7 +126,6 @@ init_walprop_config(bool syncSafekeepers)
|
||||
else
|
||||
walprop_config.systemId = 0;
|
||||
walprop_config.pgTimeline = walprop_pg_get_timeline_id();
|
||||
walprop_config.proto_version = safekeeper_proto_version;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -222,37 +219,25 @@ nwp_register_gucs(void)
|
||||
PGC_SIGHUP,
|
||||
GUC_UNIT_MS,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomIntVariable(
|
||||
"neon.safekeeper_proto_version",
|
||||
"Version of compute <-> safekeeper protocol.",
|
||||
"Used while migrating from 2 to 3.",
|
||||
&safekeeper_proto_version,
|
||||
2, 0, INT_MAX,
|
||||
PGC_POSTMASTER,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
|
||||
{
|
||||
int n_safekeepers = 0;
|
||||
char *curr_sk = safekeepers_list;
|
||||
int n_safekeepers = 0;
|
||||
char *curr_sk = safekeepers_list;
|
||||
|
||||
for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
|
||||
{
|
||||
if (++n_safekeepers >= MAX_SAFEKEEPERS)
|
||||
{
|
||||
if (++n_safekeepers >= MAX_SAFEKEEPERS) {
|
||||
wpg_log(FATAL, "too many safekeepers");
|
||||
}
|
||||
|
||||
coma = strchr(coma, ',');
|
||||
safekeepers[n_safekeepers - 1] = curr_sk;
|
||||
safekeepers[n_safekeepers-1] = curr_sk;
|
||||
|
||||
if (coma != NULL)
|
||||
{
|
||||
if (coma != NULL) {
|
||||
*coma++ = '\0';
|
||||
}
|
||||
}
|
||||
@@ -267,10 +252,10 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
|
||||
static bool
|
||||
safekeepers_cmp(char *old, char *new)
|
||||
{
|
||||
char *safekeepers_old[MAX_SAFEKEEPERS];
|
||||
char *safekeepers_new[MAX_SAFEKEEPERS];
|
||||
int len_old = 0;
|
||||
int len_new = 0;
|
||||
char *safekeepers_old[MAX_SAFEKEEPERS];
|
||||
char *safekeepers_new[MAX_SAFEKEEPERS];
|
||||
int len_old = 0;
|
||||
int len_new = 0;
|
||||
|
||||
len_old = split_safekeepers_list(old, safekeepers_old);
|
||||
len_new = split_safekeepers_list(new, safekeepers_new);
|
||||
@@ -307,8 +292,7 @@ assign_neon_safekeepers(const char *newval, void *extra)
|
||||
if (!am_walproposer)
|
||||
return;
|
||||
|
||||
if (!newval)
|
||||
{
|
||||
if (!newval) {
|
||||
/* should never happen */
|
||||
wpg_log(FATAL, "neon.safekeepers is empty");
|
||||
}
|
||||
@@ -317,11 +301,11 @@ assign_neon_safekeepers(const char *newval, void *extra)
|
||||
newval_copy = pstrdup(newval);
|
||||
oldval = pstrdup(wal_acceptors_list);
|
||||
|
||||
/*
|
||||
/*
|
||||
* TODO: restarting through FATAL is stupid and introduces 1s delay before
|
||||
* next bgw start. We should refactor walproposer to allow graceful exit
|
||||
* and thus remove this delay. XXX: If you change anything here, sync with
|
||||
* test_safekeepers_reconfigure_reorder.
|
||||
* next bgw start. We should refactor walproposer to allow graceful exit and
|
||||
* thus remove this delay.
|
||||
* XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder.
|
||||
*/
|
||||
if (!safekeepers_cmp(oldval, newval_copy))
|
||||
{
|
||||
@@ -470,8 +454,7 @@ backpressure_throttling_impl(void)
|
||||
memcpy(new_status, old_status, len);
|
||||
snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag);
|
||||
set_ps_display(new_status);
|
||||
new_status[len] = '\0'; /* truncate off " backpressure ..." to later
|
||||
* reset the ps */
|
||||
new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */
|
||||
|
||||
elog(DEBUG2, "backpressure throttling: lag %lu", lag);
|
||||
start = GetCurrentTimestamp();
|
||||
@@ -638,7 +621,7 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
|
||||
wpg_log(LOG, "WAL proposer starts streaming at %X/%X",
|
||||
LSN_FORMAT_ARGS(startpos));
|
||||
cmd.slotname = WAL_PROPOSER_SLOT_NAME;
|
||||
cmd.timeline = wp->config->pgTimeline;
|
||||
cmd.timeline = wp->greetRequest.timeline;
|
||||
cmd.startpoint = startpos;
|
||||
StartProposerReplication(wp, &cmd);
|
||||
}
|
||||
@@ -1980,11 +1963,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
|
||||
FullTransactionId xmin = hsFeedback.xmin;
|
||||
FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
|
||||
FullTransactionId next_xid = ReadNextFullTransactionId();
|
||||
|
||||
/*
|
||||
* Page server is updating nextXid in checkpoint each 1024
|
||||
* transactions, so feedback xmin can be actually larger then nextXid
|
||||
* and function TransactionIdInRecentPast return false in this case,
|
||||
* Page server is updating nextXid in checkpoint each 1024 transactions,
|
||||
* so feedback xmin can be actually larger then nextXid and
|
||||
* function TransactionIdInRecentPast return false in this case,
|
||||
* preventing update of slot's xmin.
|
||||
*/
|
||||
if (FullTransactionIdPrecedes(next_xid, xmin))
|
||||
|
||||
@@ -19,6 +19,7 @@ aws-config.workspace = true
aws-sdk-iam.workspace = true
aws-sigv4.workspace = true
base64.workspace = true
boxcar = "0.2.8"
bstr.workspace = true
bytes = { workspace = true, features = ["serde"] }
camino.workspace = true
@@ -42,6 +43,7 @@ hyper0.workspace = true
hyper = { workspace = true, features = ["server", "http1", "http2"] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
http-body-util = { version = "0.1" }
gettid = "0.1.3"
indexmap = { workspace = true, features = ["serde"] }
ipnet.workspace = true
itertools.workspace = true
@@ -50,6 +52,8 @@ lasso = { workspace = true, features = ["multi-threaded"] }
measured = { workspace = true, features = ["lasso"] }
metrics.workspace = true
once_cell.workspace = true
opentelemetry = { workspace = true, features = ["trace"] }
papaya = "0.1.8"
parking_lot.workspace = true
parquet.workspace = true
parquet_derive.workspace = true
@@ -89,6 +93,9 @@ tokio = { workspace = true, features = ["signal"] }
tracing-subscriber.workspace = true
tracing-utils.workspace = true
tracing.workspace = true
tracing-log.workspace = true
tracing-serde.workspace = true
tracing-opentelemetry.workspace = true
try-lock.workspace = true
typed-json.workspace = true
url.workspace = true
@@ -112,6 +119,7 @@ rsa = "0.9"
workspace_hack.workspace = true

[dev-dependencies]
assert-json-diff.workspace = true
camino-tempfile.workspace = true
fallible-iterator.workspace = true
flate2.workspace = true

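The boxcar, gettid and papaya additions above feed the JSON logging layer in the next diff. A minimal sketch of the papaya API that layer relies on (pin-based guarded access; assuming papaya 0.1):

    // Standalone example of papaya's lock-free map: `pin()` yields a
    // guard-scoped handle through which all reads and writes go.
    fn main() {
        let map: papaya::HashMap<&'static str, u64> = papaya::HashMap::new();
        let m = map.pin();
        m.insert("requests", 1);
        assert_eq!(m.get("requests"), Some(&1));
    }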
@@ -1,10 +1,23 @@
use tracing::Subscriber;
use std::cell::{Cell, RefCell};
use std::collections::HashMap;
use std::hash::BuildHasher;
use std::{env, io};

use chrono::{DateTime, Utc};
use opentelemetry::trace::TraceContextExt;
use scopeguard::defer;
use serde::ser::{SerializeMap, Serializer};
use tracing::span;
use tracing::subscriber::Interest;
use tracing::{callsite, Event, Metadata, Span, Subscriber};
use tracing_opentelemetry::OpenTelemetrySpanExt;
use tracing_subscriber::filter::{EnvFilter, LevelFilter};
use tracing_subscriber::fmt::format::{Format, Full};
use tracing_subscriber::fmt::time::SystemTime;
use tracing_subscriber::fmt::{FormatEvent, FormatFields};
use tracing_subscriber::layer::{Context, Layer};
use tracing_subscriber::prelude::*;
use tracing_subscriber::registry::LookupSpan;
use tracing_subscriber::registry::{LookupSpan, SpanRef};

/// Initialize logging and OpenTelemetry tracing and exporter.
///
@@ -15,6 +28,8 @@ use tracing_subscriber::registry::LookupSpan;
/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`.
/// See <https://opentelemetry.io/docs/reference/specification/sdk-environment-variables>
pub async fn init() -> anyhow::Result<LoggingGuard> {
    let logfmt = LogFormat::from_env()?;

    let env_filter = EnvFilter::builder()
        .with_default_directive(LevelFilter::INFO.into())
        .from_env_lossy()
@@ -29,17 +44,36 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
                .expect("this should be a valid filter directive"),
        );

    let fmt_layer = tracing_subscriber::fmt::layer()
        .with_ansi(false)
        .with_writer(std::io::stderr)
        .with_target(false);

    let otlp_layer = tracing_utils::init_tracing("proxy").await;

    let json_log_layer = if logfmt == LogFormat::Json {
        Some(JsonLoggingLayer {
            clock: RealClock,
            skipped_field_indices: papaya::HashMap::default(),
            writer: StderrWriter {
                stderr: std::io::stderr(),
            },
        })
    } else {
        None
    };

    let text_log_layer = if logfmt == LogFormat::Text {
        Some(
            tracing_subscriber::fmt::layer()
                .with_ansi(false)
                .with_writer(std::io::stderr)
                .with_target(false),
        )
    } else {
        None
    };

    tracing_subscriber::registry()
        .with(env_filter)
        .with(otlp_layer)
        .with(fmt_layer)
        .with(json_log_layer)
        .with(text_log_layer)
        .try_init()?;

    Ok(LoggingGuard)
@@ -94,3 +128,857 @@ impl Drop for LoggingGuard {
        tracing_utils::shutdown_tracing();
    }
}

// TODO: make JSON the default
#[derive(Copy, Clone, PartialEq, Eq, Default, Debug)]
enum LogFormat {
    #[default]
    Text = 1,
    Json,
}

impl LogFormat {
    fn from_env() -> anyhow::Result<Self> {
        let logfmt = env::var("LOGFMT");
        Ok(match logfmt.as_deref() {
            Err(_) => LogFormat::default(),
            Ok("text") => LogFormat::Text,
            Ok("json") => LogFormat::Json,
            Ok(logfmt) => anyhow::bail!("unknown log format: {logfmt}"),
        })
    }
}

trait MakeWriter {
    fn make_writer(&self) -> impl io::Write;
}

struct StderrWriter {
    stderr: io::Stderr,
}

impl MakeWriter for StderrWriter {
    #[inline]
    fn make_writer(&self) -> impl io::Write {
        self.stderr.lock()
    }
}

// TODO: move into separate module or even separate crate.
trait Clock {
    fn now(&self) -> DateTime<Utc>;
}

struct RealClock;

impl Clock for RealClock {
    #[inline]
    fn now(&self) -> DateTime<Utc> {
        Utc::now()
    }
}

/// Name of the field used by tracing crate to store the event message.
const MESSAGE_FIELD: &str = "message";

thread_local! {
    /// Protects against deadlocks and double panics during log writing.
    /// The current panic handler will use tracing to log panic information.
    static REENTRANCY_GUARD: Cell<bool> = const { Cell::new(false) };
    /// Thread-local instance with per-thread buffer for log writing.
    static EVENT_FORMATTER: RefCell<EventFormatter> = RefCell::new(EventFormatter::new());
    /// Cached OS thread ID.
    static THREAD_ID: u64 = gettid::gettid();
}

/// Implements a tracing layer to handle events specific to logging.
struct JsonLoggingLayer<C: Clock, W: MakeWriter> {
    clock: C,
    skipped_field_indices: papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
    writer: W,
}

impl<S, C: Clock + 'static, W: MakeWriter + 'static> Layer<S> for JsonLoggingLayer<C, W>
where
    S: Subscriber + for<'a> LookupSpan<'a>,
{
    fn on_event(&self, event: &Event<'_>, ctx: Context<'_, S>) {
        use std::io::Write;

        // TODO: consider special tracing subscriber to grab timestamp very
        // early, before OTel machinery, and add as event extension.
        let now = self.clock.now();

        let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| {
            if entered.get() {
                let mut formatter = EventFormatter::new();
                formatter.format(now, event, &ctx, &self.skipped_field_indices)?;
                self.writer.make_writer().write_all(formatter.buffer())
            } else {
                entered.set(true);
                defer!(entered.set(false););

                EVENT_FORMATTER.with_borrow_mut(move |formatter| {
                    formatter.reset();
                    formatter.format(now, event, &ctx, &self.skipped_field_indices)?;
                    self.writer.make_writer().write_all(formatter.buffer())
                })
            }
        });

        // In case logging fails we generate a simpler JSON object.
        if let Err(err) = res {
            if let Ok(mut line) = serde_json::to_vec(&serde_json::json!( {
                "timestamp": now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true),
                "level": "ERROR",
                "message": format_args!("cannot log event: {err:?}"),
                "fields": {
                    "event": format_args!("{event:?}"),
                },
            })) {
                line.push(b'\n');
                self.writer.make_writer().write_all(&line).ok();
            }
        }
    }

    /// Registers a SpanFields instance as span extension.
    fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) {
        let span = ctx.span(id).expect("span must exist");
        let fields = SpanFields::default();
        fields.record_fields(attrs);
        // This could deadlock when there's a panic somewhere in the tracing
        // event handling and a read or write guard is still held. This includes
        // the OTel subscriber.
        span.extensions_mut().insert(fields);
    }

    fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) {
        let span = ctx.span(id).expect("span must exist");
        let ext = span.extensions();
        if let Some(data) = ext.get::<SpanFields>() {
            data.record_fields(values);
        }
    }

    /// Called (lazily) whenever a new log call is executed. We quickly check
    /// for duplicate field names and record duplicates as skippable. Last one
    /// wins.
    fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest {
        if !metadata.is_event() {
            // Must not return `Interest::never()`, because then we wouldn't
            // get trace and span data.
            return Interest::always();
        }

        let mut field_indices = SkippedFieldIndices::default();
        let mut seen_fields = HashMap::<&'static str, usize>::new();
        for field in metadata.fields() {
            use std::collections::hash_map::Entry;
            match seen_fields.entry(field.name()) {
                Entry::Vacant(entry) => {
                    // field not seen yet
                    entry.insert(field.index());
                }
                Entry::Occupied(mut entry) => {
                    // replace currently stored index
                    let old_index = entry.insert(field.index());
                    // ... and append it to list of skippable indices
                    field_indices.push(old_index);
                }
            }
        }

        if !field_indices.is_empty() {
            self.skipped_field_indices
                .pin()
                .insert(metadata.callsite(), field_indices);
        }

        Interest::always()
    }
}

/// Stores span field values recorded during the span's lifetime.
#[derive(Default)]
struct SpanFields {
    // TODO: Switch to custom enum with lasso::Spur for Strings?
    fields: papaya::HashMap<&'static str, serde_json::Value>,
}

impl SpanFields {
    #[inline]
    fn record_fields<R: tracing_subscriber::field::RecordFields>(&self, fields: R) {
        fields.record(&mut SpanFieldsRecorder {
            fields: self.fields.pin(),
        });
    }
}

/// Implements a tracing field visitor to convert and store values.
struct SpanFieldsRecorder<'m, S, G> {
    fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>,
}

impl<S: BuildHasher, G: papaya::Guard> tracing::field::Visit for SpanFieldsRecorder<'_, S, G> {
    #[inline]
    fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
        self.fields
            .insert(field.name(), serde_json::Value::from(value));
    }

    #[inline]
    fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
        self.fields
            .insert(field.name(), serde_json::Value::from(value));
    }

    #[inline]
    fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
        self.fields
            .insert(field.name(), serde_json::Value::from(value));
    }

    #[inline]
    fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
        if let Ok(value) = i64::try_from(value) {
            self.fields
                .insert(field.name(), serde_json::Value::from(value));
        } else {
            self.fields
                .insert(field.name(), serde_json::Value::from(format!("{value}")));
        }
    }

    #[inline]
    fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
        if let Ok(value) = u64::try_from(value) {
            self.fields
                .insert(field.name(), serde_json::Value::from(value));
        } else {
            self.fields
                .insert(field.name(), serde_json::Value::from(format!("{value}")));
        }
    }

    #[inline]
    fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
        self.fields
            .insert(field.name(), serde_json::Value::from(value));
    }

    #[inline]
    fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
        self.fields
            .insert(field.name(), serde_json::Value::from(value));
    }

    #[inline]
    fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
        self.fields
            .insert(field.name(), serde_json::Value::from(value));
    }

    #[inline]
    fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
        self.fields
            .insert(field.name(), serde_json::Value::from(format!("{value:?}")));
    }

    #[inline]
    fn record_error(
        &mut self,
        field: &tracing::field::Field,
        value: &(dyn std::error::Error + 'static),
    ) {
        self.fields
            .insert(field.name(), serde_json::Value::from(format!("{value}")));
    }
}

/// List of field indices skipped during logging. Can list duplicate fields or
/// metafields not meant to be logged.
#[derive(Clone, Default)]
struct SkippedFieldIndices {
    bits: u64,
}

impl SkippedFieldIndices {
    #[inline]
    fn is_empty(&self) -> bool {
        self.bits == 0
    }

    #[inline]
    fn push(&mut self, index: usize) {
        self.bits |= 1u64
            .checked_shl(index as u32)
            .expect("field index too large");
    }

    #[inline]
    fn contains(&self, index: usize) -> bool {
        self.bits
            & 1u64
                .checked_shl(index as u32)
                .expect("field index too large")
            != 0
    }
}

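A worked illustration of the bitset above: for an event like `error!(a = 1, a = 2, a = 3, ...)`, `register_callsite` records the first two indices of `a` as skippable, so only the last duplicate is ever serialized. This sketch mirrors the `push`/`contains` arithmetic (index values are hypothetical):

    fn main() {
        let mut bits = 0u64; // same representation as SkippedFieldIndices::bits
        for old_index in [0u32, 1] {
            bits |= 1u64.checked_shl(old_index).expect("field index too large");
        }
        assert!(bits & (1 << 0) != 0); // first `a` is skipped
        assert!(bits & (1 << 1) != 0); // second `a` is skipped
        assert!(bits & (1 << 2) == 0); // last `a` wins and is logged
    }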
/// Formats a tracing event and writes JSON to its internal buffer including a newline.
// TODO: buffer capacity management, truncate if too large
struct EventFormatter {
    logline_buffer: Vec<u8>,
}

impl EventFormatter {
    #[inline]
    fn new() -> Self {
        EventFormatter {
            logline_buffer: Vec::new(),
        }
    }

    #[inline]
    fn buffer(&self) -> &[u8] {
        &self.logline_buffer
    }

    #[inline]
    fn reset(&mut self) {
        self.logline_buffer.clear();
    }

    fn format<S>(
        &mut self,
        now: DateTime<Utc>,
        event: &Event<'_>,
        ctx: &Context<'_, S>,
        skipped_field_indices: &papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
    ) -> io::Result<()>
    where
        S: Subscriber + for<'a> LookupSpan<'a>,
    {
        let timestamp = now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true);

        use tracing_log::NormalizeEvent;
        let normalized_meta = event.normalized_metadata();
        let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata());

        let skipped_field_indices = skipped_field_indices.pin();
        let skipped_field_indices = skipped_field_indices.get(&meta.callsite());

        let mut serialize = || {
            let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer);

            let mut serializer = serializer.serialize_map(None)?;

            // Timestamp comes first, so raw lines can be sorted by timestamp.
            serializer.serialize_entry("timestamp", &timestamp)?;

            // Level next.
            serializer.serialize_entry("level", &meta.level().as_str())?;

            // Message next.
            serializer.serialize_key("message")?;
            let mut message_extractor =
                MessageFieldExtractor::new(serializer, skipped_field_indices);
            event.record(&mut message_extractor);
            let mut serializer = message_extractor.into_serializer()?;

            let mut fields_present = FieldsPresent(false, skipped_field_indices);
            event.record(&mut fields_present);
            if fields_present.0 {
                serializer.serialize_entry(
                    "fields",
                    &SerializableEventFields(event, skipped_field_indices),
                )?;
            }

            let pid = std::process::id();
            if pid != 1 {
                serializer.serialize_entry("process_id", &pid)?;
            }

            THREAD_ID.with(|tid| serializer.serialize_entry("thread_id", tid))?;

            // TODO: tls cache? name could change
            if let Some(thread_name) = std::thread::current().name() {
                if !thread_name.is_empty() && thread_name != "tokio-runtime-worker" {
                    serializer.serialize_entry("thread_name", thread_name)?;
                }
            }

            if let Some(task_id) = tokio::task::try_id() {
                serializer.serialize_entry("task_id", &format_args!("{task_id}"))?;
            }

            serializer.serialize_entry("target", meta.target())?;

            if let Some(module) = meta.module_path() {
                if module != meta.target() {
                    serializer.serialize_entry("module", module)?;
                }
            }

            if let Some(file) = meta.file() {
                if let Some(line) = meta.line() {
                    serializer.serialize_entry("src", &format_args!("{file}:{line}"))?;
                } else {
                    serializer.serialize_entry("src", file)?;
                }
            }

            {
                let otel_context = Span::current().context();
                let otel_spanref = otel_context.span();
                let span_context = otel_spanref.span_context();
                if span_context.is_valid() {
                    serializer.serialize_entry(
                        "trace_id",
                        &format_args!("{}", span_context.trace_id()),
                    )?;
                }
            }

            serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?;

            serializer.end()
        };

        serialize().map_err(io::Error::other)?;
        self.logline_buffer.push(b'\n');
        Ok(())
    }
}

/// Extracts the message field that's mixed in with other fields.
struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> {
    serializer: S,
    skipped_field_indices: Option<&'a SkippedFieldIndices>,
    state: Option<Result<(), S::Error>>,
}

impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> {
    #[inline]
    fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
        Self {
            serializer,
            skipped_field_indices,
            state: None,
        }
    }

    #[inline]
    fn into_serializer(mut self) -> Result<S, S::Error> {
        match self.state {
            Some(Ok(())) => {}
            Some(Err(err)) => return Err(err),
            None => self.serializer.serialize_value("")?,
        }
        Ok(self.serializer)
    }

    #[inline]
    fn accept_field(&self, field: &tracing::field::Field) -> bool {
        self.state.is_none()
            && field.name() == MESSAGE_FIELD
            && !self
                .skipped_field_indices
                .is_some_and(|i| i.contains(field.index()))
    }
}

impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtractor<'_, S> {
    #[inline]
    fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
        if self.accept_field(field) {
            self.state = Some(self.serializer.serialize_value(&value));
        }
    }

    #[inline]
    fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
        if self.accept_field(field) {
            self.state = Some(self.serializer.serialize_value(&value));
        }
    }

    #[inline]
    fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
        if self.accept_field(field) {
            self.state = Some(self.serializer.serialize_value(&value));
        }
    }

    #[inline]
    fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
        if self.accept_field(field) {
            self.state = Some(self.serializer.serialize_value(&value));
        }
    }

    #[inline]
    fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
        if self.accept_field(field) {
            self.state = Some(self.serializer.serialize_value(&value));
        }
    }

    #[inline]
    fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
        if self.accept_field(field) {
            self.state = Some(self.serializer.serialize_value(&value));
        }
    }

    #[inline]
    fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
        if self.accept_field(field) {
            self.state = Some(self.serializer.serialize_value(&format_args!("{value:x?}")));
        }
    }

    #[inline]
    fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
        if self.accept_field(field) {
            self.state = Some(self.serializer.serialize_value(&value));
        }
    }

    #[inline]
    fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
        if self.accept_field(field) {
            self.state = Some(self.serializer.serialize_value(&format_args!("{value:?}")));
        }
    }

    #[inline]
    fn record_error(
        &mut self,
        field: &tracing::field::Field,
        value: &(dyn std::error::Error + 'static),
    ) {
        if self.accept_field(field) {
            self.state = Some(self.serializer.serialize_value(&format_args!("{value}")));
        }
    }
}

/// Checks if any fields and field values are present. If not, the JSON
/// subobject can be skipped.
// This is entirely optional and only cosmetic, though it may help a bit
// during log parsing in dashboards by not emitting an empty fields object.
struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>);

// Even though some methods have an overhead (error, bytes) it is assumed the
// compiler won't include this since we ignore the value entirely.
impl tracing::field::Visit for FieldsPresent<'_> {
    #[inline]
    fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) {
        if !self.1.is_some_and(|i| i.contains(field.index()))
            && field.name() != MESSAGE_FIELD
            && !field.name().starts_with("log.")
        {
            self.0 |= true;
        }
    }
}

/// Serializes the fields directly supplied with a log event.
struct SerializableEventFields<'a, 'event>(
    &'a tracing::Event<'event>,
    Option<&'a SkippedFieldIndices>,
);

impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        use serde::ser::SerializeMap;
        let serializer = serializer.serialize_map(None)?;
        let mut message_skipper = MessageFieldSkipper::new(serializer, self.1);
        self.0.record(&mut message_skipper);
        let serializer = message_skipper.into_serializer()?;
        serializer.end()
    }
}

/// A tracing field visitor that skips the message field.
struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> {
    serializer: S,
    skipped_field_indices: Option<&'a SkippedFieldIndices>,
    state: Result<(), S::Error>,
}

impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> {
    #[inline]
    fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
        Self {
            serializer,
            skipped_field_indices,
            state: Ok(()),
        }
    }

    #[inline]
    fn accept_field(&self, field: &tracing::field::Field) -> bool {
        self.state.is_ok()
            && field.name() != MESSAGE_FIELD
            && !field.name().starts_with("log.")
            && !self
                .skipped_field_indices
                .is_some_and(|i| i.contains(field.index()))
    }

    #[inline]
    fn into_serializer(self) -> Result<S, S::Error> {
        self.state?;
        Ok(self.serializer)
    }
}

impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<'_, S> {
    #[inline]
    fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
        if self.accept_field(field) {
            self.state = self.serializer.serialize_entry(field.name(), &value);
        }
    }

    #[inline]
    fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
        if self.accept_field(field) {
            self.state = self.serializer.serialize_entry(field.name(), &value);
        }
    }

    #[inline]
    fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
        if self.accept_field(field) {
            self.state = self.serializer.serialize_entry(field.name(), &value);
        }
    }

    #[inline]
    fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
        if self.accept_field(field) {
            self.state = self.serializer.serialize_entry(field.name(), &value);
        }
    }

    #[inline]
    fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
        if self.accept_field(field) {
            self.state = self.serializer.serialize_entry(field.name(), &value);
        }
    }

    #[inline]
    fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
        if self.accept_field(field) {
            self.state = self.serializer.serialize_entry(field.name(), &value);
        }
    }

    #[inline]
    fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
        if self.accept_field(field) {
            self.state = self
                .serializer
                .serialize_entry(field.name(), &format_args!("{value:x?}"));
        }
    }

    #[inline]
    fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
        if self.accept_field(field) {
            self.state = self.serializer.serialize_entry(field.name(), &value);
        }
    }

    #[inline]
    fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
        if self.accept_field(field) {
            self.state = self
                .serializer
                .serialize_entry(field.name(), &format_args!("{value:?}"));
        }
    }

    #[inline]
    fn record_error(
        &mut self,
        field: &tracing::field::Field,
        value: &(dyn std::error::Error + 'static),
    ) {
        if self.accept_field(field) {
            // Write a full entry; `serialize_value` alone would emit a value
            // without its key and corrupt the map.
            self.state = self
                .serializer
                .serialize_entry(field.name(), &format_args!("{value}"));
        }
    }
}

/// Serializes the span stack from root to leaf (parent of event) enumerated
/// inside an object where the keys are just the number padded with zeroes
/// to retain sorting order.
// The object is necessary because Loki cannot flatten arrays.
struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>)
where
    Span: Subscriber + for<'lookup> LookupSpan<'lookup>;

impl<Span> serde::ser::Serialize for SerializableSpanStack<'_, '_, Span>
where
    Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
{
    fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
    where
        Ser: serde::ser::Serializer,
    {
        let mut serializer = serializer.serialize_map(None)?;

        if let Some(leaf_span) = self.0.lookup_current() {
            for (i, span) in leaf_span.scope().from_root().enumerate() {
                serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?;
            }
        }

        serializer.end()
    }
}

/// Serializes a single span. Includes the span ID, name and its fields as
/// recorded up to this point.
struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>)
where
    Span: for<'lookup> LookupSpan<'lookup>;

impl<Span> serde::ser::Serialize for SerializableSpan<'_, '_, Span>
where
    Span: for<'lookup> LookupSpan<'lookup>,
{
    fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
    where
        Ser: serde::ser::Serializer,
    {
        let mut serializer = serializer.serialize_map(None)?;
        // TODO: the span ID is probably only useful for debugging tracing.
        serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?;
        serializer.serialize_entry("span_name", self.0.metadata().name())?;

        let ext = self.0.extensions();
        if let Some(data) = ext.get::<SpanFields>() {
            for (key, value) in &data.fields.pin() {
                serializer.serialize_entry(key, value)?;
            }
        }

        serializer.end()
    }
}

#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use std::sync::{Arc, Mutex, MutexGuard};

    use assert_json_diff::assert_json_eq;
    use tracing::info_span;

    use super::*;

    struct TestClock {
        current_time: Mutex<DateTime<Utc>>,
    }

    impl Clock for Arc<TestClock> {
        fn now(&self) -> DateTime<Utc> {
            *self.current_time.lock().expect("poisoned")
        }
    }

    struct VecWriter<'a> {
        buffer: MutexGuard<'a, Vec<u8>>,
    }

    impl MakeWriter for Arc<Mutex<Vec<u8>>> {
        fn make_writer(&self) -> impl io::Write {
            VecWriter {
                buffer: self.lock().expect("poisoned"),
            }
        }
    }

    impl io::Write for VecWriter<'_> {
        fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
            self.buffer.write(buf)
        }

        fn flush(&mut self) -> io::Result<()> {
            Ok(())
        }
    }

    #[test]
    fn test_field_collection() {
        let clock = Arc::new(TestClock {
            current_time: Mutex::new(Utc::now()),
        });
        let buffer = Arc::new(Mutex::new(Vec::new()));
        let log_layer = JsonLoggingLayer {
            clock: clock.clone(),
            skipped_field_indices: papaya::HashMap::default(),
            writer: buffer.clone(),
        };

        let registry = tracing_subscriber::Registry::default().with(log_layer);

        tracing::subscriber::with_default(registry, || {
            info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| {
                info_span!("span2").in_scope(|| {
                    tracing::error!(
                        a = 1,
                        a = 2,
                        a = 3,
                        message = "explicit message field",
                        "implicit message field"
                    );
                });
            });
        });

        let buffer = Arc::try_unwrap(buffer)
            .expect("no other reference")
            .into_inner()
            .expect("poisoned");
        let actual: serde_json::Value = serde_json::from_slice(&buffer).expect("valid JSON");
        let expected: serde_json::Value = serde_json::json!(
            {
                "timestamp": clock.now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true),
                "level": "ERROR",
                "message": "explicit message field",
                "fields": {
                    "a": 3,
                },
                "spans": {
                    "00": {
                        "span_id": "0000000000000001",
                        "span_name": "span1",
                        "x": 42,
                    },
                    "01": {
                        "span_id": "0000000000000002",
                        "span_name": "span2",
                    }
                },
                "src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(),
                "target": "proxy::logging::tests",
                "process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(),
                "thread_id": actual.as_object().unwrap().get("thread_id").unwrap().as_number().unwrap(),
                "thread_name": "logging::tests::test_field_collection",
            }
        );

        assert_json_eq!(actual, expected);
    }
}

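For orientation, a sketch of how a binary would wire up the `init` above (the module path `proxy::logging` and the endpoint value are assumptions; run with `LOGFMT=json` to select the JSON layer exercised by the test):

    #[tokio::main]
    async fn main() -> anyhow::Result<()> {
        // Keep the guard alive; dropping it shuts down the tracing exporters.
        let _guard = proxy::logging::init().await?;
        tracing::info!(endpoint = "ep-example-123", "proxy started");
        Ok(())
    }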
@@ -88,12 +88,13 @@ fn bench_process_msg(c: &mut Criterion) {
        let (lsn, record) = walgen.next().expect("endless WAL");
        ProposerAcceptorMessage::AppendRequest(AppendRequest {
            h: AppendRequestHeader {
                generation: 0,
                term: 1,
                term_start_lsn: Lsn(0),
                begin_lsn: lsn,
                end_lsn: lsn + record.len() as u64,
                commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record
                truncate_lsn: Lsn(0),
                proposer_uuid: [0; 16],
            },
            wal_data: record,
        })
@@ -159,12 +160,13 @@ fn bench_wal_acceptor(c: &mut Criterion) {
        .take(n)
        .map(|(lsn, record)| AppendRequest {
            h: AppendRequestHeader {
                generation: 0,
                term: 1,
                term_start_lsn: Lsn(0),
                begin_lsn: lsn,
                end_lsn: lsn + record.len() as u64,
                commit_lsn: Lsn(0),
                truncate_lsn: Lsn(0),
                proposer_uuid: [0; 16],
            },
            wal_data: record,
        })
@@ -260,12 +262,13 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) {
    runtime.block_on(async {
        let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest {
            h: AppendRequestHeader {
                generation: 0,
                term: 1,
                term_start_lsn: Lsn(0),
                begin_lsn: lsn,
                end_lsn: lsn + record.len() as u64,
                commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record
                truncate_lsn: Lsn(0),
                proposer_uuid: [0; 16],
            },
            wal_data: record,
        });

@@ -8,7 +8,7 @@

use anyhow::Context;
use postgres_backend::QueryError;
use safekeeper_api::membership::{Configuration, INVALID_GENERATION};
use safekeeper_api::membership::Configuration;
use safekeeper_api::{ServerInfo, Term};
use serde::{Deserialize, Serialize};
use tokio::io::{AsyncRead, AsyncWrite};
@@ -133,10 +133,10 @@ async fn send_proposer_elected(
    let history = TermHistory(history_entries);

    let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected {
        generation: INVALID_GENERATION,
        term,
        start_streaming_at: lsn,
        term_history: history,
        timeline_start_lsn: lsn,
    });

    tli.process_msg(&proposer_elected_request).await?;
@@ -170,12 +170,13 @@ pub async fn append_logical_message(

    let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest {
        h: AppendRequestHeader {
            generation: INVALID_GENERATION,
            term: msg.term,
            term_start_lsn: begin_lsn,
            begin_lsn,
            end_lsn,
            commit_lsn,
            truncate_lsn: msg.truncate_lsn,
            proposer_uuid: [0u8; 16],
        },
        wal_data,
    });

@@ -281,7 +281,7 @@ impl SafekeeperPostgresHandler {
        tokio::select! {
            // todo: add read|write .context to these errors
            r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r,
            r = network_write(pgb, reply_rx, pageserver_feedback_rx, proto_version) => r,
            r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
            _ = timeline_cancel.cancelled() => {
                return Err(CopyStreamHandlerEnd::Cancelled);
            }
@@ -342,8 +342,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
        let tli = match next_msg {
            ProposerAcceptorMessage::Greeting(ref greeting) => {
                info!(
                    "start handshake with walproposer {} sysid {}",
                    self.peer_addr, greeting.system_id,
                    "start handshake with walproposer {} sysid {} timeline {}",
                    self.peer_addr, greeting.system_id, greeting.tli,
                );
                let server_info = ServerInfo {
                    pg_version: greeting.pg_version,
@@ -459,7 +459,6 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
    pgb_writer: &mut PostgresBackend<IO>,
    mut reply_rx: Receiver<AcceptorProposerMessage>,
    mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback>,
    proto_version: u32,
) -> Result<(), CopyStreamHandlerEnd> {
    let mut buf = BytesMut::with_capacity(128);

@@ -497,7 +496,7 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
        };

        buf.clear();
        msg.serialize(&mut buf, proto_version)?;
        msg.serialize(&mut buf)?;
        pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?;
    }
}

@@ -7,7 +7,6 @@ use std::{fmt, pin::pin};
use anyhow::{bail, Context};
use futures::StreamExt;
use postgres_protocol::message::backend::ReplicationMessage;
use safekeeper_api::membership::INVALID_GENERATION;
use safekeeper_api::models::{PeerInfo, TimelineStatus};
use safekeeper_api::Term;
use tokio::sync::mpsc::{channel, Receiver, Sender};
@@ -268,10 +267,7 @@ async fn recover(
    );

    // Now understand our term history.
    let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest {
        generation: INVALID_GENERATION,
        term: donor.term,
    });
    let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
    let vote_response = match tli
        .process_msg(&vote_request)
        .await
@@ -306,10 +302,10 @@ async fn recover(

    // truncate WAL locally
    let pe = ProposerAcceptorMessage::Elected(ProposerElected {
        generation: INVALID_GENERATION,
        term: donor.term,
        start_streaming_at: last_common_point.lsn,
        term_history: donor_th,
        timeline_start_lsn: Lsn::INVALID,
    });
    // Successful ProposerElected handling always returns None. If term changed,
    // we'll find that out during the streaming. Note: it is expected to get
@@ -438,12 +434,13 @@ async fn network_io(
        match msg {
            ReplicationMessage::XLogData(xlog_data) => {
                let ar_hdr = AppendRequestHeader {
                    generation: INVALID_GENERATION,
                    term: donor.term,
                    term_start_lsn: Lsn::INVALID, // unused
                    begin_lsn: Lsn(xlog_data.wal_start()),
                    end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
                    commit_lsn: Lsn::INVALID, // do not attempt to advance; peer communication does it anyway
                    truncate_lsn: Lsn::INVALID, // do not attempt to advance
                    proposer_uuid: [0; 16],
                };
                let ar = AppendRequest {
                    h: ar_hdr,

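The protocol diff below drops the v3 framing (a single network-order tag byte) in favor of v2 only (a little-endian u64 tag, kept wide to match C struct padding). A hedged round-trip sketch of both framings using the bytes crate (values illustrative):

    use bytes::{Buf, BufMut, BytesMut};

    fn main() {
        // v3: one tag byte, big-endian payload.
        let mut v3 = BytesMut::new();
        v3.put_u8(b'v');
        v3.put_u64(42); // term
        let mut v3 = v3.freeze();
        assert_eq!(v3.get_u8() as char, 'v');
        assert_eq!(v3.get_u64(), 42);

        // v2: u64 little-endian tag to avoid padding, LE payload.
        let mut v2 = BytesMut::new();
        v2.put_u64_le('v' as u64);
        v2.put_u64_le(42);
        let mut v2 = v2.freeze();
        assert_eq!(v2.get_u64_le() as u8 as char, 'v');
        assert_eq!(v2.get_u64_le(), 42);
    }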
@@ -5,11 +5,6 @@ use byteorder::{LittleEndian, ReadBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};

use postgres_ffi::{TimeLineID, MAX_SEND_SIZE};
use safekeeper_api::membership;
use safekeeper_api::membership::Generation;
use safekeeper_api::membership::MemberSet;
use safekeeper_api::membership::SafekeeperId;
use safekeeper_api::membership::INVALID_GENERATION;
use safekeeper_api::models::HotStandbyFeedback;
use safekeeper_api::Term;
use serde::{Deserialize, Serialize};
@@ -17,7 +12,6 @@ use std::cmp::max;
use std::cmp::min;
use std::fmt;
use std::io::Read;
use std::str::FromStr;
use storage_broker::proto::SafekeeperTimelineInfo;

use tracing::*;
@@ -35,8 +29,7 @@ use utils::{
    lsn::Lsn,
};

pub const SK_PROTO_VERSION_2: u32 = 2;
pub const SK_PROTO_VERSION_3: u32 = 3;
pub const SK_PROTOCOL_VERSION: u32 = 2;
pub const UNKNOWN_SERVER_VERSION: u32 = 0;

#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
@@ -63,28 +56,8 @@ impl TermHistory {
        TermHistory(Vec::new())
    }

    // Parse TermHistory as n_entries followed by TermLsn pairs in network order.
    // Parse TermHistory as n_entries followed by TermLsn pairs
    pub fn from_bytes(bytes: &mut Bytes) -> Result<TermHistory> {
        let n_entries = bytes
            .get_u32_f()
            .with_context(|| "TermHistory misses len")?;
        let mut res = Vec::with_capacity(n_entries as usize);
        for i in 0..n_entries {
            let term = bytes
                .get_u64_f()
                .with_context(|| format!("TermHistory pos {} misses term", i))?;
            let lsn = bytes
                .get_u64_f()
                .with_context(|| format!("TermHistory pos {} misses lsn", i))?
                .into();
            res.push(TermLsn { term, lsn })
        }
        Ok(TermHistory(res))
    }

    // Parse TermHistory as n_entries followed by TermLsn pairs in LE order.
    // TODO remove once v2 protocol is fully dropped.
    pub fn from_bytes_le(bytes: &mut Bytes) -> Result<TermHistory> {
        if bytes.remaining() < 4 {
            bail!("TermHistory misses len");
        }

@@ -224,18 +197,6 @@ impl AcceptorState {
/// Initial Proposer -> Acceptor message
#[derive(Debug, Deserialize)]
pub struct ProposerGreeting {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub mconf: membership::Configuration,
    /// Postgres server version
    pub pg_version: u32,
    pub system_id: SystemId,
    pub wal_seg_size: u32,
}

/// V2 of the message; exists as a struct because we (de)serialized it as is.
#[derive(Debug, Deserialize)]
pub struct ProposerGreetingV2 {
    /// proposer-acceptor protocol version
    pub protocol_version: u32,
    /// Postgres server version
@@ -252,35 +213,27 @@ pub struct ProposerGreetingV2 {
/// (acceptor voted for).
#[derive(Debug, Serialize)]
pub struct AcceptorGreeting {
    node_id: NodeId,
    mconf: membership::Configuration,
    term: u64,
    node_id: NodeId,
}

/// Vote request sent from proposer to safekeepers
#[derive(Debug)]
pub struct VoteRequest {
    pub generation: Generation,
    pub term: Term,
}

/// V2 of the message; exists as a struct because we (de)serialized it as is.
#[derive(Debug, Deserialize)]
pub struct VoteRequestV2 {
pub struct VoteRequest {
    pub term: Term,
}

/// Vote itself, sent from safekeeper to proposer
#[derive(Debug, Serialize)]
pub struct VoteResponse {
    generation: Generation, // membership conf generation
    pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
    vote_given: bool,
    vote_given: u64, // fixme u64 due to padding
    // Safekeeper flush_lsn (end of WAL) + history of term switches allow
    // proposer to choose the most advanced one.
    pub flush_lsn: Lsn,
    truncate_lsn: Lsn,
    pub term_history: TermHistory,
    timeline_start_lsn: Lsn,
}

/*
@@ -289,10 +242,10 @@ pub struct VoteResponse {
*/
#[derive(Debug)]
pub struct ProposerElected {
    pub generation: Generation, // membership conf generation
    pub term: Term,
    pub start_streaming_at: Lsn,
    pub term_history: TermHistory,
    pub timeline_start_lsn: Lsn,
}

/// Request with WAL message sent from proposer to safekeeper. Along the way it
@@ -304,22 +257,6 @@ pub struct AppendRequest {
}
#[derive(Debug, Clone, Deserialize)]
pub struct AppendRequestHeader {
    pub generation: Generation, // membership conf generation
    // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
    pub term: Term,
    /// start position of message in WAL
    pub begin_lsn: Lsn,
    /// end position of message in WAL
    pub end_lsn: Lsn,
    /// LSN committed by quorum of safekeepers
    pub commit_lsn: Lsn,
    /// minimal LSN which may be needed by proposer to perform recovery of some safekeeper
    pub truncate_lsn: Lsn,
}

/// V2 of the message; exists as a struct because we (de)serialized it as is.
#[derive(Debug, Clone, Deserialize)]
pub struct AppendRequestHeaderV2 {
    // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
    pub term: Term,
    // TODO: remove this field from the protocol, it is unused -- LSN of term
@@ -340,9 +277,6 @@ pub struct AppendRequestHeaderV2 {
/// Report safekeeper state to proposer
#[derive(Debug, Serialize, Clone)]
pub struct AppendResponse {
    // Membership conf generation. Not strictly required because on mismatch
    // connection is reset, but let's sanity check it.
    generation: Generation,
    // Current term of the safekeeper; if it is higher than proposer's, the
    // compute is out of date.
    pub term: Term,
@@ -359,9 +293,8 @@ pub struct AppendResponse {
}

impl AppendResponse {
    fn term_only(generation: Generation, term: Term) -> AppendResponse {
    fn term_only(term: Term) -> AppendResponse {
        AppendResponse {
            generation,
            term,
            flush_lsn: Lsn(0),
            commit_lsn: Lsn(0),
@@ -382,316 +315,72 @@ pub enum ProposerAcceptorMessage {
    FlushWAL,
}

/// Augment Bytes with fallible get_uN where N is number of bytes methods.
/// All reads are in network (big endian) order.
trait BytesF {
    fn get_u8_f(&mut self) -> Result<u8>;
    fn get_u16_f(&mut self) -> Result<u16>;
    fn get_u32_f(&mut self) -> Result<u32>;
    fn get_u64_f(&mut self) -> Result<u64>;
}

impl BytesF for Bytes {
    fn get_u8_f(&mut self) -> Result<u8> {
        if self.is_empty() {
            bail!("no bytes left, expected 1");
        }
        Ok(self.get_u8())
    }
    fn get_u16_f(&mut self) -> Result<u16> {
        if self.remaining() < 2 {
            bail!("no bytes left, expected 2");
        }
        Ok(self.get_u16())
    }
    fn get_u32_f(&mut self) -> Result<u32> {
        if self.remaining() < 4 {
            bail!("only {} bytes left, expected 4", self.remaining());
        }
        Ok(self.get_u32())
    }
    fn get_u64_f(&mut self) -> Result<u64> {
        if self.remaining() < 8 {
            bail!("only {} bytes left, expected 8", self.remaining());
        }
        Ok(self.get_u64())
    }
}

impl ProposerAcceptorMessage {
    /// Read cstring from Bytes.
    fn get_cstr(buf: &mut Bytes) -> Result<String> {
        let pos = buf
            .iter()
            .position(|x| *x == 0)
            .ok_or_else(|| anyhow::anyhow!("missing cstring terminator"))?;
        let result = buf.split_to(pos);
        buf.advance(1); // drop the null terminator
        match std::str::from_utf8(&result) {
            Ok(s) => Ok(s.to_string()),
            Err(e) => bail!("invalid utf8 in cstring: {}", e),
        }
    }

    /// Read membership::Configuration from Bytes.
    fn get_mconf(buf: &mut Bytes) -> Result<membership::Configuration> {
        let generation = buf.get_u32_f().with_context(|| "reading generation")?;
        let members_len = buf.get_u32_f().with_context(|| "reading members_len")?;
        // Main member set must have at least someone in valid configuration.
        // Empty conf is allowed until we fully migrate.
        if generation != INVALID_GENERATION && members_len == 0 {
            bail!("empty members_len");
        }
        let mut members = MemberSet::empty();
        for i in 0..members_len {
            let id = buf
                .get_u64_f()
                .with_context(|| format!("reading member {} node_id", i))?;
            let host = Self::get_cstr(buf).with_context(|| format!("reading member {} host", i))?;
            let pg_port = buf
                .get_u16_f()
                .with_context(|| format!("reading member {} port", i))?;
            let sk = SafekeeperId {
                id: NodeId(id),
                host,
                pg_port,
            };
            members.add(sk)?;
        }
        let new_members_len = buf.get_u32_f().with_context(|| "reading new_members_len")?;
        // Non joint conf.
        if new_members_len == 0 {
            Ok(membership::Configuration {
                generation,
                members,
                new_members: None,
            })
        } else {
            let mut new_members = MemberSet::empty();
            for i in 0..new_members_len {
                let id = buf
                    .get_u64_f()
                    .with_context(|| format!("reading new member {} node_id", i))?;
                let host = Self::get_cstr(buf)
                    .with_context(|| format!("reading new member {} host", i))?;
                let pg_port = buf
                    .get_u16_f()
                    .with_context(|| format!("reading new member {} port", i))?;
                let sk = SafekeeperId {
                    id: NodeId(id),
                    host,
                    pg_port,
                };
                new_members.add(sk)?;
            }
            Ok(membership::Configuration {
                generation,
                members,
                new_members: Some(new_members),
            })
        }
    }

    /// Parse proposer message.
    pub fn parse(mut msg_bytes: Bytes, proto_version: u32) -> Result<ProposerAcceptorMessage> {
        if proto_version == SK_PROTO_VERSION_3 {
            if msg_bytes.is_empty() {
                bail!("ProposerAcceptorMessage is not complete: missing tag");
    pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result<ProposerAcceptorMessage> {
        if proto_version != SK_PROTOCOL_VERSION {
            bail!(
                "incompatible protocol version {}, expected {}",
                proto_version,
                SK_PROTOCOL_VERSION
            );
        }
        // xxx using Reader is inefficient but easy to work with bincode
        let mut stream = msg_bytes.reader();
        // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
        let tag = stream.read_u64::<LittleEndian>()? as u8 as char;
        match tag {
            'g' => {
                let msg = ProposerGreeting::des_from(&mut stream)?;
                Ok(ProposerAcceptorMessage::Greeting(msg))
            }
            let tag = msg_bytes.get_u8_f().with_context(|| {
                "ProposerAcceptorMessage is not complete: missing tag".to_string()
            })? as char;
            match tag {
                'g' => {
                    let tenant_id_str =
                        Self::get_cstr(&mut msg_bytes).with_context(|| "reading tenant_id")?;
                    let tenant_id = TenantId::from_str(&tenant_id_str)?;
                    let timeline_id_str =
                        Self::get_cstr(&mut msg_bytes).with_context(|| "reading timeline_id")?;
                    let timeline_id = TimelineId::from_str(&timeline_id_str)?;
                    let mconf = Self::get_mconf(&mut msg_bytes)?;
                    let pg_version = msg_bytes
                        .get_u32_f()
                        .with_context(|| "reading pg_version")?;
                    let system_id = msg_bytes.get_u64_f().with_context(|| "reading system_id")?;
                    let wal_seg_size = msg_bytes
                        .get_u32_f()
                        .with_context(|| "reading wal_seg_size")?;
                    let g = ProposerGreeting {
                        tenant_id,
                        timeline_id,
                        mconf,
                        pg_version,
                        system_id,
                        wal_seg_size,
                    };
                    Ok(ProposerAcceptorMessage::Greeting(g))
                }
                'v' => {
                    let generation = msg_bytes
                        .get_u32_f()
                        .with_context(|| "reading generation")?;
                    let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
                    let v = VoteRequest { generation, term };
                    Ok(ProposerAcceptorMessage::VoteRequest(v))
                }
                'e' => {
                    let generation = msg_bytes
                        .get_u32_f()
                        .with_context(|| "reading generation")?;
                    let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
                    let start_streaming_at: Lsn = msg_bytes
                        .get_u64_f()
                        .with_context(|| "reading start_streaming_at")?
                        .into();
                    let term_history = TermHistory::from_bytes(&mut msg_bytes)?;
                    let msg = ProposerElected {
                        generation,
                        term,
                        start_streaming_at,
                        term_history,
                    };
                    Ok(ProposerAcceptorMessage::Elected(msg))
                }
                'a' => {
                    let generation = msg_bytes
                        .get_u32_f()
                        .with_context(|| "reading generation")?;
                    let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
                    let begin_lsn: Lsn = msg_bytes
                        .get_u64_f()
                        .with_context(|| "reading begin_lsn")?
                        .into();
                    let end_lsn: Lsn = msg_bytes
                        .get_u64_f()
                        .with_context(|| "reading end_lsn")?
                        .into();
                    let commit_lsn: Lsn = msg_bytes
                        .get_u64_f()
                        .with_context(|| "reading commit_lsn")?
                        .into();
                    let truncate_lsn: Lsn = msg_bytes
                        .get_u64_f()
                        .with_context(|| "reading truncate_lsn")?
                        .into();
                    let hdr = AppendRequestHeader {
                        generation,
                        term,
                        begin_lsn,
                        end_lsn,
                        commit_lsn,
                        truncate_lsn,
                    };
                    let rec_size = hdr
                        .end_lsn
                        .checked_sub(hdr.begin_lsn)
                        .context("begin_lsn > end_lsn in AppendRequest")?
                        .0 as usize;
                    if rec_size > MAX_SEND_SIZE {
                        bail!(
                            "AppendRequest is longer than MAX_SEND_SIZE ({})",
                            MAX_SEND_SIZE
                        );
                    }
                    if msg_bytes.remaining() < rec_size {
                        bail!(
                            "reading WAL: only {} bytes left, wanted {}",
                            msg_bytes.remaining(),
                            rec_size
                        );
                    }
                    let wal_data = msg_bytes.copy_to_bytes(rec_size);
                    let msg = AppendRequest { h: hdr, wal_data };

                    Ok(ProposerAcceptorMessage::AppendRequest(msg))
                }
                _ => bail!("unknown proposer-acceptor message tag: {}", tag),
            'v' => {
                let msg = VoteRequest::des_from(&mut stream)?;
                Ok(ProposerAcceptorMessage::VoteRequest(msg))
            }
        } else if proto_version == SK_PROTO_VERSION_2 {
            // xxx using Reader is inefficient but easy to work with bincode
            let mut stream = msg_bytes.reader();
            // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
            let tag = stream.read_u64::<LittleEndian>()? as u8 as char;
            match tag {
                'g' => {
                    let msgv2 = ProposerGreetingV2::des_from(&mut stream)?;
                    let g = ProposerGreeting {
                        tenant_id: msgv2.tenant_id,
                        timeline_id: msgv2.timeline_id,
                        mconf: membership::Configuration {
                            generation: INVALID_GENERATION,
                            members: MemberSet::empty(),
                            new_members: None,
                        },
                        pg_version: msgv2.pg_version,
                        system_id: msgv2.system_id,
                        wal_seg_size: msgv2.wal_seg_size,
                    };
                    Ok(ProposerAcceptorMessage::Greeting(g))
            'e' => {
                let mut msg_bytes = stream.into_inner();
                if msg_bytes.remaining() < 16 {
                    bail!("ProposerElected message is not complete");
                }
                'v' => {
                    let msg = VoteRequestV2::des_from(&mut stream)?;
                    let v = VoteRequest {
                        generation: INVALID_GENERATION,
                        term: msg.term,
                    };
                    Ok(ProposerAcceptorMessage::VoteRequest(v))
                let term = msg_bytes.get_u64_le();
                let start_streaming_at = msg_bytes.get_u64_le().into();
                let term_history = TermHistory::from_bytes(&mut msg_bytes)?;
                if msg_bytes.remaining() < 8 {
                    bail!("ProposerElected message is not complete");
                }
                'e' => {
                    let mut msg_bytes = stream.into_inner();
                    if msg_bytes.remaining() < 16 {
                        bail!("ProposerElected message is not complete");
                    }
                    let term = msg_bytes.get_u64_le();
                    let start_streaming_at = msg_bytes.get_u64_le().into();
                    let term_history = TermHistory::from_bytes_le(&mut msg_bytes)?;
                    if msg_bytes.remaining() < 8 {
                        bail!("ProposerElected message is not complete");
                    }
                    let _timeline_start_lsn = msg_bytes.get_u64_le();
                    let msg = ProposerElected {
                        generation: INVALID_GENERATION,
                        term,
                        start_streaming_at,
                        term_history,
                    };
                    Ok(ProposerAcceptorMessage::Elected(msg))
                }
                'a' => {
                    // read header followed by wal data
                    let hdrv2 = AppendRequestHeaderV2::des_from(&mut stream)?;
                    let hdr = AppendRequestHeader {
                        generation: INVALID_GENERATION,
                        term: hdrv2.term,
                        begin_lsn: hdrv2.begin_lsn,
                        end_lsn: hdrv2.end_lsn,
                        commit_lsn: hdrv2.commit_lsn,
                        truncate_lsn: hdrv2.truncate_lsn,
                    };
                    let rec_size = hdr
                        .end_lsn
                        .checked_sub(hdr.begin_lsn)
                        .context("begin_lsn > end_lsn in AppendRequest")?
                        .0 as usize;
                    if rec_size > MAX_SEND_SIZE {
                        bail!(
                            "AppendRequest is longer than MAX_SEND_SIZE ({})",
                            MAX_SEND_SIZE
                        );
                    }

                    let mut wal_data_vec: Vec<u8> = vec![0; rec_size];
                    stream.read_exact(&mut wal_data_vec)?;
                    let wal_data = Bytes::from(wal_data_vec);

                    let msg = AppendRequest { h: hdr, wal_data };

                    Ok(ProposerAcceptorMessage::AppendRequest(msg))
                }
                _ => bail!("unknown proposer-acceptor message tag: {}", tag),
                let timeline_start_lsn = msg_bytes.get_u64_le().into();
                let msg = ProposerElected {
                    term,
                    start_streaming_at,
                    timeline_start_lsn,
                    term_history,
                };
                Ok(ProposerAcceptorMessage::Elected(msg))
            }
        } else {
            bail!("unsupported protocol version {}", proto_version);
            'a' => {
                // read header followed by wal data
                let hdr = AppendRequestHeader::des_from(&mut stream)?;
                let rec_size = hdr
                    .end_lsn
                    .checked_sub(hdr.begin_lsn)
                    .context("begin_lsn > end_lsn in AppendRequest")?
                    .0 as usize;
                if rec_size > MAX_SEND_SIZE {
                    bail!(
                        "AppendRequest is longer than MAX_SEND_SIZE ({})",
                        MAX_SEND_SIZE
                    );
                }

                let mut wal_data_vec: Vec<u8> = vec![0; rec_size];
                stream.read_exact(&mut wal_data_vec)?;
                let wal_data = Bytes::from(wal_data_vec);
                let msg = AppendRequest { h: hdr, wal_data };

                Ok(ProposerAcceptorMessage::AppendRequest(msg))
            }
            _ => bail!("unknown proposer-acceptor message tag: {}", tag),
        }
    }

@@ -705,21 +394,36 @@ impl ProposerAcceptorMessage {
        // We explicitly list all fields, to draw attention here when new fields are added.
        let mut size = BASE_SIZE;
        size += match self {
            Self::Greeting(_) => 0,
            Self::Greeting(ProposerGreeting {
                protocol_version: _,
                pg_version: _,
                proposer_id: _,
                system_id: _,
                timeline_id: _,
                tenant_id: _,
                tli: _,
                wal_seg_size: _,
            }) => 0,

            Self::VoteRequest(_) => 0,
            Self::VoteRequest(VoteRequest { term: _ }) => 0,

            Self::Elected(_) => 0,
            Self::Elected(ProposerElected {
                term: _,
                start_streaming_at: _,
                term_history: _,
                timeline_start_lsn: _,
            }) => 0,

            Self::AppendRequest(AppendRequest {
                h:
                    AppendRequestHeader {
                        generation: _,
                        term: _,
                        term_start_lsn: _,
                        begin_lsn: _,
                        end_lsn: _,
                        commit_lsn: _,
                        truncate_lsn: _,
                        proposer_uuid: _,
                    },
                wal_data,
            }) => wal_data.len(),
@@ -727,12 +431,13 @@ impl ProposerAcceptorMessage {
            Self::NoFlushAppendRequest(AppendRequest {
                h:
                    AppendRequestHeader {
                        generation: _,
                        term: _,
                        term_start_lsn: _,
                        begin_lsn: _,
                        end_lsn: _,
                        commit_lsn: _,
                        truncate_lsn: _,
                        proposer_uuid: _,
                    },
                wal_data,
            }) => wal_data.len(),
@@ -753,118 +458,45 @@ pub enum AcceptorProposerMessage {
}

impl AcceptorProposerMessage {
    fn put_cstr(buf: &mut BytesMut, s: &str) {
        buf.put_slice(s.as_bytes());
        buf.put_u8(0); // null terminator
    }

    /// Serialize membership::Configuration into buf.
    fn serialize_mconf(buf: &mut BytesMut, mconf: &membership::Configuration) {
        buf.put_u32(mconf.generation);
        buf.put_u32(mconf.members.m.len() as u32);
        for sk in &mconf.members.m {
            buf.put_u64(sk.id.0);
            Self::put_cstr(buf, &sk.host);
            buf.put_u16(sk.pg_port);
        }
        if let Some(ref new_members) = mconf.new_members {
            buf.put_u32(new_members.m.len() as u32);
            for sk in &new_members.m {
                buf.put_u64(sk.id.0);
                Self::put_cstr(buf, &sk.host);
                buf.put_u16(sk.pg_port);
            }
        } else {
            buf.put_u32(0);
        }
    }

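    // Sketch of the resulting wire layout, as implied by the writes above
    // (illustrative, not normative): generation: u32, members.len(): u32, then
    // per member { id: u64, host: NUL-terminated string, pg_port: u16 },
    // followed by new_members in the same shape, with a 0 count when absent.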
    /// Serialize acceptor -> proposer message.
    pub fn serialize(&self, buf: &mut BytesMut, proto_version: u32) -> Result<()> {
        if proto_version == SK_PROTO_VERSION_3 {
            match self {
                AcceptorProposerMessage::Greeting(msg) => {
                    buf.put_u8(b'g');
                    buf.put_u64(msg.node_id.0);
                    Self::serialize_mconf(buf, &msg.mconf);
                    buf.put_u64(msg.term)
    pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
        match self {
            AcceptorProposerMessage::Greeting(msg) => {
                buf.put_u64_le('g' as u64);
                buf.put_u64_le(msg.term);
                buf.put_u64_le(msg.node_id.0);
                }
            AcceptorProposerMessage::VoteResponse(msg) => {
                buf.put_u64_le('v' as u64);
                buf.put_u64_le(msg.term);
                buf.put_u64_le(msg.vote_given);
                buf.put_u64_le(msg.flush_lsn.into());
                buf.put_u64_le(msg.truncate_lsn.into());
                buf.put_u32_le(msg.term_history.0.len() as u32);
                for e in &msg.term_history.0 {
                    buf.put_u64_le(e.term);
                    buf.put_u64_le(e.lsn.into());
                }
                AcceptorProposerMessage::VoteResponse(msg) => {
                    buf.put_u8(b'v');
                    buf.put_u32(msg.generation);
                    buf.put_u64(msg.term);
                    buf.put_u8(msg.vote_given as u8);
                    buf.put_u64(msg.flush_lsn.into());
                    buf.put_u64(msg.truncate_lsn.into());
                    buf.put_u32(msg.term_history.0.len() as u32);
                    for e in &msg.term_history.0 {
                        buf.put_u64(e.term);
                        buf.put_u64(e.lsn.into());
                    }
                }
                AcceptorProposerMessage::AppendResponse(msg) => {
                    buf.put_u8(b'a');
                    buf.put_u32(msg.generation);
                    buf.put_u64(msg.term);
                    buf.put_u64(msg.flush_lsn.into());
                    buf.put_u64(msg.commit_lsn.into());
                    buf.put_i64(msg.hs_feedback.ts);
                    buf.put_u64(msg.hs_feedback.xmin);
                    buf.put_u64(msg.hs_feedback.catalog_xmin);
                buf.put_u64_le(msg.timeline_start_lsn.into());
            }
            AcceptorProposerMessage::AppendResponse(msg) => {
                buf.put_u64_le('a' as u64);
                buf.put_u64_le(msg.term);
                buf.put_u64_le(msg.flush_lsn.into());
                buf.put_u64_le(msg.commit_lsn.into());
                buf.put_i64_le(msg.hs_feedback.ts);
                buf.put_u64_le(msg.hs_feedback.xmin);
                buf.put_u64_le(msg.hs_feedback.catalog_xmin);

                // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
                // if it is not present.
                if let Some(ref msg) = msg.pageserver_feedback {
                    msg.serialize(buf);
                }
                    // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
                    // if it is not present.
                    if let Some(ref msg) = msg.pageserver_feedback {
                        msg.serialize(buf);
                    }
                }
            }
            Ok(())
            // TODO remove 3 after converting all msgs
        } else if proto_version == SK_PROTO_VERSION_2 {
            match self {
                AcceptorProposerMessage::Greeting(msg) => {
                    buf.put_u64_le('g' as u64);
                    // v2 didn't have mconf and fields were reordered
                    buf.put_u64_le(msg.term);
                    buf.put_u64_le(msg.node_id.0);
                }
                AcceptorProposerMessage::VoteResponse(msg) => {
                    // v2 didn't have generation, had u64 vote_given and timeline_start_lsn
                    buf.put_u64_le('v' as u64);
                    buf.put_u64_le(msg.term);
                    buf.put_u64_le(msg.vote_given as u64);
                    buf.put_u64_le(msg.flush_lsn.into());
                    buf.put_u64_le(msg.truncate_lsn.into());
                    buf.put_u32_le(msg.term_history.0.len() as u32);
                    for e in &msg.term_history.0 {
                        buf.put_u64_le(e.term);
                        buf.put_u64_le(e.lsn.into());
                    }
                    // removed timeline_start_lsn
                    buf.put_u64_le(0);
                }
                AcceptorProposerMessage::AppendResponse(msg) => {
                    // v2 didn't have generation
                    buf.put_u64_le('a' as u64);
                    buf.put_u64_le(msg.term);
                    buf.put_u64_le(msg.flush_lsn.into());
                    buf.put_u64_le(msg.commit_lsn.into());
                    buf.put_i64_le(msg.hs_feedback.ts);
                    buf.put_u64_le(msg.hs_feedback.xmin);
                    buf.put_u64_le(msg.hs_feedback.catalog_xmin);

                    // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
                    // if it is not present.
                    if let Some(ref msg) = msg.pageserver_feedback {
                        msg.serialize(buf);
                    }
                }
            }
            Ok(())
        } else {
            bail!("unsupported protocol version {}", proto_version);
        }

        Ok(())
    }
}

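// Taken together, the two branches above show the framing difference: v3 writes
// a one-byte message tag and big-endian integers, and carries the generation
// and mconf fields, while v2 framed the tag as a little-endian u64 word and had
// no generation. The TODO above and the "test both proto versions until we
// fully migrate" notes in the tests suggest the dual-version support is
// temporary, kept only until the migration to v3 completes.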
@@ -961,6 +593,14 @@ where
        &mut self,
        msg: &ProposerGreeting,
    ) -> Result<Option<AcceptorProposerMessage>> {
        // Check protocol compatibility
        if msg.protocol_version != SK_PROTOCOL_VERSION {
            bail!(
                "incompatible protocol version {}, expected {}",
                msg.protocol_version,
                SK_PROTOCOL_VERSION
            );
        }
        /* Postgres major version mismatch is treated as fatal error
         * because safekeepers parse WAL headers and the format
         * may change between versions.
@@ -1015,16 +655,15 @@ where
            self.state.finish_change(&state).await?;
        }

        let apg = AcceptorGreeting {
            node_id: self.node_id,
            mconf: self.state.mconf.clone(),
            term: self.state.acceptor_state.term,
        };
        info!(
            "processed greeting {:?} from walproposer, sending {:?}",
            msg, apg
            "processed greeting from walproposer {}, sending term {:?}",
            msg.proposer_id.map(|b| format!("{:X}", b)).join(""),
            self.state.acceptor_state.term
        );
        Ok(Some(AcceptorProposerMessage::Greeting(apg)))
        Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting {
            term: self.state.acceptor_state.term,
            node_id: self.node_id,
        })))
    }

    /// Give vote for the given term, if we haven't done that previously.
@@ -1045,12 +684,12 @@ where
        self.wal_store.flush_wal().await?;
        // initialize with refusal
        let mut resp = VoteResponse {
            generation: self.state.mconf.generation,
            term: self.state.acceptor_state.term,
            vote_given: false,
            vote_given: false as u64,
            flush_lsn: self.flush_lsn(),
            truncate_lsn: self.state.inmem.peer_horizon_lsn,
            term_history: self.get_term_history(),
            timeline_start_lsn: self.state.timeline_start_lsn,
        };
        if self.state.acceptor_state.term < msg.term {
            let mut state = self.state.start_change();
@@ -1059,16 +698,15 @@ where
            self.state.finish_change(&state).await?;

            resp.term = self.state.acceptor_state.term;
            resp.vote_given = true;
            resp.vote_given = true as u64;
        }
        info!("processed {:?}: sending {:?}", msg, &resp);
        info!("processed VoteRequest for term {}: {:?}", msg.term, &resp);
        Ok(Some(AcceptorProposerMessage::VoteResponse(resp)))
    }

    /// Form AppendResponse from current state.
    fn append_response(&self) -> AppendResponse {
        let ar = AppendResponse {
            generation: self.state.mconf.generation,
            term: self.state.acceptor_state.term,
            flush_lsn: self.flush_lsn(),
            commit_lsn: self.state.commit_lsn,
@@ -1167,22 +805,18 @@ where
        // Here we learn the initial LSN for the first time; set the fields
        // that depend on it.

        if let Some(start_lsn) = msg.term_history.0.first() {
            if state.timeline_start_lsn == Lsn(0) {
                // Remember point where WAL begins globally. In the future it
                // will be initialized immediately on timeline creation.
                state.timeline_start_lsn = start_lsn.lsn;
                info!(
                    "setting timeline_start_lsn to {:?}",
                    state.timeline_start_lsn
                );
            }
        if state.timeline_start_lsn == Lsn(0) {
            // Remember point where WAL begins globally.
            state.timeline_start_lsn = msg.timeline_start_lsn;
            info!(
                "setting timeline_start_lsn to {:?}",
                state.timeline_start_lsn
            );
        }

        if state.peer_horizon_lsn == Lsn(0) {
            // Update peer_horizon_lsn as soon as we know where timeline starts.
            // It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn.
            state.peer_horizon_lsn = state.timeline_start_lsn;
            state.peer_horizon_lsn = msg.timeline_start_lsn;
        }
        if state.local_start_lsn == Lsn(0) {
            state.local_start_lsn = msg.start_streaming_at;
@@ -1262,10 +896,7 @@ where

        // If our term is higher, immediately refuse the message.
        if self.state.acceptor_state.term > msg.h.term {
            let resp = AppendResponse::term_only(
                self.state.mconf.generation,
                self.state.acceptor_state.term,
            );
            let resp = AppendResponse::term_only(self.state.acceptor_state.term);
            return Ok(Some(AcceptorProposerMessage::AppendResponse(resp)));
        }

@@ -1293,8 +924,10 @@ where
            );
        }

        // Now we know that we are in the same term as the proposer, process the
        // message.
        // Now we know that we are in the same term as the proposer,
        // processing the message.

        self.state.inmem.proposer_uuid = msg.h.proposer_uuid;

        // do the job
        if !msg.wal_data.is_empty() {
@@ -1464,13 +1097,10 @@ mod tests {
        let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();

        // check voting for 1 is ok
        let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest {
            generation: 0,
            term: 1,
        });
        let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 });
        let mut vote_resp = sk.process_msg(&vote_request).await;
        match vote_resp.unwrap() {
            Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given),
            Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0),
            r => panic!("unexpected response: {:?}", r),
        }

@@ -1485,7 +1115,7 @@ mod tests {
        // and ensure voting second time for 1 is not ok
        vote_resp = sk.process_msg(&vote_request).await;
        match vote_resp.unwrap() {
            Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(!resp.vote_given),
            Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0),
            r => panic!("unexpected response: {:?}", r),
        }
    }
@@ -1500,12 +1130,13 @@ mod tests {
        let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();

        let mut ar_hdr = AppendRequestHeader {
            generation: 0,
            term: 2,
            term_start_lsn: Lsn(3),
            begin_lsn: Lsn(1),
            end_lsn: Lsn(2),
            commit_lsn: Lsn(0),
            truncate_lsn: Lsn(0),
            proposer_uuid: [0; 16],
        };
        let mut append_request = AppendRequest {
            h: ar_hdr.clone(),
@@ -1513,7 +1144,6 @@ mod tests {
        };

        let pem = ProposerElected {
            generation: 0,
            term: 2,
            start_streaming_at: Lsn(1),
            term_history: TermHistory(vec![
@@ -1526,6 +1156,7 @@ mod tests {
                lsn: Lsn(3),
                },
            ]),
            timeline_start_lsn: Lsn(1),
        };
        sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
            .await
@@ -1560,25 +1191,26 @@ mod tests {
        let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();

        let pem = ProposerElected {
            generation: 0,
            term: 1,
            start_streaming_at: Lsn(1),
            term_history: TermHistory(vec![TermLsn {
                term: 1,
                lsn: Lsn(1),
            }]),
            timeline_start_lsn: Lsn(1),
        };
        sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
            .await
            .unwrap();

        let ar_hdr = AppendRequestHeader {
            generation: 0,
            term: 1,
            term_start_lsn: Lsn(3),
            begin_lsn: Lsn(1),
            end_lsn: Lsn(2),
            commit_lsn: Lsn(0),
            truncate_lsn: Lsn(0),
            proposer_uuid: [0; 16],
        };
        let append_request = AppendRequest {
            h: ar_hdr.clone(),

@@ -120,6 +120,20 @@ pub enum InterpretedWalReaderError {
    WalStreamClosed,
}

enum CurrentPositionUpdate {
    Reset(Lsn),
    NotReset(Lsn),
}

impl CurrentPositionUpdate {
    fn current_position(&self) -> Lsn {
        match self {
            CurrentPositionUpdate::Reset(lsn) => *lsn,
            CurrentPositionUpdate::NotReset(lsn) => *lsn,
        }
    }
}

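// Both variants carry the resulting position, so the caller of maybe_reset()
// below can log the cursor and, on Reset, rewind the underlying WAL stream
// without re-acquiring the state lock.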
impl InterpretedWalReaderState {
    fn current_position(&self) -> Option<Lsn> {
        match self {
@@ -129,6 +143,26 @@ impl InterpretedWalReaderState {
            InterpretedWalReaderState::Done => None,
        }
    }

    // Reset the current position of the WAL reader if the requested starting position
    // of the new shard is smaller than the current value.
    fn maybe_reset(&mut self, new_shard_start_pos: Lsn) -> CurrentPositionUpdate {
        match self {
            InterpretedWalReaderState::Running {
                current_position, ..
            } => {
                if new_shard_start_pos < *current_position {
                    *current_position = new_shard_start_pos;
                    CurrentPositionUpdate::Reset(*current_position)
                } else {
                    CurrentPositionUpdate::NotReset(*current_position)
                }
            }
            InterpretedWalReaderState::Done => {
                panic!("maybe_reset called on finished reader")
            }
        }
    }
}

pub(crate) struct AttachShardNotification {
@@ -410,15 +444,24 @@ impl InterpretedWalReader {
        };

        senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos});
        let current_pos = self.state.read().unwrap().current_position().unwrap();
        if start_pos < current_pos {
            self.wal_stream.reset(start_pos).await;
            wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version);
        }

        // If the shard is subscribing below the current position then we need
        // to update the cursor that tracks where we are at in the WAL
        // ([`Self::state`]) and reset the WAL stream itself
        // ([`Self::wal_stream`]). This must be done atomically from the POV of
        // anything outside the select statement.
        let position_reset = self.state.write().unwrap().maybe_reset(start_pos);
        match position_reset {
            CurrentPositionUpdate::Reset(to) => {
                self.wal_stream.reset(to).await;
                wal_decoder = WalStreamDecoder::new(to, self.pg_version);
            },
            CurrentPositionUpdate::NotReset(_) => {}
        };

        tracing::info!(
            "Added shard sender {} with start_pos={} current_pos={}",
            ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos
            ShardSenderId::new(shard_id, new_sender_id), start_pos, position_reset.current_position()
        );
    }
}
@@ -584,7 +627,7 @@ mod tests {
            .unwrap();

        let resident_tli = tli.wal_residence_guard().await.unwrap();
        let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
        let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None)
            .await
            .unwrap();
        let end_pos = end_watch.get();
@@ -715,7 +758,6 @@ mod tests {
        const MSG_COUNT: usize = 200;
        const PG_VERSION: u32 = 17;
        const SHARD_COUNT: u8 = 2;
        const ATTACHED_SHARDS: u8 = 4;

        let start_lsn = Lsn::from_str("0/149FD18").unwrap();
        let env = Env::new(true).unwrap();
@@ -725,9 +767,11 @@ mod tests {
            .unwrap();

        let resident_tli = tli.wal_residence_guard().await.unwrap();
        let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
            .await
            .unwrap();
        let mut next_record_lsns = Vec::default();
        let end_watch =
            Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns))
                .await
                .unwrap();
        let end_pos = end_watch.get();

        let streaming_wal_reader = StreamingWalReader::new(
@@ -746,38 +790,71 @@ mod tests {
        )
        .unwrap();

        let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
        let mut batch_receivers = vec![rx];
        struct Sender {
            tx: Option<tokio::sync::mpsc::Sender<Batch>>,
            rx: tokio::sync::mpsc::Receiver<Batch>,
            shard: ShardIdentity,
            start_lsn: Lsn,
            received_next_record_lsns: Vec<Lsn>,
        }

        impl Sender {
            fn new(start_lsn: Lsn, shard: ShardIdentity) -> Self {
                let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
                Self {
                    tx: Some(tx),
                    rx,
                    shard,
                    start_lsn,
                    received_next_record_lsns: Vec::default(),
                }
            }
        }

        assert!(next_record_lsns.len() > 7);
        let start_lsns = vec![
            next_record_lsns[5],
            next_record_lsns[1],
            next_record_lsns[3],
        ];
        let mut senders = start_lsns
            .into_iter()
            .map(|lsn| Sender::new(lsn, shard_0))
            .collect::<Vec<_>>();

        let first_sender = senders.first_mut().unwrap();
        let handle = InterpretedWalReader::spawn(
            streaming_wal_reader,
            start_lsn,
            tx,
            shard_0,
            first_sender.start_lsn,
            first_sender.tx.take().unwrap(),
            first_sender.shard,
            PG_VERSION,
            &Some("pageserver".to_string()),
        );

        for _ in 0..(ATTACHED_SHARDS - 1) {
            let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
            handle.fanout(shard_0, tx, start_lsn).unwrap();
            batch_receivers.push(rx);
        for sender in senders.iter_mut().skip(1) {
            handle
                .fanout(sender.shard, sender.tx.take().unwrap(), sender.start_lsn)
                .unwrap();
        }

        loop {
            let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap();
            for rx in batch_receivers.iter_mut().skip(1) {
                let other_batch = rx.recv().await.unwrap();

                assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn);
                assert_eq!(
                    batch.available_wal_end_lsn,
                    other_batch.available_wal_end_lsn
        for sender in senders.iter_mut() {
            loop {
                let batch = sender.rx.recv().await.unwrap();
                tracing::info!(
                    "Sender with start_lsn={} received batch ending at {} with {} records",
                    sender.start_lsn,
                    batch.wal_end_lsn,
                    batch.records.records.len()
                );
            }

            if batch.wal_end_lsn == batch.available_wal_end_lsn {
                break;
                for rec in batch.records.records {
                    sender.received_next_record_lsns.push(rec.next_record_lsn);
                }

                if batch.wal_end_lsn == batch.available_wal_end_lsn {
                    break;
                }
            }
        }

@@ -792,5 +869,20 @@ mod tests {
        }

        assert!(done);

        for sender in senders {
            tracing::info!(
                "Validating records received by sender with start_lsn={}",
                sender.start_lsn
            );

            assert!(sender.received_next_record_lsns.is_sorted());
            let expected = next_record_lsns
                .iter()
                .filter(|lsn| **lsn > sender.start_lsn)
                .copied()
                .collect::<Vec<_>>();
            assert_eq!(sender.received_next_record_lsns, expected);
        }
    }
}

@@ -73,10 +73,10 @@ impl Env {
        // Emulate an initial election.
        safekeeper
            .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected {
                generation: 0,
                term: 1,
                start_streaming_at: start_lsn,
                term_history: TermHistory(vec![(1, start_lsn).into()]),
                timeline_start_lsn: start_lsn,
            }))
            .await?;

@@ -122,6 +122,7 @@ impl Env {
        start_lsn: Lsn,
        msg_size: usize,
        msg_count: usize,
        mut next_record_lsns: Option<&mut Vec<Lsn>>,
    ) -> anyhow::Result<EndWatch> {
        let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE);
        let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE);
@@ -130,7 +131,7 @@ impl Env {

        WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0));

        let prefix = c"p";
        let prefix = c"neon-file:";
        let prefixlen = prefix.to_bytes_with_nul().len();
        assert!(msg_size >= prefixlen);
        let message = vec![0; msg_size - prefixlen];
@@ -139,15 +140,19 @@ impl Env {
            &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn);
        for _ in 0..msg_count {
            let (lsn, record) = walgen.next().unwrap();
            if let Some(ref mut lsns) = next_record_lsns {
                lsns.push(lsn);
            }

            let req = AppendRequest {
                h: AppendRequestHeader {
                    generation: 0,
                    term: 1,
                    term_start_lsn: start_lsn,
                    begin_lsn: lsn,
                    end_lsn: lsn + record.len() as u64,
                    commit_lsn: lsn,
                    truncate_lsn: Lsn(0),
                    proposer_uuid: [0; 16],
                },
                wal_data: record,
            };

@@ -246,7 +246,7 @@ mod tests {
            .unwrap();

        let resident_tli = tli.wal_residence_guard().await.unwrap();
        let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
        let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None)
            .await
            .unwrap();
        let end_pos = end_watch.get();

@@ -15,7 +15,9 @@ use desim::{
};
use http::Uri;
use safekeeper::{
    safekeeper::{ProposerAcceptorMessage, SafeKeeper, SK_PROTO_VERSION_3, UNKNOWN_SERVER_VERSION},
    safekeeper::{
        ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION,
    },
    state::{TimelinePersistentState, TimelineState},
    timeline::TimelineError,
    wal_storage::Storage,
@@ -285,7 +287,7 @@ impl ConnState {
            bail!("finished processing START_REPLICATION")
        }

        let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTO_VERSION_3)?;
        let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?;
        debug!("got msg: {:?}", msg);
        self.process(msg, global)
    } else {
@@ -401,7 +403,7 @@ impl ConnState {
        // TODO: if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn

        let mut buf = BytesMut::with_capacity(128);
        reply.serialize(&mut buf, SK_PROTO_VERSION_3)?;
        reply.serialize(&mut buf)?;

        self.tcp.send(AnyMessage::Bytes(buf.into()));
    }

@@ -225,7 +225,7 @@ pub(crate) enum NotifyError {
    // We shutdown while sending
    #[error("Shutting down")]
    ShuttingDown,
    // A response indicates we will never succeed, such as 400 or 404
    // A response indicates we will never succeed, such as 400 or 403
    #[error("Non-retryable error {0}")]
    Fatal(StatusCode),


@@ -27,7 +27,7 @@ use pageserver_api::shard::ShardConfigError;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::ShardStripeSize;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use rustls::client::danger::ServerCertVerifier;
use rustls::client::danger::{ServerCertVerified, ServerCertVerifier};
use rustls::client::WebPkiServerVerifier;
use rustls::crypto::ring;
use scoped_futures::ScopedBoxFuture;
@@ -194,6 +194,8 @@ impl Persistence {
        timeout: Duration,
    ) -> Result<(), diesel::ConnectionError> {
        let started_at = Instant::now();
        log_postgres_connstr_info(database_url)
            .map_err(|e| diesel::ConnectionError::InvalidConnectionUrl(e.to_string()))?;
        loop {
            match establish_connection_rustls(database_url).await {
                Ok(_) => {
@@ -1281,6 +1283,51 @@ pub(crate) fn load_certs() -> anyhow::Result<Arc<rustls::RootCertStore>> {
    Ok(Arc::new(store))
}

#[derive(Debug)]
/// A verifier that accepts all certificates (but logs an error still)
struct AcceptAll(Arc<WebPkiServerVerifier>);
impl ServerCertVerifier for AcceptAll {
    fn verify_server_cert(
        &self,
        end_entity: &rustls::pki_types::CertificateDer<'_>,
        intermediates: &[rustls::pki_types::CertificateDer<'_>],
        server_name: &rustls::pki_types::ServerName<'_>,
        ocsp_response: &[u8],
        now: rustls::pki_types::UnixTime,
    ) -> Result<ServerCertVerified, rustls::Error> {
        let r =
            self.0
                .verify_server_cert(end_entity, intermediates, server_name, ocsp_response, now);
        if let Err(err) = r {
            tracing::info!(
                ?server_name,
                "ignoring db connection TLS validation error: {err:?}"
            );
            return Ok(ServerCertVerified::assertion());
        }
        r
    }
    fn verify_tls12_signature(
        &self,
        message: &[u8],
        cert: &rustls::pki_types::CertificateDer<'_>,
        dss: &rustls::DigitallySignedStruct,
    ) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
        self.0.verify_tls12_signature(message, cert, dss)
    }
    fn verify_tls13_signature(
        &self,
        message: &[u8],
        cert: &rustls::pki_types::CertificateDer<'_>,
        dss: &rustls::DigitallySignedStruct,
    ) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
        self.0.verify_tls13_signature(message, cert, dss)
    }
    fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
        self.0.supported_verify_schemes()
    }
}

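// In short: AcceptAll delegates to the real WebPkiServerVerifier and merely
// downgrades verification failures to log lines. As the code below suggests,
// it is only used when STORCON_DB_CERT_CHECKS is not set in the environment.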
/// Loads the root certificates and constructs a client config suitable for connecting.
/// This function is blocking.
fn client_config_with_root_certs() -> anyhow::Result<rustls::ClientConfig> {
@@ -1290,76 +1337,12 @@ fn client_config_with_root_certs() -> anyhow::Result<rustls::ClientConfig> {
        .expect("ring should support the default protocol versions");
    static DO_CERT_CHECKS: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
    let do_cert_checks =
        DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_CERT_CHECKS").is_ok());
        DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_DB_CERT_CHECKS").is_ok());
    Ok(if *do_cert_checks {
        client_config
            .with_root_certificates(load_certs()?)
            .with_no_client_auth()
    } else {
        use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified};
        #[derive(Debug)]
        struct AcceptAll(Arc<WebPkiServerVerifier>);
        impl ServerCertVerifier for AcceptAll {
            fn verify_server_cert(
                &self,
                end_entity: &rustls::pki_types::CertificateDer<'_>,
                intermediates: &[rustls::pki_types::CertificateDer<'_>],
                server_name: &rustls::pki_types::ServerName<'_>,
                ocsp_response: &[u8],
                now: rustls::pki_types::UnixTime,
            ) -> Result<ServerCertVerified, rustls::Error> {
                let r = self.0.verify_server_cert(
                    end_entity,
                    intermediates,
                    server_name,
                    ocsp_response,
                    now,
                );
                if let Err(err) = r {
                    tracing::info!(
                        ?server_name,
                        "ignoring db connection TLS validation error: {err:?}"
                    );
                    return Ok(ServerCertVerified::assertion());
                }
                r
            }
            fn verify_tls12_signature(
                &self,
                message: &[u8],
                cert: &rustls::pki_types::CertificateDer<'_>,
                dss: &rustls::DigitallySignedStruct,
            ) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error>
            {
                let r = self.0.verify_tls12_signature(message, cert, dss);
                if let Err(err) = r {
                    tracing::info!(
                        "ignoring db connection 1.2 signature TLS validation error: {err:?}"
                    );
                    return Ok(HandshakeSignatureValid::assertion());
                }
                r
            }
            fn verify_tls13_signature(
                &self,
                message: &[u8],
                cert: &rustls::pki_types::CertificateDer<'_>,
                dss: &rustls::DigitallySignedStruct,
            ) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error>
            {
                let r = self.0.verify_tls13_signature(message, cert, dss);
                if let Err(err) = r {
                    tracing::info!(
                        "ignoring db connection 1.3 signature TLS validation error: {err:?}"
                    );
                    return Ok(HandshakeSignatureValid::assertion());
                }
                r
            }
            fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
                self.0.supported_verify_schemes()
            }
        }
        let verifier = AcceptAll(
            WebPkiServerVerifier::builder_with_provider(
                load_certs()?,
@@ -1389,6 +1372,29 @@ fn establish_connection_rustls(config: &str) -> BoxFuture<ConnectionResult<Async
    fut.boxed()
}

#[cfg_attr(test, test)]
fn test_config_debug_censors_password() {
    let has_pw =
        "host=/var/lib/postgresql,localhost port=1234 user=specialuser password='NOT ALLOWED TAG'";
    let has_pw_cfg = has_pw.parse::<tokio_postgres::Config>().unwrap();
    assert!(format!("{has_pw_cfg:?}").contains("specialuser"));
    // Ensure that the password is not leaked by the debug impl
    assert!(!format!("{has_pw_cfg:?}").contains("NOT ALLOWED TAG"));
}

fn log_postgres_connstr_info(config_str: &str) -> anyhow::Result<()> {
    let config = config_str
        .parse::<tokio_postgres::Config>()
        .map_err(|_e| anyhow::anyhow!("Couldn't parse config str"))?;
    // We use debug formatting here, and use a unit test to ensure that we don't leak the password.
    // To make extra sure the test gets run, run it every time the function is called
    // (this is rather cold code, we can afford it).
    #[cfg(not(test))]
    test_config_debug_censors_password();
    tracing::info!("database connection config: {config:?}");
    Ok(())
}

/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
#[derive(
    QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq,

@@ -115,6 +115,15 @@ impl ReconcilerConfigBuilder {
        }
    }

    pub(crate) fn tenant_creation_hint(self, hint: bool) -> Self {
        Self {
            config: ReconcilerConfig {
                tenant_creation_hint: hint,
                ..self.config
            },
        }
    }

    pub(crate) fn build(self) -> ReconcilerConfig {
        self.config
    }
@@ -129,6 +138,10 @@ pub(crate) struct ReconcilerConfig {
    // During live migrations this is the amount of time that
    // the pageserver will hold our poll.
    secondary_download_request_timeout: Option<Duration>,

    // A hint indicating whether this reconciliation is done on the
    // creation of a new tenant. This only informs logging behaviour.
    tenant_creation_hint: bool,
}

impl ReconcilerConfig {
@@ -143,6 +156,10 @@ impl ReconcilerConfig {
        self.secondary_download_request_timeout
            .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT)
    }

    pub(crate) fn tenant_creation_hint(&self) -> bool {
        self.tenant_creation_hint
    }
}

/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O
@@ -934,16 +951,35 @@ impl Reconciler {
            )
            .await;
        if let Err(e) = &result {
            // It is up to the caller whether they want to drop out on this error, but they don't have to:
            // in general we should avoid letting unavailability of the cloud control plane stop us from
            // making progress.
            if !matches!(e, NotifyError::ShuttingDown) {
                tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
            }

            // Set this flag so that in our ReconcileResult we will set the flag on the shard that it
            // needs to retry at some point.
            self.compute_notify_failure = true;

            // It is up to the caller whether they want to drop out on this error, but they don't have to:
            // in general we should avoid letting unavailability of the cloud control plane stop us from
            // making progress.
            match e {
                // 404s from cplane during tenant creation are expected.
                // Cplane only persists the shards to the database after
                // creating the tenant and the timeline. If we notify before
                // that, we'll get a 404.
                //
                // This is fine because tenant creations happen via /location_config
                // and that returns the list of locations in the response. Hence, we
                // silence the error and return Ok(()) here. Reconciliation will still
                // be retried because we set [`Reconciler::compute_notify_failure`] above.
                NotifyError::Unexpected(hyper::StatusCode::NOT_FOUND)
                    if self.reconciler_config.tenant_creation_hint() =>
                {
                    return Ok(());
                }
                NotifyError::ShuttingDown => {}
                _ => {
                    tracing::warn!(
                        "Failed to notify compute of attached pageserver {node}: {e}"
                    );
                }
            }
        }
        result
    } else {

@@ -2238,9 +2238,14 @@ impl Service {
        let waiters = {
            let mut locked = self.inner.write().unwrap();
            let (nodes, tenants, _scheduler) = locked.parts_mut();
            let config = ReconcilerConfigBuilder::new()
                .tenant_creation_hint(true)
                .build();
            tenants
                .range_mut(TenantShardId::tenant_range(tenant_id))
                .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes))
                .filter_map(|(_shard_id, shard)| {
                    self.maybe_configured_reconcile_shard(shard, nodes, config)
                })
                .collect::<Vec<_>>()
        };

@@ -707,6 +707,7 @@ impl TenantShard {
        if let Some(node_id) = self.intent.get_attached() {
            // Populate secondary by demoting the attached node
            self.intent.demote_attached(scheduler, *node_id);

            modified = true;
        } else if self.intent.secondary.is_empty() {
            // Populate secondary by scheduling a fresh node
@@ -979,24 +980,51 @@ impl TenantShard {
                ),
            )
        })
        .collect::<Vec<_>>();
        .collect::<HashMap<_, _>>();

        if secondary_scores.iter().any(|score| score.1.is_none()) {
            // Don't have full list of scores, so can't make a good decision about which to drop unless
            // there is an obvious one in the wrong AZ
            for secondary in self.intent.get_secondary() {
                if scheduler.get_node_az(secondary) == self.intent.preferred_az_id {
            // Trivial case: if we only have one secondary, drop that one
            if self.intent.get_secondary().len() == 1 {
                return Some(ScheduleOptimization {
                    sequence: self.sequence,
                    action: ScheduleOptimizationAction::RemoveSecondary(
                        *self.intent.get_secondary().first().unwrap(),
                    ),
                });
            }

            // Try to find a "good" secondary to keep, without relying on scores (one or more nodes is in a state
            // where its score can't be calculated), and drop the others. This enables us to make progress in
            // most cases, even if some nodes are offline or have scheduling=pause set.

            debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this
            // logic presumes we are in a mode where we want secondaries to be in non-home AZ
            if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| {
                let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id;
                let is_available = secondary_scores
                    .get(n)
                    .expect("Built from same list of nodes")
                    .is_some();
                is_available && !in_home_az
            }) {
                // Great, we found one to retain. Pick some other to drop.
                if let Some(victim) = self
                    .intent
                    .get_secondary()
                    .iter()
                    .find(|n| n != &retain_secondary)
                {
                    return Some(ScheduleOptimization {
                        sequence: self.sequence,
                        action: ScheduleOptimizationAction::RemoveSecondary(*secondary),
                        action: ScheduleOptimizationAction::RemoveSecondary(*victim),
                    });
                }
            }

            // Fall through: we didn't identify one to remove. This ought to be rare.
            tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)",
                self.intent.get_secondary()
            );
                self.intent.get_secondary()
            );
        } else {
            let victim = secondary_scores
                .iter()
@@ -1005,7 +1033,7 @@ impl TenantShard {
                .0;
            return Some(ScheduleOptimization {
                sequence: self.sequence,
                action: ScheduleOptimizationAction::RemoveSecondary(victim),
                action: ScheduleOptimizationAction::RemoveSecondary(*victim),
            });
        }
    }
@@ -2379,6 +2407,110 @@ pub(crate) mod tests {
        Ok(())
    }

    /// Test how the optimisation code behaves with an extra secondary
    #[test]
    fn optimize_removes_secondary() -> anyhow::Result<()> {
        let az_a_tag = AvailabilityZone("az-a".to_string());
        let az_b_tag = AvailabilityZone("az-b".to_string());
        let mut nodes = make_test_nodes(
            4,
            &[
                az_a_tag.clone(),
                az_b_tag.clone(),
                az_a_tag.clone(),
                az_b_tag.clone(),
            ],
        );
        let mut scheduler = Scheduler::new(nodes.values());

        let mut schedule_context = ScheduleContext::default();

        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
        shard_a.intent.preferred_az_id = Some(az_a_tag.clone());
        shard_a
            .schedule(&mut scheduler, &mut schedule_context)
            .unwrap();

        // Attached on node 1, secondary on node 2
        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(2)]);

        // Initially optimiser is idle
        assert_eq!(
            shard_a.optimize_attachment(&mut scheduler, &schedule_context),
            None
        );
        assert_eq!(
            shard_a.optimize_secondary(&mut scheduler, &schedule_context),
            None
        );

        // A spare secondary in the home AZ: it should be removed -- this is the situation when we're midway through a graceful migration, after cutting over
        // to our new location
        shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
        let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
        assert_eq!(
            optimization,
            Some(ScheduleOptimization {
                sequence: shard_a.sequence,
                action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3))
            })
        );
        shard_a.apply_optimization(&mut scheduler, optimization.unwrap());

        // A spare secondary in the non-home AZ, and one of them is offline
        shard_a.intent.push_secondary(&mut scheduler, NodeId(4));
        nodes
            .get_mut(&NodeId(4))
            .unwrap()
            .set_availability(NodeAvailability::Offline);
        scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
        let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
        assert_eq!(
            optimization,
            Some(ScheduleOptimization {
                sequence: shard_a.sequence,
                action: ScheduleOptimizationAction::RemoveSecondary(NodeId(4))
            })
        );
        shard_a.apply_optimization(&mut scheduler, optimization.unwrap());

        // A spare secondary when we should have none
        shard_a.policy = PlacementPolicy::Attached(0);
        let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
        assert_eq!(
            optimization,
            Some(ScheduleOptimization {
                sequence: shard_a.sequence,
                action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2))
            })
        );
        shard_a.apply_optimization(&mut scheduler, optimization.unwrap());
        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
        assert_eq!(shard_a.intent.get_secondary(), &vec![]);

        // Check that in secondary mode, we preserve the secondary in the preferred AZ
        let mut schedule_context = ScheduleContext::default(); // Fresh context, we're about to call schedule()
        shard_a.policy = PlacementPolicy::Secondary;
        shard_a
            .schedule(&mut scheduler, &mut schedule_context)
            .unwrap();
        assert_eq!(shard_a.intent.get_attached(), &None);
        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
        assert_eq!(
            shard_a.optimize_attachment(&mut scheduler, &schedule_context),
            None
        );
        assert_eq!(
            shard_a.optimize_secondary(&mut scheduler, &schedule_context),
            None
        );

        shard_a.intent.clear(&mut scheduler);

        Ok(())
    }

    // Optimize til quiescent: this emulates what Service::optimize_all does, when
    // called repeatedly in the background.
    // Returns the applied optimizations

@@ -2766,6 +2766,11 @@ class NeonPageserver(PgProtocol, LogUtils):
            log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}")
            raise

    def heatmap_content(self, tenant_shard_id: TenantId | TenantShardId) -> Any:
        path = self.tenant_dir(tenant_shard_id) / "heatmap-v1.json"
        with open(path) as f:
            return json.load(f)

    def tenant_create(
        self,
        tenant_id: TenantId,

@@ -34,16 +34,20 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
    cur.execute("set log_statement = 'all'")
    cur.execute("create table t(x integer)")
    for _ in range(n_iters):
        cur.execute(f"insert into t values (generate_series(1,{n_records}))")
        with zenbenchmark.record_duration(f"insert into t values (generate_series(1,{n_records}))"):
            cur.execute(f"insert into t values (generate_series(1,{n_records}))")
        time.sleep(1)

    cur.execute("vacuum t")
    with zenbenchmark.record_duration("vacuum t"):
        cur.execute("vacuum t")

    with zenbenchmark.record_duration("test_query"):
    with zenbenchmark.record_duration("SELECT count(*) from t"):
        cur.execute("SELECT count(*) from t")
        assert cur.fetchone() == (n_iters * n_records,)

    flush_ep_to_pageserver(env, endpoint, tenant, timeline)
    env.pageserver.http_client().timeline_checkpoint(
        tenant, timeline, compact=False, wait_until_uploaded=True
    )
    with zenbenchmark.record_duration("flush_ep_to_pageserver"):
        flush_ep_to_pageserver(env, endpoint, tenant, timeline)
    with zenbenchmark.record_duration("timeline_checkpoint"):
        env.pageserver.http_client().timeline_checkpoint(
            tenant, timeline, compact=False, wait_until_uploaded=True
        )

@@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path):
    "LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}",
    "PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")),
    "PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")),
    "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7",
    "PGOPTIONS": "-c idle_in_transaction_session_timeout=0 -c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7",
}
# Combine the current environment with custom variables
env = os.environ.copy()

@@ -29,6 +29,21 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = {
    # "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later
}

PREEMPT_COMPACTION_TENANT_CONF = {
    "gc_period": "5s",
    "compaction_period": "5s",
    # Small checkpoint distance to create many layers
    "checkpoint_distance": 1024**2,
    # Compact small layers
    "compaction_target_size": 1024**2,
    "image_creation_threshold": 1,
    "image_creation_preempt_threshold": 1,
    # compact more frequently
    "compaction_threshold": 3,
    "compaction_upper_limit": 6,
    "lsn_lease_length": "0s",
}


@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
@@ -36,7 +51,8 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = {
    [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_pageserver_compaction_smoke(
    neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol
    neon_env_builder: NeonEnvBuilder,
    wal_receiver_protocol: PageserverWalReceiverProtocol,
):
    """
    This is a smoke test that compaction kicks in. The workload repeatedly churns
@@ -54,7 +70,8 @@ def test_pageserver_compaction_smoke(
    page_cache_size=10
    """

    env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF)
    conf = AGGRESSIVE_COMPACTION_TENANT_CONF.copy()
    env = neon_env_builder.init_start(initial_tenant_conf=conf)

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
@@ -113,6 +130,41 @@ page_cache_size=10
    assert vectored_average < 8


@skip_in_debug_build("only run with release build")
def test_pageserver_compaction_preempt(
    neon_env_builder: NeonEnvBuilder,
):
    # Ideally we should be able to do unit tests for this, but we need real Postgres
    # WALs in order to do unit testing...

    conf = PREEMPT_COMPACTION_TENANT_CONF.copy()
    env = neon_env_builder.init_start(initial_tenant_conf=conf)

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    row_count = 200000
    churn_rounds = 10

    ps_http = env.pageserver.http_client()

    workload = Workload(env, tenant_id, timeline_id)
    workload.init(env.pageserver.id)

    log.info("Writing initial data ...")
    workload.write_rows(row_count, env.pageserver.id)

    for i in range(1, churn_rounds + 1):
        log.info(f"Running churn round {i}/{churn_rounds} ...")
        workload.churn_rows(row_count, env.pageserver.id, upload=False)
        workload.validate(env.pageserver.id)
        ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True)
    log.info("Validating at workload end ...")
    workload.validate(env.pageserver.id)
    # ensure image layer creation gets preempted and then resumed
    env.pageserver.assert_log_contains("resuming image layer creation")


@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
    "with_branches",
@@ -250,6 +302,9 @@ def test_pageserver_gc_compaction_idempotent(
    workload.churn_rows(row_count, env.pageserver.id)
    # compact 3 times if mode is before_restart
    n_compactions = 3 if compaction_mode == "before_restart" else 1
    ps_http.timeline_compact(
        tenant_id, timeline_id, force_l0_compaction=True, wait_until_uploaded=True
    )
    for _ in range(n_compactions):
        # Force refresh gc info to have gc_cutoff generated
        ps_http.timeline_gc(tenant_id, timeline_id, None)

@@ -95,6 +95,8 @@ def test_remote_extensions(

    # mock remote_extensions spec
    spec: dict[str, Any] = {
        "public_extensions": ["anon"],
        "custom_extensions": None,
        "library_index": {
            "anon": "anon",
        },

@@ -6,14 +6,9 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
from fixtures.pageserver.http import PageserverHttpClient


def check_tenant(
    env: NeonEnv, pageserver_http: PageserverHttpClient, safekeeper_proto_version: int
):
def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient):
    tenant_id, timeline_id = env.create_tenant()
    config_lines = [
        f"neon.safekeeper_proto_version = {safekeeper_proto_version}",
    ]
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines)
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
    # we rely upon autocommit after each statement
    res_1 = endpoint.safe_psql_many(
        queries=[
@@ -38,14 +33,7 @@ def check_tenant(


@pytest.mark.parametrize("num_timelines,num_safekeepers", [(3, 1)])
# Test both proto versions until we fully migrate.
@pytest.mark.parametrize("safekeeper_proto_version", [2, 3])
def test_normal_work(
    neon_env_builder: NeonEnvBuilder,
    num_timelines: int,
    num_safekeepers: int,
    safekeeper_proto_version: int,
):
def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int):
    """
    Basic test:
    * create new tenant with a timeline
@@ -64,4 +52,4 @@ def test_normal_work(
    pageserver_http = env.pageserver.http_client()

    for _ in range(num_timelines):
        check_tenant(env, pageserver_http, safekeeper_proto_version)
        check_tenant(env, pageserver_http)

@@ -443,7 +443,7 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
    workload.write_rows(256, env.pageservers[0].id)
    env.pageserver.http_client().tenant_heatmap_upload(tenant_id)

    def validate_heatmap(heatmap):
    def validate_heatmap(heatmap, on_disk_heatmap):
        assert len(heatmap["timelines"]) == 1
        assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id)
        assert len(heatmap["timelines"][0]["layers"]) > 0
@@ -452,10 +452,13 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
        # Each layer appears at most once
        assert len(set(layer["name"] for layer in layers)) == len(layers)

        assert heatmap == on_disk_heatmap

    # Download and inspect the heatmap that the pageserver uploaded
    heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id)
    heatmap_first_on_disk = env.pageserver.heatmap_content(tenant_id)
    log.info(f"Read back heatmap: {heatmap_first}")
    validate_heatmap(heatmap_first)
    validate_heatmap(heatmap_first, heatmap_first_on_disk)

    # Do some more I/O to generate more layers
    workload.churn_rows(64, env.pageservers[0].id)
@@ -463,9 +466,10 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):

    # Ensure that another heatmap upload includes the new layers
    heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id)
    heatmap_second_on_disk = env.pageserver.heatmap_content(tenant_id)
    log.info(f"Read back heatmap: {heatmap_second}")
    assert heatmap_second != heatmap_first
    validate_heatmap(heatmap_second)
    validate_heatmap(heatmap_second, heatmap_second_on_disk)


def list_elegible_layers(

@@ -538,16 +538,13 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder):
    asyncio.run(run_recovery_uncommitted(env))


async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int):
async def run_wal_truncation(env: NeonEnv):
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    (sk1, sk2, sk3) = env.safekeepers

    config_lines = [
        f"neon.safekeeper_proto_version = {safekeeper_proto_version}",
    ]
    ep = env.endpoints.create_start("main", config_lines=config_lines)
    ep = env.endpoints.create_start("main")
    ep.safe_psql("create table t (key int, value text)")
    ep.safe_psql("insert into t select generate_series(1, 100), 'payload'")

@@ -574,7 +571,6 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int):
    sk2.start()
    ep = env.endpoints.create_start(
        "main",
        config_lines=config_lines,
    )
    ep.safe_psql("insert into t select generate_series(1, 200), 'payload'")

@@ -593,13 +589,11 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int):

# Simple deterministic test creating tail of WAL on safekeeper which is
# truncated when majority without this sk elects walproposer starting earlier.
# Test both proto versions until we fully migrate.
@pytest.mark.parametrize("safekeeper_proto_version", [2, 3])
def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_version: int):
def test_wal_truncation(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    asyncio.run(run_wal_truncation(env, safekeeper_proto_version))
    asyncio.run(run_wal_truncation(env))


async def run_segment_init_failure(env: NeonEnv):

vendor/postgres-v16 (vendored, 2 changes)
Submodule vendor/postgres-v16 updated: 3cf7ce1afa...86d9ea96eb
vendor/postgres-v17 (vendored, 2 changes)
Submodule vendor/postgres-v17 updated: f0ffc8279d...8dfd5a7030
vendor/revisions.json (vendored, 4 changes)
@@ -1,11 +1,11 @@
{
    "v17": [
        "17.2",
        "f0ffc8279dbcbbc439981a4fd001a9687e5d665d"
        "8dfd5a7030d3e8a98b60265ebe045788892ac7f3"
    ],
    "v16": [
        "16.6",
        "3cf7ce1afab75027716d14223f95ddb300754162"
        "86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c"
    ],
    "v15": [
        "15.10",

@@ -92,6 +92,7 @@ tonic = { version = "0.12", default-features = false, features = ["codegen", "pr
tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] }
tracing = { version = "0.1", features = ["log"] }
tracing-core = { version = "0.1" }
tracing-log = { version = "0.2" }
url = { version = "2", features = ["serde"] }
zerocopy = { version = "0.7", features = ["derive", "simd"] }
zeroize = { version = "1", features = ["derive", "serde"] }