mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-21 23:20:40 +00:00
Compare commits
27 Commits
jcsp/no-mo
...
bojan-tmp-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a8fd6266aa | ||
|
|
151605d751 | ||
|
|
eaaa18f6ed | ||
|
|
188f67e1df | ||
|
|
7e805200bb | ||
|
|
c6ca1d76d2 | ||
|
|
94b4e76e13 | ||
|
|
b514da90cb | ||
|
|
7d17f1719f | ||
|
|
41ee75bc71 | ||
|
|
11e523f503 | ||
|
|
b1a1126152 | ||
|
|
a8899e1e0f | ||
|
|
2fbd5ab075 | ||
|
|
702382e99a | ||
|
|
1b53b3e200 | ||
|
|
b332268cec | ||
|
|
76c702219c | ||
|
|
ba856140e7 | ||
|
|
2cf6a47cca | ||
|
|
5a8bcdccb0 | ||
|
|
2c8741a5ed | ||
|
|
893b7bac9a | ||
|
|
66f8f5f1c8 | ||
|
|
3a19da1066 | ||
|
|
572eda44ee | ||
|
|
b1d6af5ebe |
3
.github/workflows/build_and_test.yml
vendored
3
.github/workflows/build_and_test.yml
vendored
@@ -320,6 +320,9 @@ jobs:
|
||||
- name: Build neon extensions
|
||||
run: mold -run make neon-pg-ext -j$(nproc)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: mold -run make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
18
.github/workflows/neon_extra_builds.yml
vendored
18
.github/workflows/neon_extra_builds.yml
vendored
@@ -32,7 +32,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -90,18 +90,21 @@ jobs:
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
run: make postgres-v14 -j$(nproc)
|
||||
run: make postgres-v14 -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Build postgres v15
|
||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
||||
run: make postgres-v15 -j$(nproc)
|
||||
run: make postgres-v15 -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Build postgres v16
|
||||
if: steps.cache_pg_16.outputs.cache-hit != 'true'
|
||||
run: make postgres-v16 -j$(nproc)
|
||||
run: make postgres-v16 -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Build neon extensions
|
||||
run: make neon-pg-ext -j$(nproc)
|
||||
run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: make walproposer-lib -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Run cargo build
|
||||
run: cargo build --all --release
|
||||
@@ -126,7 +129,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -135,6 +138,9 @@ jobs:
|
||||
- name: Get postgres headers
|
||||
run: make postgres-headers -j$(nproc)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Produce the build stats
|
||||
run: cargo build --all --release --timings
|
||||
|
||||
|
||||
31
Cargo.lock
generated
31
Cargo.lock
generated
@@ -285,7 +285,7 @@ dependencies = [
|
||||
"log",
|
||||
"parking",
|
||||
"polling",
|
||||
"rustix 0.37.19",
|
||||
"rustix 0.37.25",
|
||||
"slab",
|
||||
"socket2 0.4.9",
|
||||
"waker-fn",
|
||||
@@ -2582,7 +2582,7 @@ checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"io-lifetimes",
|
||||
"rustix 0.37.19",
|
||||
"rustix 0.37.25",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
@@ -3561,7 +3561,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d#a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -3574,7 +3574,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d#a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
@@ -3585,7 +3585,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d#a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -3603,7 +3603,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d#a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4331,9 +4331,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.37.19"
|
||||
version = "0.37.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d"
|
||||
checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"errno",
|
||||
@@ -5174,7 +5174,7 @@ dependencies = [
|
||||
"cfg-if",
|
||||
"fastrand 1.9.0",
|
||||
"redox_syscall 0.3.5",
|
||||
"rustix 0.37.19",
|
||||
"rustix 0.37.25",
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
||||
@@ -5407,7 +5407,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d#a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -6092,6 +6092,17 @@ dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "walproposer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bindgen",
|
||||
"postgres_ffi",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "want"
|
||||
version = "0.3.0"
|
||||
|
||||
14
Cargo.toml
14
Cargo.toml
@@ -26,6 +26,7 @@ members = [
|
||||
"libs/tracing-utils",
|
||||
"libs/postgres_ffi/wal_craft",
|
||||
"libs/vm_monitor",
|
||||
"libs/walproposer",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -160,11 +161,11 @@ env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
|
||||
## Other git libraries
|
||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||
@@ -185,6 +186,7 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
||||
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
||||
utils = { version = "0.1", path = "./libs/utils/" }
|
||||
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
|
||||
walproposer = { version = "0.1", path = "./libs/walproposer/" }
|
||||
|
||||
## Common library dependency
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
@@ -200,7 +202,7 @@ tonic-build = "0.9"
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
38
Makefile
38
Makefile
@@ -62,7 +62,7 @@ all: neon postgres neon-pg-ext
|
||||
#
|
||||
# The 'postgres_ffi' depends on the Postgres headers.
|
||||
.PHONY: neon
|
||||
neon: postgres-headers
|
||||
neon: postgres-headers walproposer-lib
|
||||
+@echo "Compiling Neon"
|
||||
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
|
||||
|
||||
@@ -168,6 +168,42 @@ neon-pg-ext-clean-%:
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
|
||||
|
||||
# Build walproposer as a static library. walproposer source code is located
|
||||
# in the pgxn/neon directory.
|
||||
#
|
||||
# We also need to include libpgport.a and libpgcommon.a, because walproposer
|
||||
# uses some functions from those libraries.
|
||||
#
|
||||
# Some object files are removed from libpgport.a and libpgcommon.a because
|
||||
# they depend on openssl and other libraries that are not included in our
|
||||
# Rust build.
|
||||
.PHONY: walproposer-lib
|
||||
walproposer-lib: neon-pg-ext-v16
|
||||
+@echo "Compiling walproposer-lib"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
|
||||
cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
|
||||
pg_strong_random.o
|
||||
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
|
||||
pg_crc32c.o \
|
||||
hmac_openssl.o \
|
||||
cryptohash_openssl.o \
|
||||
scram-common.o \
|
||||
md5_common.o \
|
||||
checksum_helper.o
|
||||
endif
|
||||
|
||||
.PHONY: walproposer-lib-clean
|
||||
walproposer-lib-clean:
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
|
||||
.PHONY: neon-pg-ext
|
||||
neon-pg-ext: \
|
||||
neon-pg-ext-v14 \
|
||||
|
||||
@@ -302,7 +302,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
}
|
||||
RoleAction::Create => {
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
|
||||
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
info!("role create query: '{}'", &query);
|
||||
|
||||
@@ -36,7 +36,7 @@ use utils::pid_file::{self, PidFileRead};
|
||||
// it's waiting. If the process hasn't started/stopped after 5 seconds,
|
||||
// it prints a notice that it's taking long, but keeps waiting.
|
||||
//
|
||||
const RETRY_UNTIL_SECS: u64 = 10;
|
||||
const RETRY_UNTIL_SECS: u64 = 10000;
|
||||
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
|
||||
const RETRY_INTERVAL_MILLIS: u64 = 100;
|
||||
const DOT_EVERY_RETRIES: u64 = 10;
|
||||
|
||||
@@ -13,6 +13,7 @@ use serde::{Deserialize, Serialize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use utils::logging::{self, LogFormat};
|
||||
use utils::signals::{ShutdownSignals, Signal};
|
||||
|
||||
use utils::{
|
||||
http::{
|
||||
@@ -268,7 +269,16 @@ async fn main() -> anyhow::Result<()> {
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
|
||||
|
||||
tracing::info!("Serving on {0}", args.listen);
|
||||
server.await?;
|
||||
|
||||
tokio::task::spawn(server);
|
||||
|
||||
ShutdownSignals::handle(|signal| match signal {
|
||||
Signal::Interrupt | Signal::Terminate | Signal::Quit => {
|
||||
tracing::info!("Got {}. Terminating", signal.name());
|
||||
// We're just a test helper: no graceful shutdown.
|
||||
std::process::exit(0);
|
||||
}
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -110,6 +110,7 @@ impl TenantState {
|
||||
// So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
|
||||
Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
|
||||
// tenant mgr startup distinguishes attaching from loading via marker file.
|
||||
// If it's loading, there is no attach marker file, i.e., attach had finished in the past.
|
||||
Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
|
||||
// We only reach Active after successful load / attach.
|
||||
// So, call attachment status Attached.
|
||||
|
||||
@@ -23,8 +23,8 @@ use tracing::debug;
|
||||
|
||||
use crate::s3_bucket::RequestKind;
|
||||
use crate::{
|
||||
AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
|
||||
RemoteStorage, StorageMetadata,
|
||||
AzureConfig, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage,
|
||||
StorageMetadata,
|
||||
};
|
||||
|
||||
pub struct AzureBlobStorage {
|
||||
@@ -121,22 +121,7 @@ impl AzureBlobStorage {
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
let mut buf = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = match part {
|
||||
Ok(l) => l,
|
||||
Err(e) => {
|
||||
return Err(if let Some(http_err) = e.as_http_error() {
|
||||
match http_err.status() {
|
||||
StatusCode::NotFound => DownloadError::NotFound,
|
||||
StatusCode::BadRequest => {
|
||||
DownloadError::BadInput(anyhow::Error::new(e))
|
||||
}
|
||||
_ => DownloadError::Other(anyhow::Error::new(e)),
|
||||
}
|
||||
} else {
|
||||
DownloadError::Other(e.into())
|
||||
});
|
||||
}
|
||||
};
|
||||
let part = part.map_err(to_download_error)?;
|
||||
let data = part
|
||||
.data
|
||||
.collect()
|
||||
@@ -157,30 +142,16 @@ impl AzureBlobStorage {
|
||||
) -> Result<StorageMetadata, DownloadError> {
|
||||
let builder = blob_client.get_metadata();
|
||||
|
||||
match builder.into_future().await {
|
||||
Ok(r) => {
|
||||
let mut map = HashMap::new();
|
||||
let response = builder.into_future().await.map_err(to_download_error)?;
|
||||
let mut map = HashMap::new();
|
||||
|
||||
for md in r.metadata.iter() {
|
||||
map.insert(
|
||||
md.name().as_str().to_string(),
|
||||
md.value().as_str().to_string(),
|
||||
);
|
||||
}
|
||||
Ok(StorageMetadata(map))
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(if let Some(http_err) = e.as_http_error() {
|
||||
match http_err.status() {
|
||||
StatusCode::NotFound => DownloadError::NotFound,
|
||||
StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(e)),
|
||||
_ => DownloadError::Other(anyhow::Error::new(e)),
|
||||
}
|
||||
} else {
|
||||
DownloadError::Other(e.into())
|
||||
});
|
||||
}
|
||||
for md in response.metadata.iter() {
|
||||
map.insert(
|
||||
md.name().as_str().to_string(),
|
||||
md.value().as_str().to_string(),
|
||||
);
|
||||
}
|
||||
Ok(StorageMetadata(map))
|
||||
}
|
||||
|
||||
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
|
||||
@@ -199,13 +170,24 @@ fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
|
||||
res
|
||||
}
|
||||
|
||||
fn to_download_error(error: azure_core::Error) -> DownloadError {
|
||||
if let Some(http_err) = error.as_http_error() {
|
||||
match http_err.status() {
|
||||
StatusCode::NotFound => DownloadError::NotFound,
|
||||
StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
|
||||
_ => DownloadError::Other(anyhow::Error::new(error)),
|
||||
}
|
||||
} else {
|
||||
DownloadError::Other(error.into())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for AzureBlobStorage {
|
||||
async fn list(
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
) -> anyhow::Result<Listing, DownloadError> {
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_name(p))
|
||||
@@ -213,19 +195,16 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
});
|
||||
|
||||
let mut builder = self.client.list_blobs();
|
||||
|
||||
if let ListingMode::WithDelimiter = mode {
|
||||
builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
let mut builder = self
|
||||
.client
|
||||
.list_blobs()
|
||||
.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
|
||||
if let Some(prefix) = list_prefix {
|
||||
builder = builder.prefix(Cow::from(prefix.to_owned()));
|
||||
@@ -236,39 +215,46 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
}
|
||||
|
||||
let mut response = builder.into_stream();
|
||||
let mut res = Listing::default();
|
||||
while let Some(l) = response.next().await {
|
||||
let entry = match l {
|
||||
Ok(l) => l,
|
||||
Err(e) => {
|
||||
return Err(if let Some(http_err) = e.as_http_error() {
|
||||
match http_err.status() {
|
||||
StatusCode::NotFound => DownloadError::NotFound,
|
||||
StatusCode::BadRequest => {
|
||||
DownloadError::BadInput(anyhow::Error::new(e))
|
||||
}
|
||||
_ => DownloadError::Other(anyhow::Error::new(e)),
|
||||
}
|
||||
} else {
|
||||
DownloadError::Other(e.into())
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
let prefix_iter = entry
|
||||
let mut res = Vec::new();
|
||||
while let Some(entry) = response.next().await {
|
||||
let entry = entry.map_err(to_download_error)?;
|
||||
let name_iter = entry
|
||||
.blobs
|
||||
.prefixes()
|
||||
.map(|prefix| self.name_to_relative_path(&prefix.name));
|
||||
res.prefixes.extend(prefix_iter);
|
||||
|
||||
let blob_iter = entry
|
||||
.blobs
|
||||
.blobs()
|
||||
.map(|k| self.name_to_relative_path(&k.name));
|
||||
res.keys.extend(blob_iter);
|
||||
res.extend(name_iter);
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let folder_name = folder
|
||||
.map(|p| self.relative_path_to_name(p))
|
||||
.or_else(|| self.prefix_in_container.clone());
|
||||
|
||||
let mut builder = self.client.list_blobs();
|
||||
|
||||
if let Some(folder_name) = folder_name {
|
||||
builder = builder.prefix(Cow::from(folder_name.to_owned()));
|
||||
}
|
||||
|
||||
if let Some(limit) = self.max_keys_per_list_response {
|
||||
builder = builder.max_results(MaxResults::new(limit));
|
||||
}
|
||||
|
||||
let mut response = builder.into_stream();
|
||||
let mut res = Vec::new();
|
||||
while let Some(l) = response.next().await {
|
||||
let entry = l.map_err(anyhow::Error::new)?;
|
||||
let name_iter = entry
|
||||
.blobs
|
||||
.blobs()
|
||||
.map(|bl| self.name_to_relative_path(&bl.name));
|
||||
res.extend(name_iter);
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
|
||||
|
||||
@@ -129,22 +129,6 @@ impl RemotePath {
|
||||
}
|
||||
}
|
||||
|
||||
/// We don't need callers to be able to pass arbitrary delimiters: just control
|
||||
/// whether listings will use a '/' separator or not.
|
||||
///
|
||||
/// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
|
||||
/// NoDelimiter mode will only populate `keys`.
|
||||
pub enum ListingMode {
|
||||
WithDelimiter,
|
||||
NoDelimiter,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Listing {
|
||||
pub prefixes: Vec<RemotePath>,
|
||||
pub keys: Vec<RemotePath>,
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
@@ -157,13 +141,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let result = self
|
||||
.list(prefix, ListingMode::WithDelimiter)
|
||||
.await?
|
||||
.prefixes;
|
||||
Ok(result)
|
||||
}
|
||||
) -> Result<Vec<RemotePath>, DownloadError>;
|
||||
|
||||
/// Lists all files in directory "recursively"
|
||||
/// (not really recursively, because AWS has a flat namespace)
|
||||
/// Note: This is subtly different than list_prefixes,
|
||||
@@ -175,21 +154,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// whereas,
|
||||
/// list_prefixes("foo/bar/") = ["cat", "dog"]
|
||||
/// See `test_real_s3.rs` for more details.
|
||||
async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
_mode: ListingMode,
|
||||
) -> anyhow::Result<Listing, DownloadError>; /* {
|
||||
// XXX Placeholder impl.
|
||||
let mut result = Listing::default();
|
||||
result.prefixes = self.list_prefixes(prefix).await?;
|
||||
Ok(result)
|
||||
}*/
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
|
||||
|
||||
/// Streams the local file contents into the remote storage entry.
|
||||
async fn upload(
|
||||
@@ -240,9 +205,6 @@ pub enum DownloadError {
|
||||
BadInput(anyhow::Error),
|
||||
/// The file was not found in the remote storage.
|
||||
NotFound,
|
||||
/// A cancellation token aborted the download, typically during
|
||||
/// tenant detach or process shutdown.
|
||||
Cancelled,
|
||||
/// The file was found in the remote storage, but the download failed.
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
@@ -253,7 +215,6 @@ impl std::fmt::Display for DownloadError {
|
||||
DownloadError::BadInput(e) => {
|
||||
write!(f, "Failed to download a remote file due to user input: {e}")
|
||||
}
|
||||
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
|
||||
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
|
||||
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
|
||||
}
|
||||
@@ -273,19 +234,6 @@ pub enum GenericRemoteStorage {
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
pub async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
) -> anyhow::Result<Listing, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list(prefix, mode).await,
|
||||
Self::AwsS3(s) => s.list(prefix, mode).await,
|
||||
Self::AzureBlob(s) => s.list(prefix, mode).await,
|
||||
Self::Unreliable(s) => s.list(prefix, mode).await,
|
||||
}
|
||||
}
|
||||
|
||||
// A function for listing all the files in a "directory"
|
||||
// Example:
|
||||
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
||||
|
||||
@@ -15,7 +15,7 @@ use tokio::{
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};
|
||||
use crate::{Download, DownloadError, RemotePath};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
|
||||
@@ -75,7 +75,7 @@ impl LocalFs {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
Ok(get_all_files(&self.storage_root, true)
|
||||
.await?
|
||||
.into_iter()
|
||||
@@ -89,10 +89,52 @@ impl LocalFs {
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
|
||||
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
prefixes.push(
|
||||
prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
Ok(prefixes)
|
||||
}
|
||||
|
||||
// recursively lists all files in a directory,
|
||||
// mirroring the `list_files` for `s3_bucket`
|
||||
async fn list_recursive(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let full_path = match folder {
|
||||
Some(folder) => folder.with_base(&self.storage_root),
|
||||
None => self.storage_root.clone(),
|
||||
@@ -144,61 +186,6 @@ impl LocalFs {
|
||||
|
||||
Ok(files)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
let mut result = Listing::default();
|
||||
|
||||
if let ListingMode::NoDelimiter = mode {
|
||||
result.keys = self
|
||||
.list_recursive(prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let stripped = prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
);
|
||||
|
||||
if prefix.is_dir() {
|
||||
result.prefixes.push(stripped);
|
||||
} else {
|
||||
result.keys.push(stripped)
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
@@ -492,7 +479,7 @@ mod fs_tests {
|
||||
|
||||
let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
|
||||
assert_eq!(
|
||||
storage.list_all().await?,
|
||||
storage.list().await?,
|
||||
vec![target_path_1.clone()],
|
||||
"Should list a single file after first upload"
|
||||
);
|
||||
@@ -680,7 +667,7 @@ mod fs_tests {
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
|
||||
storage.delete(&upload_target).await?;
|
||||
assert!(storage.list_all().await?.is_empty());
|
||||
assert!(storage.list().await?.is_empty());
|
||||
|
||||
storage
|
||||
.delete(&upload_target)
|
||||
@@ -790,7 +777,7 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let mut files = storage.list_all().await?;
|
||||
let mut files = storage.list().await?;
|
||||
files.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
@@ -30,8 +30,8 @@ use tracing::debug;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, S3Config,
|
||||
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
pub(super) mod metrics;
|
||||
@@ -299,13 +299,13 @@ impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
async fn list(
|
||||
/// See the doc for `RemoteStorage::list_prefixes`
|
||||
/// Note: it won't include empty "directories"
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let kind = RequestKind::List;
|
||||
let mut result = Listing::default();
|
||||
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
@@ -314,33 +314,28 @@ impl RemoteStorage for S3Bucket {
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
});
|
||||
|
||||
let mut document_keys = Vec::new();
|
||||
|
||||
let mut continuation_token = None;
|
||||
|
||||
loop {
|
||||
let _guard = self.permit(kind).await;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let mut request = self
|
||||
let fetch_response = self
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(list_prefix.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.set_max_keys(self.max_keys_per_list_response);
|
||||
|
||||
if let ListingMode::WithDelimiter = mode {
|
||||
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
|
||||
let response = request
|
||||
.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.send()
|
||||
.await
|
||||
.context("Failed to list S3 prefixes")
|
||||
@@ -350,35 +345,71 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &response, started_at);
|
||||
.observe_elapsed(kind, &fetch_response, started_at);
|
||||
|
||||
let response = response?;
|
||||
let fetch_response = fetch_response?;
|
||||
|
||||
let keys = response.contents().unwrap_or_default();
|
||||
let empty = Vec::new();
|
||||
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
|
||||
|
||||
tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
|
||||
|
||||
for object in keys {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
let remote_path = self.s3_object_to_relative_path(object_path);
|
||||
result.keys.push(remote_path);
|
||||
}
|
||||
|
||||
result.prefixes.extend(
|
||||
prefixes
|
||||
.iter()
|
||||
document_keys.extend(
|
||||
fetch_response
|
||||
.common_prefixes
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
||||
);
|
||||
|
||||
continuation_token = match response.next_continuation_token {
|
||||
continuation_token = match fetch_response.next_continuation_token {
|
||||
Some(new_token) => Some(new_token),
|
||||
None => break,
|
||||
};
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
Ok(document_keys)
|
||||
}
|
||||
|
||||
/// See the doc for `RemoteStorage::list_files`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let kind = RequestKind::List;
|
||||
|
||||
let folder_name = folder
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
|
||||
// AWS may need to break the response into several parts
|
||||
let mut continuation_token = None;
|
||||
let mut all_files = vec![];
|
||||
loop {
|
||||
let _guard = self.permit(kind).await;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(folder_name.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.send()
|
||||
.await
|
||||
.context("Failed to list files in S3 bucket");
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &response, started_at);
|
||||
|
||||
let response = response?;
|
||||
|
||||
for object in response.contents().unwrap_or_default() {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
let remote_path = self.s3_object_to_relative_path(object_path);
|
||||
all_files.push(remote_path);
|
||||
}
|
||||
match response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
Ok(all_files)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
|
||||
@@ -5,9 +5,7 @@ use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::{
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
|
||||
};
|
||||
use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};
|
||||
|
||||
pub struct UnreliableWrapper {
|
||||
inner: crate::GenericRemoteStorage,
|
||||
@@ -97,15 +95,6 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.inner.list_files(folder).await
|
||||
}
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
|
||||
self.inner.list(prefix, mode).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
|
||||
@@ -1,36 +1,12 @@
|
||||
use std::sync::{atomic::AtomicI32, Arc};
|
||||
use std::sync::Arc;
|
||||
|
||||
use tokio::sync::{mpsc, Mutex};
|
||||
|
||||
/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
|
||||
///
|
||||
/// Can be cloned, moved and kept around in futures as "guard objects".
|
||||
pub struct Completion {
|
||||
sender: mpsc::Sender<()>,
|
||||
refcount: Arc<AtomicI32>,
|
||||
}
|
||||
|
||||
impl Clone for Completion {
|
||||
fn clone(&self) -> Self {
|
||||
let i = self
|
||||
.refcount
|
||||
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
|
||||
tracing::info!("Completion::clone[{:p}]: {i}", &(*self.refcount));
|
||||
Self {
|
||||
sender: self.sender.clone(),
|
||||
refcount: self.refcount.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Completion {
|
||||
fn drop(&mut self) {
|
||||
let i = self
|
||||
.refcount
|
||||
.fetch_sub(1, std::sync::atomic::Ordering::SeqCst);
|
||||
tracing::info!("Completion::drop[{:p}]: {i}", &(*self.refcount));
|
||||
}
|
||||
}
|
||||
#[derive(Clone)]
|
||||
pub struct Completion(mpsc::Sender<()>);
|
||||
|
||||
/// Barrier will wait until all clones of [`Completion`] have been dropped.
|
||||
#[derive(Clone)]
|
||||
@@ -69,11 +45,5 @@ pub fn channel() -> (Completion, Barrier) {
|
||||
let (tx, rx) = mpsc::channel::<()>(1);
|
||||
let rx = Mutex::new(rx);
|
||||
let rx = Arc::new(rx);
|
||||
(
|
||||
Completion {
|
||||
sender: tx,
|
||||
refcount: Arc::new(AtomicI32::new(1)),
|
||||
},
|
||||
Barrier(rx),
|
||||
)
|
||||
(Completion(tx), Barrier(rx))
|
||||
}
|
||||
|
||||
@@ -253,11 +253,22 @@ impl Runner {
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
let (last_time, last_history) = *cgroup.watcher.borrow();
|
||||
|
||||
// NB: The ordering of these conditions is intentional. During startup, we should deny
|
||||
// downscaling until we have enough information to determine that it's safe to do so
|
||||
// (i.e. enough samples have come in). But if it's been a while and we *still* haven't
|
||||
// received any information, we should *fail* instead of just denying downscaling.
|
||||
//
|
||||
// `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()`
|
||||
// serves double-duty: it trips if we haven't received *any* metrics for long enough,
|
||||
// OR if we haven't received metrics *recently enough*.
|
||||
//
|
||||
// TODO: make the duration here configurable.
|
||||
if last_time.elapsed() > Duration::from_secs(5) {
|
||||
bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information");
|
||||
} else if last_history.samples_count <= 1 {
|
||||
bail!("haven't received enough cgroup memory stats yet");
|
||||
let status = "haven't received enough cgroup memory stats yet";
|
||||
info!(status, "discontinuing downscale");
|
||||
return Ok((false, status.to_owned()));
|
||||
}
|
||||
|
||||
let new_threshold = self
|
||||
@@ -505,11 +516,14 @@ impl Runner {
|
||||
Ok(Some(out)) => out,
|
||||
Ok(None) => continue,
|
||||
Err(e) => {
|
||||
let error = e.to_string();
|
||||
warn!(?error, "error handling message");
|
||||
// use {:#} for our logging because the display impl only
|
||||
// gives the outermost cause, and the debug impl
|
||||
// pretty-prints the error, whereas {:#} contains all the
|
||||
// causes, but is compact (no newlines).
|
||||
warn!(error = format!("{e:#}"), "error handling message");
|
||||
OutboundMsg::new(
|
||||
OutboundMsgKind::InternalError {
|
||||
error
|
||||
error: e.to_string(),
|
||||
},
|
||||
message.id
|
||||
)
|
||||
|
||||
16
libs/walproposer/Cargo.toml
Normal file
16
libs/walproposer/Cargo.toml
Normal file
@@ -0,0 +1,16 @@
|
||||
[package]
|
||||
name = "walproposer"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
utils.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
anyhow.workspace = true
|
||||
bindgen.workspace = true
|
||||
1
libs/walproposer/bindgen_deps.h
Normal file
1
libs/walproposer/bindgen_deps.h
Normal file
@@ -0,0 +1 @@
|
||||
#include "walproposer.h"
|
||||
113
libs/walproposer/build.rs
Normal file
113
libs/walproposer/build.rs
Normal file
@@ -0,0 +1,113 @@
|
||||
use std::{env, path::PathBuf, process::Command};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use bindgen::CargoCallbacks;
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
// Tell cargo to invalidate the built crate whenever the wrapper changes
|
||||
println!("cargo:rerun-if-changed=bindgen_deps.h");
|
||||
|
||||
// Finding the location of built libraries and Postgres C headers:
|
||||
// - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/pg_install`
|
||||
// - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/pg_install/{PG_MAJORVERSION}/include/postgresql/server`
|
||||
let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
|
||||
postgres_install_dir.into()
|
||||
} else {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pg_install")
|
||||
};
|
||||
|
||||
let pg_install_abs = std::fs::canonicalize(pg_install_dir)?;
|
||||
let walproposer_lib_dir = pg_install_abs.join("build/walproposer-lib");
|
||||
let walproposer_lib_search_str = walproposer_lib_dir
|
||||
.to_str()
|
||||
.ok_or(anyhow!("Bad non-UTF path"))?;
|
||||
|
||||
let pgxn_neon = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pgxn/neon");
|
||||
let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
|
||||
let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;
|
||||
|
||||
println!("cargo:rustc-link-lib=static=pgport");
|
||||
println!("cargo:rustc-link-lib=static=pgcommon");
|
||||
println!("cargo:rustc-link-lib=static=walproposer");
|
||||
println!("cargo:rustc-link-search={walproposer_lib_search_str}");
|
||||
|
||||
let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
|
||||
let inc_server_path: String = if pg_config_bin.exists() {
|
||||
let output = Command::new(pg_config_bin)
|
||||
.arg("--includedir-server")
|
||||
.output()
|
||||
.context("failed to execute `pg_config --includedir-server`")?;
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("`pg_config --includedir-server` failed")
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout)
|
||||
.context("pg_config output is not UTF-8")?
|
||||
.trim_end()
|
||||
.into()
|
||||
} else {
|
||||
let server_path = pg_install_abs
|
||||
.join("v16")
|
||||
.join("include")
|
||||
.join("postgresql")
|
||||
.join("server")
|
||||
.into_os_string();
|
||||
server_path
|
||||
.into_string()
|
||||
.map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
|
||||
};
|
||||
|
||||
// The bindgen::Builder is the main entry point
|
||||
// to bindgen, and lets you build up options for
|
||||
// the resulting bindings.
|
||||
let bindings = bindgen::Builder::default()
|
||||
// The input header we would like to generate
|
||||
// bindings for.
|
||||
.header("bindgen_deps.h")
|
||||
// Tell cargo to invalidate the built crate whenever any of the
|
||||
// included header files changed.
|
||||
.parse_callbacks(Box::new(CargoCallbacks))
|
||||
.allowlist_type("WalProposer")
|
||||
.allowlist_type("WalProposerConfig")
|
||||
.allowlist_type("walproposer_api")
|
||||
.allowlist_function("WalProposerCreate")
|
||||
.allowlist_function("WalProposerStart")
|
||||
.allowlist_function("WalProposerBroadcast")
|
||||
.allowlist_function("WalProposerPoll")
|
||||
.allowlist_function("WalProposerFree")
|
||||
.allowlist_var("DEBUG5")
|
||||
.allowlist_var("DEBUG4")
|
||||
.allowlist_var("DEBUG3")
|
||||
.allowlist_var("DEBUG2")
|
||||
.allowlist_var("DEBUG1")
|
||||
.allowlist_var("LOG")
|
||||
.allowlist_var("INFO")
|
||||
.allowlist_var("NOTICE")
|
||||
.allowlist_var("WARNING")
|
||||
.allowlist_var("ERROR")
|
||||
.allowlist_var("FATAL")
|
||||
.allowlist_var("PANIC")
|
||||
.allowlist_var("WPEVENT")
|
||||
.allowlist_var("WL_LATCH_SET")
|
||||
.allowlist_var("WL_SOCKET_READABLE")
|
||||
.allowlist_var("WL_SOCKET_WRITEABLE")
|
||||
.allowlist_var("WL_TIMEOUT")
|
||||
.allowlist_var("WL_SOCKET_CLOSED")
|
||||
.allowlist_var("WL_SOCKET_MASK")
|
||||
.clang_arg("-DWALPROPOSER_LIB")
|
||||
.clang_arg(format!("-I{pgxn_neon}"))
|
||||
.clang_arg(format!("-I{inc_server_path}"))
|
||||
// Finish the builder and generate the bindings.
|
||||
.generate()
|
||||
// Unwrap the Result and panic on failure.
|
||||
.expect("Unable to generate bindings");
|
||||
|
||||
// Write the bindings to the $OUT_DIR/bindings.rs file.
|
||||
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs");
|
||||
bindings
|
||||
.write_to_file(out_path)
|
||||
.expect("Couldn't write bindings!");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
455
libs/walproposer/src/api_bindings.rs
Normal file
455
libs/walproposer/src/api_bindings.rs
Normal file
@@ -0,0 +1,455 @@
|
||||
#![allow(dead_code)]
|
||||
|
||||
use std::ffi::CStr;
|
||||
use std::ffi::CString;
|
||||
|
||||
use crate::bindings::uint32;
|
||||
use crate::bindings::walproposer_api;
|
||||
use crate::bindings::PGAsyncReadResult;
|
||||
use crate::bindings::PGAsyncWriteResult;
|
||||
use crate::bindings::Safekeeper;
|
||||
use crate::bindings::Size;
|
||||
use crate::bindings::StringInfoData;
|
||||
use crate::bindings::TimeLineID;
|
||||
use crate::bindings::TimestampTz;
|
||||
use crate::bindings::WalProposer;
|
||||
use crate::bindings::WalProposerConnStatusType;
|
||||
use crate::bindings::WalProposerConnectPollStatusType;
|
||||
use crate::bindings::WalProposerExecStatusType;
|
||||
use crate::bindings::WalproposerShmemState;
|
||||
use crate::bindings::XLogRecPtr;
|
||||
use crate::walproposer::ApiImpl;
|
||||
use crate::walproposer::WaitResult;
|
||||
|
||||
extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).get_shmem_state()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).start_streaming(startpos)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).get_flush_rec_ptr()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).get_current_timestamp()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_error_message(sk: *mut Safekeeper) -> *mut ::std::os::raw::c_char {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
let msg = (*api).conn_error_message(&mut (*sk));
|
||||
let msg = CString::new(msg).unwrap();
|
||||
// TODO: fix leaking error message
|
||||
msg.into_raw()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_status(sk: *mut Safekeeper) -> WalProposerConnStatusType {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_status(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_connect_start(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_connect_start(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_connect_poll(sk: *mut Safekeeper) -> WalProposerConnectPollStatusType {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_connect_poll(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_send_query(sk: *mut Safekeeper, query: *mut ::std::os::raw::c_char) -> bool {
|
||||
let query = unsafe { CStr::from_ptr(query) };
|
||||
let query = query.to_str().unwrap();
|
||||
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_send_query(&mut (*sk), query)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_get_query_result(sk: *mut Safekeeper) -> WalProposerExecStatusType {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_get_query_result(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_flush(sk: *mut Safekeeper) -> ::std::os::raw::c_int {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_flush(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_finish(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_finish(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_async_read(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *mut *mut ::std::os::raw::c_char,
|
||||
amount: *mut ::std::os::raw::c_int,
|
||||
) -> PGAsyncReadResult {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
let (res, result) = (*api).conn_async_read(&mut (*sk));
|
||||
|
||||
// This function has guarantee that returned buf will be valid until
|
||||
// the next call. So we can store a Vec in each Safekeeper and reuse
|
||||
// it on the next call.
|
||||
let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default();
|
||||
|
||||
inbuf.clear();
|
||||
inbuf.extend_from_slice(res);
|
||||
|
||||
// Put a Vec back to sk->inbuf and return data ptr.
|
||||
*buf = store_vec_u8(&mut (*sk).inbuf, inbuf);
|
||||
*amount = res.len() as i32;
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_async_write(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *const ::std::os::raw::c_void,
|
||||
size: usize,
|
||||
) -> PGAsyncWriteResult {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts(buf as *const u8, size);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_async_write(&mut (*sk), buf)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_blocking_write(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *const ::std::os::raw::c_void,
|
||||
size: usize,
|
||||
) -> bool {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts(buf as *const u8, size);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_blocking_write(&mut (*sk), buf)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn recovery_download(
|
||||
sk: *mut Safekeeper,
|
||||
_timeline: TimeLineID,
|
||||
startpos: XLogRecPtr,
|
||||
endpos: XLogRecPtr,
|
||||
) -> bool {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).recovery_download(&mut (*sk), startpos, endpos)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wal_read(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *mut ::std::os::raw::c_char,
|
||||
startptr: XLogRecPtr,
|
||||
count: Size,
|
||||
) {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).wal_read(&mut (*sk), buf, startptr)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).wal_reader_allocate(&mut (*sk));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn free_event_set(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).free_event_set(&mut (*wp));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn init_event_set(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).init_event_set(&mut (*wp));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).update_event_set(&mut (*sk), events);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).add_safekeeper_event_set(&mut (*sk), events);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wait_event_set(
|
||||
wp: *mut WalProposer,
|
||||
timeout: ::std::os::raw::c_long,
|
||||
event_sk: *mut *mut Safekeeper,
|
||||
events: *mut uint32,
|
||||
) -> ::std::os::raw::c_int {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
let result = (*api).wait_event_set(&mut (*wp), timeout);
|
||||
match result {
|
||||
WaitResult::Latch => {
|
||||
*event_sk = std::ptr::null_mut();
|
||||
*events = crate::bindings::WL_LATCH_SET;
|
||||
1
|
||||
}
|
||||
WaitResult::Timeout => {
|
||||
*event_sk = std::ptr::null_mut();
|
||||
*events = crate::bindings::WL_TIMEOUT;
|
||||
0
|
||||
}
|
||||
WaitResult::Network(sk, event_mask) => {
|
||||
*event_sk = sk;
|
||||
*events = event_mask;
|
||||
1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn strong_random(
|
||||
wp: *mut WalProposer,
|
||||
buf: *mut ::std::os::raw::c_void,
|
||||
len: usize,
|
||||
) -> bool {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, len);
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).strong_random(buf)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).get_redo_start_lsn()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).finish_sync_safekeepers(lsn)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).confirm_wal_streamed(&mut (*wp), lsn)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn log_internal(
|
||||
wp: *mut WalProposer,
|
||||
level: ::std::os::raw::c_int,
|
||||
line: *const ::std::os::raw::c_char,
|
||||
) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
let line = CStr::from_ptr(line);
|
||||
let line = line.to_str().unwrap();
|
||||
(*api).log_internal(&mut (*wp), Level::from(level as u32), line)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn after_election(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).after_election(&mut (*wp))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Level {
|
||||
Debug5,
|
||||
Debug4,
|
||||
Debug3,
|
||||
Debug2,
|
||||
Debug1,
|
||||
Log,
|
||||
Info,
|
||||
Notice,
|
||||
Warning,
|
||||
Error,
|
||||
Fatal,
|
||||
Panic,
|
||||
WPEvent,
|
||||
}
|
||||
|
||||
impl Level {
|
||||
pub fn from(elevel: u32) -> Level {
|
||||
use crate::bindings::*;
|
||||
|
||||
match elevel {
|
||||
DEBUG5 => Level::Debug5,
|
||||
DEBUG4 => Level::Debug4,
|
||||
DEBUG3 => Level::Debug3,
|
||||
DEBUG2 => Level::Debug2,
|
||||
DEBUG1 => Level::Debug1,
|
||||
LOG => Level::Log,
|
||||
INFO => Level::Info,
|
||||
NOTICE => Level::Notice,
|
||||
WARNING => Level::Warning,
|
||||
ERROR => Level::Error,
|
||||
FATAL => Level::Fatal,
|
||||
PANIC => Level::Panic,
|
||||
WPEVENT => Level::WPEvent,
|
||||
_ => panic!("unknown log level {}", elevel),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn create_api() -> walproposer_api {
|
||||
walproposer_api {
|
||||
get_shmem_state: Some(get_shmem_state),
|
||||
start_streaming: Some(start_streaming),
|
||||
get_flush_rec_ptr: Some(get_flush_rec_ptr),
|
||||
get_current_timestamp: Some(get_current_timestamp),
|
||||
conn_error_message: Some(conn_error_message),
|
||||
conn_status: Some(conn_status),
|
||||
conn_connect_start: Some(conn_connect_start),
|
||||
conn_connect_poll: Some(conn_connect_poll),
|
||||
conn_send_query: Some(conn_send_query),
|
||||
conn_get_query_result: Some(conn_get_query_result),
|
||||
conn_flush: Some(conn_flush),
|
||||
conn_finish: Some(conn_finish),
|
||||
conn_async_read: Some(conn_async_read),
|
||||
conn_async_write: Some(conn_async_write),
|
||||
conn_blocking_write: Some(conn_blocking_write),
|
||||
recovery_download: Some(recovery_download),
|
||||
wal_read: Some(wal_read),
|
||||
wal_reader_allocate: Some(wal_reader_allocate),
|
||||
free_event_set: Some(free_event_set),
|
||||
init_event_set: Some(init_event_set),
|
||||
update_event_set: Some(update_event_set),
|
||||
add_safekeeper_event_set: Some(add_safekeeper_event_set),
|
||||
wait_event_set: Some(wait_event_set),
|
||||
strong_random: Some(strong_random),
|
||||
get_redo_start_lsn: Some(get_redo_start_lsn),
|
||||
finish_sync_safekeepers: Some(finish_sync_safekeepers),
|
||||
process_safekeeper_feedback: Some(process_safekeeper_feedback),
|
||||
confirm_wal_streamed: Some(confirm_wal_streamed),
|
||||
log_internal: Some(log_internal),
|
||||
after_election: Some(after_election),
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Level {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "{:?}", self)
|
||||
}
|
||||
}
|
||||
|
||||
/// Take ownership of `Vec<u8>` from StringInfoData.
|
||||
pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
|
||||
if pg.data.is_null() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let ptr = pg.data as *mut u8;
|
||||
let length = pg.len as usize;
|
||||
let capacity = pg.maxlen as usize;
|
||||
|
||||
pg.data = std::ptr::null_mut();
|
||||
pg.len = 0;
|
||||
pg.maxlen = 0;
|
||||
|
||||
unsafe { Some(Vec::from_raw_parts(ptr, length, capacity)) }
|
||||
}
|
||||
|
||||
/// Store `Vec<u8>` in StringInfoData.
|
||||
fn store_vec_u8(pg: &mut StringInfoData, vec: Vec<u8>) -> *mut ::std::os::raw::c_char {
|
||||
let ptr = vec.as_ptr() as *mut ::std::os::raw::c_char;
|
||||
let length = vec.len();
|
||||
let capacity = vec.capacity();
|
||||
|
||||
assert!(pg.data.is_null());
|
||||
|
||||
pg.data = ptr;
|
||||
pg.len = length as i32;
|
||||
pg.maxlen = capacity as i32;
|
||||
|
||||
std::mem::forget(vec);
|
||||
|
||||
ptr
|
||||
}
|
||||
14
libs/walproposer/src/lib.rs
Normal file
14
libs/walproposer/src/lib.rs
Normal file
@@ -0,0 +1,14 @@
|
||||
/// Raw bindings generated by `bindgen` at build time; the generated source is
/// pulled in from `$OUT_DIR/bindings.rs`.
pub mod bindings {
    // The generated names follow the C naming style, so silence the Rust
    // naming-convention lints for this module.
    #![allow(non_upper_case_globals)]
    #![allow(non_camel_case_types)]
    #![allow(non_snake_case)]
    // bindgen creates some unsafe code with no doc comments.
    #![allow(clippy::missing_safety_doc)]
    // Noted at 1.63: bindgen code contains u32 -> u32 transmutes in many places.
    #![allow(clippy::useless_transmute)]

    include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
}
|
||||
|
||||
pub mod api_bindings;
|
||||
pub mod walproposer;
|
||||
485
libs/walproposer/src/walproposer.rs
Normal file
485
libs/walproposer/src/walproposer.rs
Normal file
@@ -0,0 +1,485 @@
|
||||
use std::ffi::CString;
|
||||
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::{
|
||||
api_bindings::{create_api, take_vec_u8, Level},
|
||||
bindings::{
|
||||
Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
|
||||
WalProposerStart,
|
||||
},
|
||||
};
|
||||
|
||||
/// Rust high-level wrapper for the C walproposer API. Many methods are not
/// required for simple cases, hence `todo!()` in the default implementations:
/// a method that is not overridden panics when it is called.
///
/// Each method has the same name as the `walproposer_api` callback that
/// dispatches to it.
///
/// Refer to `pgxn/neon/walproposer.h` for documentation.
pub trait ApiImpl {
    // NOTE(review): returns `&mut` from `&self`; implementors must make sure
    // the returned reference is not aliased.
    fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState {
        todo!()
    }

    fn start_streaming(&self, _startpos: u64) {
        todo!()
    }

    fn get_flush_rec_ptr(&self) -> u64 {
        todo!()
    }

    fn get_current_timestamp(&self) -> i64 {
        todo!()
    }

    // --- Safekeeper connection callbacks ---

    /// Error message for the connection of `_sk`, as an owned string.
    fn conn_error_message(&self, _sk: &mut Safekeeper) -> String {
        todo!()
    }

    fn conn_status(&self, _sk: &mut Safekeeper) -> crate::bindings::WalProposerConnStatusType {
        todo!()
    }

    fn conn_connect_start(&self, _sk: &mut Safekeeper) {
        todo!()
    }

    fn conn_connect_poll(
        &self,
        _sk: &mut Safekeeper,
    ) -> crate::bindings::WalProposerConnectPollStatusType {
        todo!()
    }

    fn conn_send_query(&self, _sk: &mut Safekeeper, _query: &str) -> bool {
        todo!()
    }

    fn conn_get_query_result(
        &self,
        _sk: &mut Safekeeper,
    ) -> crate::bindings::WalProposerExecStatusType {
        todo!()
    }

    fn conn_flush(&self, _sk: &mut Safekeeper) -> i32 {
        todo!()
    }

    fn conn_finish(&self, _sk: &mut Safekeeper) {
        todo!()
    }

    /// Returns the bytes that were read together with the read status.
    fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) {
        todo!()
    }

    fn conn_async_write(
        &self,
        _sk: &mut Safekeeper,
        _buf: &[u8],
    ) -> crate::bindings::PGAsyncWriteResult {
        todo!()
    }

    fn conn_blocking_write(&self, _sk: &mut Safekeeper, _buf: &[u8]) -> bool {
        todo!()
    }

    // --- WAL access callbacks ---

    fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
        todo!()
    }

    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
        todo!()
    }

    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
        todo!()
    }

    // --- Event set callbacks ---

    fn free_event_set(&self, _wp: &mut WalProposer) {
        todo!()
    }

    fn init_event_set(&self, _wp: &mut WalProposer) {
        todo!()
    }

    fn update_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
        todo!()
    }

    fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
        todo!()
    }

    /// Returns what the wait ended with; see [`WaitResult`].
    fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
        todo!()
    }

    // --- Miscellaneous callbacks ---

    fn strong_random(&self, _buf: &mut [u8]) -> bool {
        todo!()
    }

    fn get_redo_start_lsn(&self) -> u64 {
        todo!()
    }

    fn finish_sync_safekeepers(&self, _lsn: u64) {
        todo!()
    }

    fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
        todo!()
    }

    fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
        todo!()
    }

    /// Receives one log line together with its already-decoded [`Level`].
    fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
        todo!()
    }

    fn after_election(&self, _wp: &mut WalProposer) {
        todo!()
    }
}
|
||||
|
||||
pub enum WaitResult {
|
||||
Latch,
|
||||
Timeout,
|
||||
Network(*mut Safekeeper, u32),
|
||||
}
|
||||
|
||||
pub struct Config {
|
||||
/// Tenant and timeline id
|
||||
pub ttid: TenantTimelineId,
|
||||
/// List of safekeepers in format `host:port`
|
||||
pub safekeepers_list: Vec<String>,
|
||||
/// Safekeeper reconnect timeout in milliseconds
|
||||
pub safekeeper_reconnect_timeout: i32,
|
||||
/// Safekeeper connection timeout in milliseconds
|
||||
pub safekeeper_connection_timeout: i32,
|
||||
/// walproposer mode, finish when all safekeepers are synced or subscribe
|
||||
/// to WAL streaming
|
||||
pub sync_safekeepers: bool,
|
||||
}
|
||||
|
||||
/// WalProposer main struct. C methods are reexported as Rust functions.
|
||||
pub struct Wrapper {
|
||||
wp: *mut WalProposer,
|
||||
_safekeepers_list_vec: Vec<u8>,
|
||||
}
|
||||
|
||||
impl Wrapper {
|
||||
pub fn new(api: Box<dyn ApiImpl>, config: Config) -> Wrapper {
|
||||
let neon_tenant = CString::new(config.ttid.tenant_id.to_string())
|
||||
.unwrap()
|
||||
.into_raw();
|
||||
let neon_timeline = CString::new(config.ttid.timeline_id.to_string())
|
||||
.unwrap()
|
||||
.into_raw();
|
||||
|
||||
let mut safekeepers_list_vec = CString::new(config.safekeepers_list.join(","))
|
||||
.unwrap()
|
||||
.into_bytes_with_nul();
|
||||
assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
|
||||
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
|
||||
|
||||
let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;
|
||||
|
||||
let c_config = WalProposerConfig {
|
||||
neon_tenant,
|
||||
neon_timeline,
|
||||
safekeepers_list,
|
||||
safekeeper_reconnect_timeout: config.safekeeper_reconnect_timeout,
|
||||
safekeeper_connection_timeout: config.safekeeper_connection_timeout,
|
||||
wal_segment_size: WAL_SEGMENT_SIZE as i32, // default 16MB
|
||||
syncSafekeepers: config.sync_safekeepers,
|
||||
systemId: 0,
|
||||
pgTimeline: 1,
|
||||
callback_data,
|
||||
};
|
||||
let c_config = Box::into_raw(Box::new(c_config));
|
||||
|
||||
let api = create_api();
|
||||
let wp = unsafe { WalProposerCreate(c_config, api) };
|
||||
Wrapper {
|
||||
wp,
|
||||
_safekeepers_list_vec: safekeepers_list_vec,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
unsafe { WalProposerStart(self.wp) }
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Wrapper {
|
||||
fn drop(&mut self) {
|
||||
unsafe {
|
||||
let config = (*self.wp).config;
|
||||
drop(Box::from_raw(
|
||||
(*config).callback_data as *mut Box<dyn ApiImpl>,
|
||||
));
|
||||
drop(CString::from_raw((*config).neon_tenant));
|
||||
drop(CString::from_raw((*config).neon_timeline));
|
||||
drop(Box::from_raw(config));
|
||||
|
||||
for i in 0..(*self.wp).n_safekeepers {
|
||||
let sk = &mut (*self.wp).safekeeper[i as usize];
|
||||
take_vec_u8(&mut sk.inbuf);
|
||||
}
|
||||
|
||||
WalProposerFree(self.wp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{
|
||||
cell::Cell,
|
||||
sync::{atomic::AtomicUsize, mpsc::sync_channel},
|
||||
};
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::{api_bindings::Level, walproposer::Wrapper};
|
||||
|
||||
use super::ApiImpl;
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
struct WaitEventsData {
|
||||
sk: *mut crate::bindings::Safekeeper,
|
||||
event_mask: u32,
|
||||
}
|
||||
|
||||
struct MockImpl {
|
||||
// data to return from wait_event_set
|
||||
wait_events: Cell<WaitEventsData>,
|
||||
// walproposer->safekeeper messages
|
||||
expected_messages: Vec<Vec<u8>>,
|
||||
expected_ptr: AtomicUsize,
|
||||
// safekeeper->walproposer messages
|
||||
safekeeper_replies: Vec<Vec<u8>>,
|
||||
replies_ptr: AtomicUsize,
|
||||
// channel to send LSN to the main thread
|
||||
sync_channel: std::sync::mpsc::SyncSender<u64>,
|
||||
}
|
||||
|
||||
impl MockImpl {
|
||||
fn check_walproposer_msg(&self, msg: &[u8]) {
|
||||
let ptr = self
|
||||
.expected_ptr
|
||||
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
|
||||
|
||||
if ptr >= self.expected_messages.len() {
|
||||
panic!("unexpected message from walproposer");
|
||||
}
|
||||
|
||||
let expected_msg = &self.expected_messages[ptr];
|
||||
assert_eq!(msg, expected_msg.as_slice());
|
||||
}
|
||||
|
||||
fn next_safekeeper_reply(&self) -> &[u8] {
|
||||
let ptr = self
|
||||
.replies_ptr
|
||||
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
|
||||
|
||||
if ptr >= self.safekeeper_replies.len() {
|
||||
panic!("no more safekeeper replies");
|
||||
}
|
||||
|
||||
&self.safekeeper_replies[ptr]
|
||||
}
|
||||
}
|
||||
|
||||
impl ApiImpl for MockImpl {
|
||||
fn get_current_timestamp(&self) -> i64 {
|
||||
println!("get_current_timestamp");
|
||||
0
|
||||
}
|
||||
|
||||
fn conn_status(
|
||||
&self,
|
||||
_: &mut crate::bindings::Safekeeper,
|
||||
) -> crate::bindings::WalProposerConnStatusType {
|
||||
println!("conn_status");
|
||||
crate::bindings::WalProposerConnStatusType_WP_CONNECTION_OK
|
||||
}
|
||||
|
||||
fn conn_connect_start(&self, _: &mut crate::bindings::Safekeeper) {
|
||||
println!("conn_connect_start");
|
||||
}
|
||||
|
||||
fn conn_connect_poll(
|
||||
&self,
|
||||
_: &mut crate::bindings::Safekeeper,
|
||||
) -> crate::bindings::WalProposerConnectPollStatusType {
|
||||
println!("conn_connect_poll");
|
||||
crate::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK
|
||||
}
|
||||
|
||||
fn conn_send_query(&self, _: &mut crate::bindings::Safekeeper, query: &str) -> bool {
|
||||
println!("conn_send_query: {}", query);
|
||||
true
|
||||
}
|
||||
|
||||
fn conn_get_query_result(
|
||||
&self,
|
||||
_: &mut crate::bindings::Safekeeper,
|
||||
) -> crate::bindings::WalProposerExecStatusType {
|
||||
println!("conn_get_query_result");
|
||||
crate::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH
|
||||
}
|
||||
|
||||
fn conn_async_read(
|
||||
&self,
|
||||
_: &mut crate::bindings::Safekeeper,
|
||||
) -> (&[u8], crate::bindings::PGAsyncReadResult) {
|
||||
println!("conn_async_read");
|
||||
let reply = self.next_safekeeper_reply();
|
||||
println!("conn_async_read result: {:?}", reply);
|
||||
(
|
||||
reply,
|
||||
crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS,
|
||||
)
|
||||
}
|
||||
|
||||
fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool {
|
||||
println!("conn_blocking_write: {:?}", buf);
|
||||
self.check_walproposer_msg(buf);
|
||||
true
|
||||
}
|
||||
|
||||
fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
|
||||
println!("wal_reader_allocate")
|
||||
}
|
||||
|
||||
fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
|
||||
println!("free_event_set")
|
||||
}
|
||||
|
||||
fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
|
||||
println!("init_event_set")
|
||||
}
|
||||
|
||||
fn update_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
|
||||
println!(
|
||||
"update_event_set, sk={:?}, events_mask={:#b}",
|
||||
sk as *mut crate::bindings::Safekeeper, event_mask
|
||||
);
|
||||
self.wait_events.set(WaitEventsData { sk, event_mask });
|
||||
}
|
||||
|
||||
fn add_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
|
||||
println!(
|
||||
"add_safekeeper_event_set, sk={:?}, events_mask={:#b}",
|
||||
sk as *mut crate::bindings::Safekeeper, event_mask
|
||||
);
|
||||
self.wait_events.set(WaitEventsData { sk, event_mask });
|
||||
}
|
||||
|
||||
fn wait_event_set(
|
||||
&self,
|
||||
_: &mut crate::bindings::WalProposer,
|
||||
timeout_millis: i64,
|
||||
) -> super::WaitResult {
|
||||
let data = self.wait_events.get();
|
||||
println!(
|
||||
"wait_event_set, timeout_millis={}, res={:?}",
|
||||
timeout_millis, data
|
||||
);
|
||||
super::WaitResult::Network(data.sk, data.event_mask)
|
||||
}
|
||||
|
||||
fn strong_random(&self, buf: &mut [u8]) -> bool {
|
||||
println!("strong_random");
|
||||
buf.fill(0);
|
||||
true
|
||||
}
|
||||
|
||||
fn finish_sync_safekeepers(&self, lsn: u64) {
|
||||
self.sync_channel.send(lsn).unwrap();
|
||||
panic!("sync safekeepers finished at lsn={}", lsn);
|
||||
}
|
||||
|
||||
fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
|
||||
println!("walprop_log[{}] {}", level, msg);
|
||||
}
|
||||
|
||||
fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
|
||||
println!("after_election");
|
||||
}
|
||||
}
|
||||
|
||||
/// Test that walproposer can successfully connect to safekeeper and finish
/// sync_safekeepers. API is mocked in MockImpl.
///
/// Run this test with valgrind to detect leaks:
/// `valgrind --leak-check=full target/debug/deps/walproposer-<build>`
#[test]
fn test_simple_sync_safekeepers() -> anyhow::Result<()> {
    // The same hex id is used for both the tenant and the timeline.
    let ttid = TenantTimelineId::new(
        "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
        "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
    );

    let (sender, receiver) = sync_channel(1);

    // ---- walproposer -> safekeeper messages, in the order they must arrive ----

    // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
    let proposer_greeting: Vec<u8> = vec![
        103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
        147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
        188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
    ];
    // VoteRequest(VoteRequest { term: 3 })
    let vote_request: Vec<u8> = vec![
        118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
    ];

    // ---- safekeeper -> walproposer replies, served one per read ----

    // Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
    let acceptor_greeting: Vec<u8> = vec![
        103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
    ];
    // VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
    let vote_response: Vec<u8> = vec![
        118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
        5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
        0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
    ];

    let no_events_yet = WaitEventsData {
        sk: std::ptr::null_mut(),
        event_mask: 0,
    };

    let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
        wait_events: Cell::new(no_events_yet),
        expected_messages: vec![proposer_greeting, vote_request],
        expected_ptr: AtomicUsize::new(0),
        safekeeper_replies: vec![acceptor_greeting, vote_response],
        replies_ptr: AtomicUsize::new(0),
        sync_channel: sender,
    });

    let config = crate::walproposer::Config {
        ttid,
        safekeepers_list: vec!["localhost:5000".to_string()],
        safekeeper_reconnect_timeout: 1000,
        safekeeper_connection_timeout: 10000,
        sync_safekeepers: true,
    };

    let wp = Wrapper::new(my_impl, config);

    // walproposer will panic when it finishes sync_safekeepers
    std::panic::catch_unwind(|| wp.start()).unwrap_err();

    // validate the resulting LSN (1337 == 0x539, the LSN in the vote response)
    let synced_lsn = receiver.recv()?;
    assert_eq!(synced_lsn, 1337);

    Ok(())
    // drop() will free up resources here
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
use std::env::{var, VarError};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{env, ops::ControlFlow, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
@@ -200,6 +201,51 @@ fn initialize_config(
|
||||
})
|
||||
}
|
||||
|
||||
/// What `wait_for_phase` hands back to its caller.
struct WaitForPhaseResult<F: std::future::Future + Unpin> {
    // Budget left for the following phases (zero once it is used up).
    timeout_remaining: Duration,
    // The phase's future, returned only when the phase timed out, so the
    // caller can still drive it to completion later.
    skipped: Option<F>,
}
|
||||
|
||||
/// During startup, we apply a timeout to our waits for readiness, to avoid
|
||||
/// stalling the whole service if one Tenant experiences some problem. Each
|
||||
/// phase may consume some of the timeout: this function returns the updated
|
||||
/// timeout for use in the next call.
|
||||
async fn wait_for_phase<F>(phase: &str, mut fut: F, timeout: Duration) -> WaitForPhaseResult<F>
|
||||
where
|
||||
F: std::future::Future + Unpin,
|
||||
{
|
||||
let initial_t = Instant::now();
|
||||
let skipped = match tokio::time::timeout(timeout, &mut fut).await {
|
||||
Ok(_) => None,
|
||||
Err(_) => {
|
||||
tracing::info!(
|
||||
timeout_millis = timeout.as_millis(),
|
||||
%phase,
|
||||
"Startup phase timed out, proceeding anyway"
|
||||
);
|
||||
Some(fut)
|
||||
}
|
||||
};
|
||||
|
||||
WaitForPhaseResult {
|
||||
timeout_remaining: timeout
|
||||
.checked_sub(Instant::now().duration_since(initial_t))
|
||||
.unwrap_or(Duration::ZERO),
|
||||
skipped,
|
||||
}
|
||||
}
|
||||
|
||||
fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
|
||||
let elapsed = started_at.elapsed();
|
||||
let secs = elapsed.as_secs_f64();
|
||||
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
|
||||
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"{human_phase} ({secs:.3}s since start)"
|
||||
)
|
||||
}
|
||||
|
||||
fn start_pageserver(
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
conf: &'static PageServerConf,
|
||||
@@ -207,16 +253,6 @@ fn start_pageserver(
|
||||
// Monotonic time for later calculating startup duration
|
||||
let started_startup_at = Instant::now();
|
||||
|
||||
let startup_checkpoint = move |phase: &str, human_phase: &str| {
|
||||
let elapsed = started_startup_at.elapsed();
|
||||
let secs = elapsed.as_secs_f64();
|
||||
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"{human_phase} ({secs:.3}s since start)"
|
||||
)
|
||||
};
|
||||
|
||||
// Print version and launch timestamp to the log,
|
||||
// and expose them as prometheus metrics.
|
||||
// A changed version string indicates changed software.
|
||||
@@ -341,7 +377,7 @@ fn start_pageserver(
|
||||
|
||||
// Up to this point no significant I/O has been done: this should have been fast. Record
|
||||
// duration prior to starting I/O intensive phase of startup.
|
||||
startup_checkpoint("initial", "Starting loading tenants");
|
||||
startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
|
||||
STARTUP_IS_LOADING.set(1);
|
||||
|
||||
// Startup staging or optimizing:
|
||||
@@ -361,18 +397,12 @@ fn start_pageserver(
|
||||
let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
|
||||
|
||||
let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
|
||||
let (tenants_can_start, tenants_can_start_barrier) = utils::completion::channel();
|
||||
|
||||
tracing::info!("init_remote_done_tx:");
|
||||
let c = init_remote_done_tx.clone();
|
||||
drop(c);
|
||||
|
||||
let order = pageserver::InitializationOrder {
|
||||
initial_tenant_load_remote: Some(init_done_tx),
|
||||
initial_tenant_load: Some(init_remote_done_tx),
|
||||
initial_logical_size_can_start: init_done_rx.clone(),
|
||||
initial_logical_size_attempt: Some(init_logical_size_done_tx),
|
||||
tenants_can_start: tenants_can_start_barrier.clone(),
|
||||
background_jobs_can_start: background_jobs_barrier.clone(),
|
||||
};
|
||||
|
||||
@@ -394,60 +424,93 @@ fn start_pageserver(
|
||||
let shutdown_pageserver = shutdown_pageserver.clone();
|
||||
let drive_init = async move {
|
||||
// NOTE: unlike many futures in pageserver, this one is cancellation-safe
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
|
||||
let guard = scopeguard::guard_on_success((), |_| {
|
||||
tracing::info!("Cancelled before initial load completed")
|
||||
});
|
||||
|
||||
init_remote_done_rx.wait().await;
|
||||
startup_checkpoint("initial_tenant_load_remote", "Remote part of initial load completed");
|
||||
let timeout = conf.background_task_maximum_delay;
|
||||
|
||||
drop(tenants_can_start);
|
||||
let init_remote_done = std::pin::pin!(async {
|
||||
init_remote_done_rx.wait().await;
|
||||
startup_checkpoint(
|
||||
started_startup_at,
|
||||
"initial_tenant_load_remote",
|
||||
"Remote part of initial load completed",
|
||||
);
|
||||
});
|
||||
|
||||
init_done_rx.wait().await;
|
||||
startup_checkpoint("initial_tenant_load", "Initial load completed");
|
||||
STARTUP_IS_LOADING.set(0);
|
||||
let WaitForPhaseResult {
|
||||
timeout_remaining: timeout,
|
||||
skipped: init_remote_skipped,
|
||||
} = wait_for_phase("initial_tenant_load_remote", init_remote_done, timeout).await;
|
||||
|
||||
let init_load_done = std::pin::pin!(async {
|
||||
init_done_rx.wait().await;
|
||||
startup_checkpoint(
|
||||
started_startup_at,
|
||||
"initial_tenant_load",
|
||||
"Initial load completed",
|
||||
);
|
||||
STARTUP_IS_LOADING.set(0);
|
||||
});
|
||||
|
||||
let WaitForPhaseResult {
|
||||
timeout_remaining: timeout,
|
||||
skipped: init_load_skipped,
|
||||
} = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;
|
||||
|
||||
// initial logical sizes can now start, as they were waiting on init_done_rx.
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
|
||||
let guard = scopeguard::guard_on_success((), |_| {
|
||||
tracing::info!("Cancelled before initial logical sizes completed")
|
||||
});
|
||||
|
||||
let timeout = conf.background_task_maximum_delay;
|
||||
let logical_sizes_done = std::pin::pin!(async {
|
||||
init_logical_size_done_rx.wait().await;
|
||||
startup_checkpoint(
|
||||
started_startup_at,
|
||||
"initial_logical_sizes",
|
||||
"Initial logical sizes completed",
|
||||
);
|
||||
});
|
||||
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
|
||||
|
||||
let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
|
||||
Ok(_) => {
|
||||
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
|
||||
None
|
||||
}
|
||||
Err(_) => {
|
||||
tracing::info!(
|
||||
timeout_millis = timeout.as_millis(),
|
||||
"Initial logical size timeout elapsed; starting background jobs"
|
||||
);
|
||||
Some(init_sizes_done)
|
||||
}
|
||||
};
|
||||
let WaitForPhaseResult {
|
||||
timeout_remaining: _,
|
||||
skipped: logical_sizes_skipped,
|
||||
} = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
// allow background jobs to start
|
||||
// allow background jobs to start: we either completed prior stages, or they reached timeout
|
||||
// and were skipped. It is important that we do not let them block background jobs indefinitely,
|
||||
// because things like consumption metrics for billing are blocked by this barrier.
|
||||
drop(background_jobs_can_start);
|
||||
startup_checkpoint("background_jobs_can_start", "Starting background jobs");
|
||||
|
||||
if let Some(init_sizes_done) = init_sizes_done {
|
||||
// ending up here is not a bug; at the latest logical sizes will be queried by
|
||||
// consumption metrics.
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
|
||||
init_sizes_done.await;
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
|
||||
startup_checkpoint(
|
||||
started_startup_at,
|
||||
"background_jobs_can_start",
|
||||
"Starting background jobs",
|
||||
);
|
||||
|
||||
// We are done. If we skipped any phases due to timeout, run them to completion here so that
|
||||
// they will eventually update their startup_checkpoint, and so that we do not declare the
|
||||
// 'complete' stage until all the other stages are really done.
|
||||
let guard = scopeguard::guard_on_success((), |_| {
|
||||
tracing::info!("Cancelled before waiting for skipped phases done")
|
||||
});
|
||||
if let Some(f) = init_remote_skipped {
|
||||
f.await;
|
||||
}
|
||||
if let Some(f) = init_load_skipped {
|
||||
f.await;
|
||||
}
|
||||
if let Some(f) = logical_sizes_skipped {
|
||||
f.await;
|
||||
}
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
startup_checkpoint("complete", "Startup complete");
|
||||
startup_checkpoint(started_startup_at, "complete", "Startup complete");
|
||||
};
|
||||
|
||||
async move {
|
||||
|
||||
@@ -33,7 +33,8 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
|
||||
TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
|
||||
@@ -632,6 +633,11 @@ impl PageServerConf {
|
||||
self.tenants_path().join(tenant_id.to_string())
|
||||
}
|
||||
|
||||
pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_id)
|
||||
.join(TENANT_ATTACHING_MARKER_FILENAME)
|
||||
}
|
||||
|
||||
pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ use reqwest::Url;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use tokio::time::Instant;
|
||||
use tracing::*;
|
||||
use utils::id::NodeId;
|
||||
|
||||
@@ -88,22 +89,12 @@ pub async fn collect_metrics(
|
||||
|
||||
let node_id = node_id.to_string();
|
||||
|
||||
// reminder: ticker is ready immediately
|
||||
let mut ticker = tokio::time::interval(metric_collection_interval);
|
||||
|
||||
loop {
|
||||
let tick_at = tokio::select! {
|
||||
_ = cancel.cancelled() => return Ok(()),
|
||||
tick_at = ticker.tick() => tick_at,
|
||||
};
|
||||
let started_at = Instant::now();
|
||||
|
||||
// these are point in time, with variable "now"
|
||||
let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;
|
||||
|
||||
if metrics.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let metrics = Arc::new(metrics);
|
||||
|
||||
// why not race cancellation here? because we are one of the last tasks, and if we are
|
||||
@@ -142,10 +133,19 @@ pub async fn collect_metrics(
|
||||
let (_, _) = tokio::join!(flush, upload);
|
||||
|
||||
crate::tenant::tasks::warn_when_period_overrun(
|
||||
tick_at.elapsed(),
|
||||
started_at.elapsed(),
|
||||
metric_collection_interval,
|
||||
BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
|
||||
);
|
||||
|
||||
let res = tokio::time::timeout_at(
|
||||
started_at + metric_collection_interval,
|
||||
task_mgr::shutdown_token().cancelled(),
|
||||
)
|
||||
.await;
|
||||
if res.is_ok() {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -244,16 +244,14 @@ async fn calculate_synthetic_size_worker(
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("starting calculate_synthetic_size_worker");
|
||||
scopeguard::defer! {
|
||||
info!("calculate_synthetic_size_worker stopped");
|
||||
};
|
||||
|
||||
// reminder: ticker is ready immediately
|
||||
let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
|
||||
let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
|
||||
|
||||
loop {
|
||||
let tick_at = tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => return Ok(()),
|
||||
tick_at = ticker.tick() => tick_at,
|
||||
};
|
||||
let started_at = Instant::now();
|
||||
|
||||
let tenants = match mgr::list_tenants().await {
|
||||
Ok(tenants) => tenants,
|
||||
@@ -281,9 +279,18 @@ async fn calculate_synthetic_size_worker(
|
||||
}
|
||||
|
||||
crate::tenant::tasks::warn_when_period_overrun(
|
||||
tick_at.elapsed(),
|
||||
started_at.elapsed(),
|
||||
synthetic_size_calculation_interval,
|
||||
BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
|
||||
);
|
||||
|
||||
let res = tokio::time::timeout_at(
|
||||
started_at + synthetic_size_calculation_interval,
|
||||
task_mgr::shutdown_token().cancelled(),
|
||||
)
|
||||
.await;
|
||||
if res.is_ok() {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -186,8 +186,6 @@ pub struct InitializationOrder {
|
||||
/// attempt. It is important to drop this once the attempt has completed.
|
||||
pub initial_logical_size_attempt: Option<utils::completion::Completion>,
|
||||
|
||||
pub tenants_can_start: utils::completion::Barrier,
|
||||
|
||||
/// Barrier for when we can start any background jobs.
|
||||
///
|
||||
/// This can be broken up later on, but right now there is just one class of a background job.
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -3,10 +3,10 @@ use std::sync::Arc;
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, instrument, warn, Instrument, Span};
|
||||
use tracing::{error, info, instrument, warn, Instrument, Span};
|
||||
|
||||
use utils::{
|
||||
backoff, completion, crashsafe, fs_ext,
|
||||
@@ -25,9 +25,11 @@ use super::{
|
||||
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||
span,
|
||||
timeline::delete::DeleteTimelineFlow,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant,
|
||||
};
|
||||
|
||||
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum DeleteTenantError {
|
||||
#[error("GetTenant {0}")]
|
||||
@@ -58,7 +60,7 @@ fn remote_tenant_delete_mark_path(
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.context("tenant path")?;
|
||||
Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
|
||||
Ok(tenant_remote_path.join(Utf8Path::new("deleted")))
|
||||
}
|
||||
|
||||
async fn create_remote_delete_mark(
|
||||
@@ -237,6 +239,32 @@ async fn cleanup_remaining_fs_traces(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn remote_delete_mark_exists(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<bool> {
|
||||
// If remote storage is there we rely on it
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
|
||||
|
||||
let result = backoff::retry(
|
||||
|| async { remote_storage.download(&remote_mark_path).await },
|
||||
|e| matches!(e, DownloadError::NotFound),
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
"fetch_tenant_deletion_mark",
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(_) => Ok(true),
|
||||
Err(DownloadError::NotFound) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
|
||||
}
|
||||
}
|
||||
|
||||
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
@@ -248,9 +276,10 @@ async fn cleanup_remaining_fs_traces(
|
||||
/// 6. Remove remote mark
|
||||
/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
|
||||
/// It is resumable from any step in case a crash/restart occurs.
|
||||
/// There are two entrypoints to the process:
|
||||
/// There are three entrypoints to the process:
|
||||
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
|
||||
/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
|
||||
/// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
|
||||
/// 3. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
|
||||
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
|
||||
#[derive(Default)]
|
||||
pub enum DeleteTenantFlow {
|
||||
@@ -349,7 +378,7 @@ impl DeleteTenantFlow {
|
||||
|
||||
pub(crate) async fn should_resume_deletion(
|
||||
conf: &'static PageServerConf,
|
||||
remote_mark_exists: bool,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant: &Tenant,
|
||||
) -> Result<Option<DeletionGuard>, DeleteTenantError> {
|
||||
let acquire = |t: &Tenant| {
|
||||
@@ -360,24 +389,66 @@ impl DeleteTenantFlow {
|
||||
)
|
||||
};
|
||||
|
||||
if remote_mark_exists {
|
||||
let tenant_id = tenant.tenant_id;
|
||||
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
|
||||
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
|
||||
return Ok(acquire(tenant));
|
||||
}
|
||||
|
||||
let tenant_id = tenant.tenant_id;
|
||||
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
|
||||
match tokio::fs::metadata(conf.tenant_deleted_mark_file_path(&tenant_id)).await {
|
||||
Ok(_) => Ok(acquire(tenant)),
|
||||
Err(_) => Ok(None),
|
||||
let remote_storage = match remote_storage {
|
||||
Some(remote_storage) => remote_storage,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
|
||||
Ok(acquire(tenant))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn resume_from_load(
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
|
||||
tenant
|
||||
.set_stopping(progress, true, false)
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
// Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
|
||||
let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
|
||||
if let Some(background) = background_jobs_can_start {
|
||||
info!("waiting for backgound jobs barrier");
|
||||
background.clone().wait().await;
|
||||
info!("ready for backgound jobs barrier");
|
||||
}
|
||||
|
||||
// Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e.g. remove timelines dir)
|
||||
let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
|
||||
if timelines_path.exists() {
|
||||
tenant.load(init_order, None, ctx).await.context("load")?;
|
||||
}
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
tenant.conf,
|
||||
tenant.remote_storage.clone(),
|
||||
tenants,
|
||||
tenant,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn resume_from_attach(
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
preload: Option<TenantPreload>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
init_order: Option<InitializationOrder>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
@@ -388,7 +459,7 @@ impl DeleteTenantFlow {
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
tenant
|
||||
.attach(init_order, preload, ctx)
|
||||
.attach(ctx, super::AttachMarkerMode::Expect)
|
||||
.await
|
||||
.context("attach")?;
|
||||
|
||||
|
||||
@@ -26,7 +26,10 @@ use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
|
||||
use crate::tenant::{
|
||||
create_tenant_files, AttachMarkerMode, AttachedTenantConf, CreateTenantFilesMode, Tenant,
|
||||
TenantState,
|
||||
};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
@@ -329,12 +332,7 @@ async fn init_load_tenant_configs(
|
||||
.read_dir_utf8()
|
||||
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
|
||||
|
||||
let mut result = Vec::new();
|
||||
for dentry in dir_entries {
|
||||
result.push(dentry?);
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
Ok(dir_entries.collect::<Result<Vec<_>, std::io::Error>>()?)
|
||||
})
|
||||
.await??;
|
||||
|
||||
@@ -346,10 +344,9 @@ async fn init_load_tenant_configs(
|
||||
}
|
||||
|
||||
while let Some(r) = join_set.join_next().await {
|
||||
match r?? {
|
||||
Some((tenant_id, tenant_config)) => configs.insert(tenant_id, tenant_config),
|
||||
None => None,
|
||||
};
|
||||
if let Some((tenant_id, tenant_config)) = r?? {
|
||||
configs.insert(tenant_id, tenant_config);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(configs)
|
||||
@@ -499,24 +496,45 @@ pub(crate) fn schedule_local_tenant_processing(
|
||||
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
||||
);
|
||||
|
||||
info!("Attaching tenant {tenant_id}");
|
||||
let tenant = match Tenant::spawn(
|
||||
conf,
|
||||
tenant_id,
|
||||
resources,
|
||||
location_conf,
|
||||
init_order,
|
||||
tenants,
|
||||
SpawnMode::Normal,
|
||||
ctx,
|
||||
) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to spawn tenant {tenant_id}, reason: {e:#}");
|
||||
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
|
||||
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
|
||||
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
|
||||
if resources.remote_storage.is_none() {
|
||||
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
|
||||
Tenant::create_broken_tenant(
|
||||
conf,
|
||||
tenant_id,
|
||||
"attaching mark file present but no remote storage configured".to_string(),
|
||||
)
|
||||
} else {
|
||||
match Tenant::spawn_attach(
|
||||
conf,
|
||||
tenant_id,
|
||||
resources,
|
||||
location_conf,
|
||||
tenants,
|
||||
AttachMarkerMode::Expect,
|
||||
ctx,
|
||||
) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
|
||||
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
|
||||
// Start loading the tenant into memory. It will initially be in Loading state.
|
||||
Tenant::spawn_load(
|
||||
conf,
|
||||
tenant_id,
|
||||
location_conf,
|
||||
resources,
|
||||
init_order,
|
||||
tenants,
|
||||
ctx,
|
||||
)
|
||||
};
|
||||
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
@@ -658,13 +676,13 @@ pub(crate) async fn create_tenant(
|
||||
// We're holding the tenants lock in write mode while doing local IO.
|
||||
// If this section ever becomes contentious, introduce a new `TenantState::Creating`
|
||||
// and do the work in that state.
|
||||
super::create_tenant_files(conf, &location_conf, &tenant_id).await?;
|
||||
|
||||
let tenant_directory = super::create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
|
||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
let created_tenant = Tenant::spawn(conf, tenant_id, resources,
|
||||
AttachedTenantConf::try_from(location_conf)?, None, &TENANTS, SpawnMode::Create, ctx)?;
|
||||
let created_tenant =
|
||||
schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
|
||||
AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -815,7 +833,7 @@ pub(crate) async fn upsert_location(
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
|
||||
let tenant = match Tenant::spawn(
|
||||
let tenant = match Tenant::spawn_attach(
|
||||
conf,
|
||||
tenant_id,
|
||||
TenantSharedResources {
|
||||
@@ -824,14 +842,16 @@ pub(crate) async fn upsert_location(
|
||||
deletion_queue_client,
|
||||
},
|
||||
AttachedTenantConf::try_from(new_location_config)?,
|
||||
None,
|
||||
&TENANTS,
|
||||
SpawnMode::Normal,
|
||||
// The LocationConf API does not use marker files, because we have Secondary
|
||||
// locations where the directory's existence is not a signal that it contains
|
||||
// all timelines. See https://github.com/neondatabase/neon/issues/5550
|
||||
AttachMarkerMode::Ignore,
|
||||
ctx,
|
||||
) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to spawn tenant {tenant_id}, reason: {e:#}");
|
||||
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
|
||||
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
|
||||
}
|
||||
};
|
||||
@@ -1097,10 +1117,17 @@ pub(crate) async fn attach_tenant(
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
tenant_map_insert(tenant_id, || async {
|
||||
let location_conf = LocationConf::attached_single(tenant_conf, generation);
|
||||
let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?;
|
||||
let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
|
||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
// Without the attach marker, schedule_local_tenant_processing will treat the attached tenant as fully attached
|
||||
let marker_file_exists = conf
|
||||
.tenant_attaching_mark_file_path(&tenant_id)
|
||||
.try_exists()
|
||||
.context("check for attach marker file existence")?;
|
||||
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
|
||||
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -170,14 +170,36 @@
|
||||
//! - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
|
||||
//! for layers that are referenced by `IndexPart` but not present locally
|
||||
//! - schedule uploads for layers that are only present locally.
|
||||
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
|
||||
//! the local filesystem, write the remote metadata to the local filesystem
|
||||
//! - After the above is done for each timeline, open the tenant for business by
|
||||
//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
|
||||
//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
|
||||
//!
|
||||
//! We keep track of the fact that a tenant is in `Attaching` state in a marker
|
||||
//! file on the local disk. This is critical because, when we restart the pageserver,
|
||||
//! we do not want to do the `List timelines` step for each tenant that has already
|
||||
//! been successfully attached (for performance & cost reasons).
|
||||
//! Instead, for a tenant without the attach marker file, we assume that the
|
||||
//! local state is in sync or ahead of the remote state. This includes the list
|
||||
//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
|
||||
//! if there's a timeline on the remote that the pageserver doesn't know about,
|
||||
//! the GC will not consider its branch point, leading to data loss.
|
||||
//! So, for a tenant with the attach marker file, we know that we have not yet
|
||||
//! persisted all the remote timelines' metadata files locally. To exclude the
|
||||
//! risk above, we re-run the procedure for such tenants.
|
||||
//!
|
||||
//! # Operating Without Remote Storage
|
||||
//!
|
||||
//! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
|
||||
//! not created and the uploads are skipped.
|
||||
//! Theoretically, it should be ok to remove and re-add remote storage configuration to
|
||||
//! the pageserver config at any time, since it doesn't make a difference to
|
||||
//! [`Timeline::load_layer_map`].
|
||||
//! Of course, the remote timeline dir must not change while we have de-configured
|
||||
//! remote storage, i.e., the pageserver must remain the owner of the given prefix
|
||||
//! in remote storage.
|
||||
//! But note that we don't test any of this right now.
|
||||
//!
|
||||
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
|
||||
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
|
||||
@@ -446,10 +468,7 @@ impl RemoteTimelineClient {
|
||||
//
|
||||
|
||||
/// Download index file
|
||||
pub async fn download_index_file(
|
||||
&self,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<MaybeDeletedIndexPart, DownloadError> {
|
||||
pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
|
||||
let _unfinished_gauge_guard = self.metrics.call_begin(
|
||||
&RemoteOpFileKind::Index,
|
||||
&RemoteOpKind::Download,
|
||||
@@ -463,7 +482,6 @@ impl RemoteTimelineClient {
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
self.generation,
|
||||
cancel,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
@@ -1637,11 +1655,7 @@ mod tests {
|
||||
let client = timeline.remote_client.as_ref().unwrap();
|
||||
|
||||
// Download back the index.json, and check that the list of files is correct
|
||||
let initial_index_part = match client
|
||||
.download_index_file(CancellationToken::new())
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
let initial_index_part = match client.download_index_file().await.unwrap() {
|
||||
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
|
||||
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
|
||||
};
|
||||
@@ -1733,11 +1747,7 @@ mod tests {
|
||||
}
|
||||
|
||||
// Download back the index.json, and check that the list of files is correct
|
||||
let index_part = match client
|
||||
.download_index_file(CancellationToken::new())
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
let index_part = match client.download_index_file().await.unwrap() {
|
||||
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
|
||||
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
|
||||
};
|
||||
@@ -1928,7 +1938,7 @@ mod tests {
|
||||
let client = test_state.build_client(get_generation);
|
||||
|
||||
let download_r = client
|
||||
.download_index_file(CancellationToken::new())
|
||||
.download_index_file()
|
||||
.await
|
||||
.expect("download should always succeed");
|
||||
assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
|
||||
|
||||
@@ -18,8 +18,8 @@ use crate::config::PageServerConf;
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::Generation;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
|
||||
use crate::tenant::{Generation, TENANT_DELETED_MARKER_FILE_NAME};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
@@ -170,52 +170,53 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
|
||||
pub async fn list_remote_timelines(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: TenantId,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
|
||||
) -> anyhow::Result<HashSet<TimelineId>> {
|
||||
let remote_path = remote_timelines_path(&tenant_id);
|
||||
|
||||
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
|
||||
anyhow::bail!("storage-sync-list-remote-timelines");
|
||||
});
|
||||
|
||||
let listing = download_retry_forever(
|
||||
|| storage.list(Some(&remote_path), ListingMode::WithDelimiter),
|
||||
&format!("list timelines for {tenant_id}"),
|
||||
cancel,
|
||||
let timelines = download_retry(
|
||||
|| storage.list_prefixes(Some(&remote_path)),
|
||||
&format!("list prefixes for {tenant_id}"),
|
||||
)
|
||||
.await?;
|
||||
|
||||
if timelines.is_empty() {
|
||||
anyhow::bail!("no timelines found on the remote storage")
|
||||
}
|
||||
|
||||
let mut timeline_ids = HashSet::new();
|
||||
let mut other_prefixes = HashSet::new();
|
||||
|
||||
tracing::info!("list_remote_timelines prefixes:");
|
||||
for p in &listing.prefixes {
|
||||
tracing::info!(" '{p}'");
|
||||
}
|
||||
tracing::info!("list_remote_timelines keys:");
|
||||
for p in &listing.keys {
|
||||
tracing::info!(" '{p}'");
|
||||
}
|
||||
for timeline_remote_storage_key in timelines {
|
||||
if timeline_remote_storage_key.object_name() == Some(TENANT_DELETED_MARKER_FILE_NAME) {
|
||||
// A `deleted` key within `timelines/` is a marker file, not a timeline. Ignore it.
|
||||
// This code will be removed in https://github.com/neondatabase/neon/pull/5580
|
||||
continue;
|
||||
}
|
||||
|
||||
for timeline_remote_storage_key in listing.prefixes {
|
||||
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
|
||||
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
|
||||
})?;
|
||||
|
||||
match object_name.parse::<TimelineId>() {
|
||||
Ok(t) => timeline_ids.insert(t),
|
||||
Err(_) => other_prefixes.insert(object_name.to_string()),
|
||||
};
|
||||
let timeline_id: TimelineId = object_name
|
||||
.parse()
|
||||
.with_context(|| format!("parse object name into timeline id '{object_name}'"))?;
|
||||
|
||||
// list_prefixes is assumed to return unique names. Ensure this here.
|
||||
// NB: it's safer to bail out than warn-log this because the pageserver
|
||||
// needs to absolutely know about _all_ timelines that exist, so that
|
||||
// GC knows all the branchpoints. If we skipped over a timeline instead,
|
||||
// GC could delete a layer that's still needed by that timeline.
|
||||
anyhow::ensure!(
|
||||
!timeline_ids.contains(&timeline_id),
|
||||
"list_prefixes contains duplicate timeline id {timeline_id}"
|
||||
);
|
||||
timeline_ids.insert(timeline_id);
|
||||
}
|
||||
|
||||
for key in listing.keys {
|
||||
let object_name = key
|
||||
.object_name()
|
||||
.ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?;
|
||||
other_prefixes.insert(object_name.to_string());
|
||||
}
|
||||
|
||||
Ok((timeline_ids, other_prefixes))
|
||||
Ok(timeline_ids)
|
||||
}
|
||||
|
||||
async fn do_download_index_part(
|
||||
@@ -223,11 +224,10 @@ async fn do_download_index_part(
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
index_generation: Generation,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
|
||||
|
||||
let index_part_bytes = download_retry_forever(
|
||||
let index_part_bytes = download_retry(
|
||||
|| async {
|
||||
let mut index_part_download = storage.download(&remote_path).await?;
|
||||
|
||||
@@ -242,7 +242,6 @@ async fn do_download_index_part(
|
||||
Ok(index_part_bytes)
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -264,28 +263,19 @@ pub(super) async fn download_index_part(
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
my_generation: Generation,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
if my_generation.is_none() {
|
||||
// Operating without generations: just fetch the generation-less path
|
||||
return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel)
|
||||
.await;
|
||||
return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
|
||||
}
|
||||
|
||||
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote
|
||||
// index in our generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res = do_download_index_part(
|
||||
storage,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
my_generation,
|
||||
cancel.clone(),
|
||||
)
|
||||
.await;
|
||||
let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
|
||||
match res {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!(
|
||||
@@ -305,14 +295,8 @@ pub(super) async fn download_index_part(
|
||||
// we want to find the most recent index from a previous generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res = do_download_index_part(
|
||||
storage,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
my_generation.previous(),
|
||||
cancel.clone(),
|
||||
)
|
||||
.await;
|
||||
let res =
|
||||
do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
|
||||
match res {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!("Found index_part from previous generation");
|
||||
@@ -356,14 +340,13 @@ pub(super) async fn download_index_part(
|
||||
match max_previous_generation {
|
||||
Some(g) => {
|
||||
tracing::debug!("Found index_part in generation {g:?}");
|
||||
do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await
|
||||
do_download_index_part(storage, tenant_id, timeline_id, g).await
|
||||
}
|
||||
None => {
|
||||
// Migration from legacy pre-generation state: we have a generation but no prior
|
||||
// attached pageservers did. Try to load from a no-generation path.
|
||||
tracing::info!("No index_part.json* found");
|
||||
do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel)
|
||||
.await
|
||||
do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -393,23 +376,3 @@ where
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn download_retry_forever<T, O, F>(
|
||||
op: O,
|
||||
description: &str,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<T, DownloadError>
|
||||
where
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, DownloadError>>,
|
||||
{
|
||||
backoff::retry(
|
||||
op,
|
||||
|e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
u32::MAX,
|
||||
description,
|
||||
backoff::Cancel::new(cancel, || DownloadError::Cancelled),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -294,7 +294,6 @@ async fn cleanup_remaining_timeline_fs_traces(
|
||||
// Remove delete mark
|
||||
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.context("remove delete mark")
|
||||
}
|
||||
|
||||
|
||||
@@ -27,13 +27,14 @@ use std::collections::VecDeque;
|
||||
use std::io;
|
||||
use std::io::prelude::*;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::unix::io::{AsRawFd, RawFd};
|
||||
use std::os::unix::io::AsRawFd;
|
||||
use std::os::unix::prelude::CommandExt;
|
||||
use std::process::Stdio;
|
||||
use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
|
||||
use std::process::{Child, ChildStdin, ChildStdout, Command};
|
||||
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
|
||||
|
||||
@@ -47,7 +48,6 @@ use crate::metrics::{
|
||||
};
|
||||
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
|
||||
use crate::repository::Key;
|
||||
use crate::task_mgr::BACKGROUND_RUNTIME;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants;
|
||||
@@ -72,8 +72,6 @@ pub(crate) struct BufferTag {
|
||||
|
||||
struct ProcessInput {
|
||||
stdin: ChildStdin,
|
||||
stderr_fd: RawFd,
|
||||
stdout_fd: RawFd,
|
||||
n_requests: usize,
|
||||
}
|
||||
|
||||
@@ -121,6 +119,7 @@ impl PostgresRedoManager {
|
||||
/// The WAL redo is handled by a separate thread, so this just sends a request
|
||||
/// to the thread and waits for response.
|
||||
///
|
||||
/// CANCEL SAFETY: NOT CANCEL SAFE.
|
||||
pub async fn request_redo(
|
||||
&self,
|
||||
key: Key,
|
||||
@@ -153,6 +152,7 @@ impl PostgresRedoManager {
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
)
|
||||
.await
|
||||
};
|
||||
img = Some(result?);
|
||||
|
||||
@@ -173,6 +173,7 @@ impl PostgresRedoManager {
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -194,7 +195,7 @@ impl PostgresRedoManager {
|
||||
/// Process one request for WAL redo using wal-redo postgres
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn apply_batch_postgres(
|
||||
async fn apply_batch_postgres(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
@@ -283,19 +284,20 @@ impl PostgresRedoManager {
|
||||
);
|
||||
// Avoid concurrent callers hitting the same issue.
|
||||
// We can't prevent it from happening because we want to enable parallelism.
|
||||
let mut guard = self.redo_process.write().unwrap();
|
||||
match &*guard {
|
||||
Some(current_field_value) => {
|
||||
if Arc::ptr_eq(current_field_value, &proc) {
|
||||
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
|
||||
*guard = None;
|
||||
{
|
||||
let mut guard = self.redo_process.write().unwrap();
|
||||
match &*guard {
|
||||
Some(current_field_value) => {
|
||||
if Arc::ptr_eq(current_field_value, &proc) {
|
||||
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
|
||||
*guard = None;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// Another thread was faster to observe the error, and already took the process out of rotation.
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// Another thread was faster to observe the error, and already took the process out of rotation.
|
||||
}
|
||||
}
|
||||
drop(guard);
|
||||
// NB: there may still be other concurrent threads using `proc`.
|
||||
// The last one will send SIGKILL when the underlying Arc reaches refcount 0.
|
||||
// NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep
|
||||
@@ -308,7 +310,12 @@ impl PostgresRedoManager {
|
||||
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
|
||||
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
|
||||
// This probably needs revisiting at some later point.
|
||||
let mut wait_done = proc.stderr_logger_task_done.clone();
|
||||
drop(proc);
|
||||
wait_done
|
||||
.wait_for(|v| *v)
|
||||
.await
|
||||
.expect("we use scopeguard to ensure we always send `true` to the channel before dropping the sender");
|
||||
} else if n_attempts != 0 {
|
||||
info!(n_attempts, "retried walredo succeeded");
|
||||
}
|
||||
@@ -619,7 +626,8 @@ struct WalRedoProcess {
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: Mutex<ProcessOutput>,
|
||||
stdin: Mutex<ProcessInput>,
|
||||
stderr: Mutex<ChildStderr>,
|
||||
stderr_logger_cancel: CancellationToken,
|
||||
stderr_logger_task_done: tokio::sync::watch::Receiver<bool>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
@@ -668,7 +676,6 @@ impl WalRedoProcess {
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
|
||||
macro_rules! set_nonblock_or_log_err {
|
||||
($file:ident) => {{
|
||||
let res = set_nonblock($file.as_raw_fd());
|
||||
@@ -682,16 +689,73 @@ impl WalRedoProcess {
|
||||
set_nonblock_or_log_err!(stdout)?;
|
||||
set_nonblock_or_log_err!(stderr)?;
|
||||
|
||||
let mut stderr = tokio::io::unix::AsyncFd::new(stderr).context("AsyncFd::with_interest")?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
let stderr_logger_cancel = CancellationToken::new();
|
||||
let (stderr_logger_task_done_tx, stderr_logger_task_done_rx) =
|
||||
tokio::sync::watch::channel(false);
|
||||
tokio::spawn({
|
||||
let stderr_logger_cancel = stderr_logger_cancel.clone();
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("wal-redo-postgres stderr_logger_task finished");
|
||||
let _ = stderr_logger_task_done_tx.send(true);
|
||||
}
|
||||
debug!("wal-redo-postgres stderr_logger_task started");
|
||||
loop {
|
||||
// NB: we purposefully don't do a select! for the cancellation here.
|
||||
// The cancellation would likely cause us to miss stderr messages.
|
||||
// We can rely on this to return from .await because when we SIGKILL
|
||||
// the child, the writing end of the stderr pipe gets closed.
|
||||
match stderr.readable_mut().await {
|
||||
Ok(mut guard) => {
|
||||
let mut errbuf = [0; 16384];
|
||||
let res = guard.try_io(|fd| {
|
||||
use std::io::Read;
|
||||
fd.get_mut().read(&mut errbuf)
|
||||
});
|
||||
match res {
|
||||
Ok(Ok(0)) => {
|
||||
// it closed the stderr pipe
|
||||
break;
|
||||
}
|
||||
Ok(Ok(n)) => {
|
||||
// The message might not be split correctly into lines here. But this is
|
||||
// good enough, the important thing is to get the message to the log.
|
||||
let output = String::from_utf8_lossy(&errbuf[0..n]).to_string();
|
||||
error!(output, "received output");
|
||||
},
|
||||
Ok(Err(e)) => {
|
||||
error!(error = ?e, "read() error, waiting for cancellation");
|
||||
stderr_logger_cancel.cancelled().await;
|
||||
error!(error = ?e, "read() error, cancellation complete");
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
let _e: tokio::io::unix::TryIoError = e;
|
||||
// the read() returned WouldBlock, that's expected
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!(error = ?e, "read() error, waiting for cancellation");
|
||||
stderr_logger_cancel.cancelled().await;
|
||||
error!(error = ?e, "read() error, cancellation complete");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
|
||||
});
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_id,
|
||||
child: Some(child),
|
||||
stdin: Mutex::new(ProcessInput {
|
||||
stdout_fd: stdout.as_raw_fd(),
|
||||
stderr_fd: stderr.as_raw_fd(),
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
}),
|
||||
@@ -700,7 +764,8 @@ impl WalRedoProcess {
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
}),
|
||||
stderr: Mutex::new(stderr),
|
||||
stderr_logger_cancel,
|
||||
stderr_logger_task_done: stderr_logger_task_done_rx,
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
})
|
||||
@@ -774,19 +839,11 @@ impl WalRedoProcess {
|
||||
let mut proc = { input }; // TODO: remove this legacy rename, but this keeps the patch small.
|
||||
let mut nwrite = 0usize;
|
||||
|
||||
// Prepare for calling poll()
|
||||
let mut pollfds = [
|
||||
PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT),
|
||||
PollFd::new(proc.stderr_fd, PollFlags::POLLIN),
|
||||
PollFd::new(proc.stdout_fd, PollFlags::POLLIN),
|
||||
];
|
||||
let mut stdin_pollfds = [PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT)];
|
||||
|
||||
// We do two things simultaneously: send the old base image and WAL records to
|
||||
// the child process's stdin and forward any logging
|
||||
// information that the child writes to its stderr to the page server's log.
|
||||
while nwrite < writebuf.len() {
|
||||
let n = loop {
|
||||
match nix::poll::poll(&mut pollfds[0..2], wal_redo_timeout.as_millis() as i32) {
|
||||
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
@@ -796,31 +853,8 @@ impl WalRedoProcess {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
}
|
||||
|
||||
// If we have some messages in stderr, forward them to the log.
|
||||
let err_revents = pollfds[1].revents().unwrap();
|
||||
if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
let mut errbuf: [u8; 16384] = [0; 16384];
|
||||
let mut stderr = self.stderr.lock().unwrap();
|
||||
let len = stderr.read(&mut errbuf)?;
|
||||
|
||||
// The message might not be split correctly into lines here. But this is
|
||||
// good enough, the important thing is to get the message to the log.
|
||||
if len > 0 {
|
||||
error!(
|
||||
"wal-redo-postgres: {}",
|
||||
String::from_utf8_lossy(&errbuf[0..len])
|
||||
);
|
||||
|
||||
// To make sure we capture all log from the process if it fails, keep
|
||||
// reading from the stderr, before checking the stdout.
|
||||
continue;
|
||||
}
|
||||
} else if err_revents.contains(PollFlags::POLLHUP) {
|
||||
anyhow::bail!("WAL redo process closed its stderr unexpectedly");
|
||||
}
|
||||
|
||||
// If 'stdin' is writeable, do write.
|
||||
let in_revents = pollfds[0].revents().unwrap();
|
||||
let in_revents = stdin_pollfds[0].revents().unwrap();
|
||||
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
|
||||
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
|
||||
} else if in_revents.contains(PollFlags::POLLHUP) {
|
||||
@@ -845,6 +879,7 @@ impl WalRedoProcess {
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut output = self.stdout.lock().unwrap();
|
||||
let mut stdout_pollfds = [PollFd::new(output.stdout.as_raw_fd(), PollFlags::POLLIN)];
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
@@ -855,7 +890,10 @@ impl WalRedoProcess {
|
||||
// We do two things simultaneously: reading response from stdout
|
||||
// and forward any logging information that the child writes to its stderr to the page server's log.
|
||||
let n = loop {
|
||||
match nix::poll::poll(&mut pollfds[1..3], wal_redo_timeout.as_millis() as i32) {
|
||||
match nix::poll::poll(
|
||||
&mut stdout_pollfds[..],
|
||||
wal_redo_timeout.as_millis() as i32,
|
||||
) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
@@ -865,31 +903,8 @@ impl WalRedoProcess {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
}
|
||||
|
||||
// If we have some messages in stderr, forward them to the log.
|
||||
let err_revents = pollfds[1].revents().unwrap();
|
||||
if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
let mut errbuf: [u8; 16384] = [0; 16384];
|
||||
let mut stderr = self.stderr.lock().unwrap();
|
||||
let len = stderr.read(&mut errbuf)?;
|
||||
|
||||
// The message might not be split correctly into lines here. But this is
|
||||
// good enough, the important thing is to get the message to the log.
|
||||
if len > 0 {
|
||||
error!(
|
||||
"wal-redo-postgres: {}",
|
||||
String::from_utf8_lossy(&errbuf[0..len])
|
||||
);
|
||||
|
||||
// To make sure we capture all log from the process if it fails, keep
|
||||
// reading from the stderr, before checking the stdout.
|
||||
continue;
|
||||
}
|
||||
} else if err_revents.contains(PollFlags::POLLHUP) {
|
||||
anyhow::bail!("WAL redo process closed its stderr unexpectedly");
|
||||
}
|
||||
|
||||
// If we have some data in stdout, read it to the result buffer.
|
||||
let out_revents = pollfds[2].revents().unwrap();
|
||||
let out_revents = stdout_pollfds[0].revents().unwrap();
|
||||
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
|
||||
} else if out_revents.contains(PollFlags::POLLHUP) {
|
||||
@@ -985,6 +1000,8 @@ impl Drop for WalRedoProcess {
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait();
|
||||
self.stderr_logger_cancel.cancel();
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1066,7 +1083,7 @@ impl Drop for NoLeakChild {
|
||||
// Offload the kill+wait of the child process into the background.
|
||||
// If someone stops the runtime, we'll leak the child process.
|
||||
// We can ignore that case because we only stop the runtime on pageserver exit.
|
||||
BACKGROUND_RUNTIME.spawn(async move {
|
||||
tokio::runtime::Handle::current().spawn(async move {
|
||||
tokio::task::spawn_blocking(move || {
|
||||
// Intentionally don't inherit the tracing context from whoever is dropping us.
|
||||
// This thread here is going to outlive our dropper.
|
||||
@@ -1199,6 +1216,22 @@ mod tests {
|
||||
assert_eq!(page, crate::ZERO_PAGE);
|
||||
}
|
||||
|
||||
// Builds a fresh RedoHarness, issues one redo request through its manager
// (key 0, Lsn::INVALID, no base image, the records from short_records()),
// and requires the request to come back as an error (`unwrap_err`).
// NOTE(review): the trailing `16` is presumably the Postgres version passed
// to the redo process -- confirm against `request_redo`'s signature.
#[tokio::test]
|
||||
async fn test_stderr() {
|
||||
let h = RedoHarness::new().unwrap();
|
||||
h
|
||||
.manager
|
||||
.request_redo(
|
||||
Key::from_i128(0),
|
||||
Lsn::INVALID,
|
||||
None,
|
||||
short_records(),
|
||||
16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
}
|
||||
|
||||
#[allow(clippy::octal_escapes)]
|
||||
fn short_records() -> Vec<(Lsn, NeonWalRecord)> {
|
||||
vec![
|
||||
@@ -1227,6 +1260,8 @@ mod tests {
|
||||
|
||||
impl RedoHarness {
|
||||
fn new() -> anyhow::Result<Self> {
|
||||
crate::tenant::harness::setup_logging();
|
||||
|
||||
let repo_dir = camino_tempfile::tempdir()?;
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
|
||||
@@ -23,6 +23,23 @@ EXTENSION = neon
|
||||
DATA = neon--1.0.sql
|
||||
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
||||
|
||||
EXTRA_CLEAN = \
|
||||
libwalproposer.a
|
||||
|
||||
WALPROP_OBJS = \
|
||||
$(WIN32RES) \
|
||||
walproposer.o \
|
||||
neon_utils.o \
|
||||
walproposer_compat.o
|
||||
|
||||
# `walproposer-lib` is a convenience target: it adds -DWALPROPOSER_LIB to
# CPPFLAGS (a target-specific variable, so it is also in effect while this
# target's prerequisites are built) and then builds libwalproposer.a.
.PHONY: walproposer-lib
|
||||
walproposer-lib: CPPFLAGS += -DWALPROPOSER_LIB
|
||||
walproposer-lib: libwalproposer.a;
|
||||
|
||||
# The archive is declared phony, so its recipe runs every time it is requested:
# any existing archive is removed and a new one is created from all
# prerequisites ($^, i.e. $(WALPROP_OBJS)) using $(AR) $(AROPT).
.PHONY: libwalproposer.a
|
||||
libwalproposer.a: $(WALPROP_OBJS)
|
||||
rm -f $@
|
||||
$(AR) $(AROPT) $@ $^
|
||||
|
||||
PG_CONFIG = pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
|
||||
@@ -79,7 +79,7 @@ static int CompareLsn(const void *a, const void *b);
|
||||
static char *FormatSafekeeperState(SafekeeperState state);
|
||||
static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
|
||||
static uint32 SafekeeperStateDesiredEvents(SafekeeperState state);
|
||||
static char *FormatEvents(uint32 events);
|
||||
static char *FormatEvents(WalProposer *wp, uint32 events);
|
||||
|
||||
WalProposer *
|
||||
WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
@@ -98,7 +98,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
port = strchr(host, ':');
|
||||
if (port == NULL)
|
||||
{
|
||||
elog(FATAL, "port is not specified");
|
||||
walprop_log(FATAL, "port is not specified");
|
||||
}
|
||||
*port++ = '\0';
|
||||
sep = strchr(port, ',');
|
||||
@@ -106,12 +106,11 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
*sep++ = '\0';
|
||||
if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS)
|
||||
{
|
||||
elog(FATAL, "Too many safekeepers");
|
||||
walprop_log(FATAL, "Too many safekeepers");
|
||||
}
|
||||
wp->safekeeper[wp->n_safekeepers].host = host;
|
||||
wp->safekeeper[wp->n_safekeepers].port = port;
|
||||
wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE;
|
||||
wp->safekeeper[wp->n_safekeepers].conn = NULL;
|
||||
wp->safekeeper[wp->n_safekeepers].wp = wp;
|
||||
|
||||
{
|
||||
@@ -122,13 +121,11 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
"host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
|
||||
sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant);
|
||||
if (written > MAXCONNINFO || written < 0)
|
||||
elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
|
||||
walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
|
||||
}
|
||||
|
||||
initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf);
|
||||
wp->safekeeper[wp->n_safekeepers].xlogreader = wp->api.wal_reader_allocate();
|
||||
if (wp->safekeeper[wp->n_safekeepers].xlogreader == NULL)
|
||||
elog(FATAL, "Failed to allocate xlog reader");
|
||||
wp->api.wal_reader_allocate(&wp->safekeeper[wp->n_safekeepers]);
|
||||
wp->safekeeper[wp->n_safekeepers].flushWrite = false;
|
||||
wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
|
||||
wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr;
|
||||
@@ -136,7 +133,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
}
|
||||
if (wp->n_safekeepers < 1)
|
||||
{
|
||||
elog(FATAL, "Safekeepers addresses are not specified");
|
||||
walprop_log(FATAL, "Safekeepers addresses are not specified");
|
||||
}
|
||||
wp->quorum = wp->n_safekeepers / 2 + 1;
|
||||
|
||||
@@ -144,27 +141,47 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
wp->greetRequest.tag = 'g';
|
||||
wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION;
|
||||
wp->greetRequest.pgVersion = PG_VERSION_NUM;
|
||||
wp->api.strong_random(&wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
|
||||
wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
|
||||
wp->greetRequest.systemId = wp->config->systemId;
|
||||
if (!wp->config->neon_timeline)
|
||||
elog(FATAL, "neon.timeline_id is not provided");
|
||||
walprop_log(FATAL, "neon.timeline_id is not provided");
|
||||
if (*wp->config->neon_timeline != '\0' &&
|
||||
!HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16))
|
||||
elog(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline);
|
||||
walprop_log(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline);
|
||||
if (!wp->config->neon_tenant)
|
||||
elog(FATAL, "neon.tenant_id is not provided");
|
||||
walprop_log(FATAL, "neon.tenant_id is not provided");
|
||||
if (*wp->config->neon_tenant != '\0' &&
|
||||
!HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16))
|
||||
elog(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant);
|
||||
walprop_log(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant);
|
||||
|
||||
wp->greetRequest.timeline = wp->api.get_timeline_id();
|
||||
wp->greetRequest.timeline = wp->config->pgTimeline;
|
||||
wp->greetRequest.walSegSize = wp->config->wal_segment_size;
|
||||
|
||||
wp->api.init_event_set(wp->n_safekeepers);
|
||||
wp->api.init_event_set(wp);
|
||||
|
||||
return wp;
|
||||
}
|
||||
|
||||
/*
 * Release the memory held by a WalProposer.
 *
 * For each of the wp->n_safekeepers safekeepers this frees the outbuf data
 * buffer (asserted to be non-NULL) and, when present, the term history
 * entries of its vote response. It then frees the proposer's own term
 * history entries, if any, and finally the WalProposer struct itself, so
 * `wp` must not be used after this call.
 *
 * NOTE(review): nothing here closes safekeeper connections or releases the
 * event set -- presumably the caller handles that; confirm.
 */
void
|
||||
WalProposerFree(WalProposer *wp)
|
||||
{
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
Safekeeper *sk = &wp->safekeeper[i];
|
||||
|
||||
Assert(sk->outbuf.data != NULL);
|
||||
pfree(sk->outbuf.data);
|
||||
if (sk->voteResponse.termHistory.entries)
|
||||
pfree(sk->voteResponse.termHistory.entries);
|
||||
sk->voteResponse.termHistory.entries = NULL; /* reset unconditionally so no stale pointer remains */
|
||||
}
|
||||
if (wp->propTermHistory.entries != NULL)
|
||||
pfree(wp->propTermHistory.entries);
|
||||
wp->propTermHistory.entries = NULL; /* reset unconditionally, as above */
|
||||
|
||||
pfree(wp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Create new AppendRequest message and start sending it. This function is
|
||||
* called from walsender every time the new WAL is available.
|
||||
@@ -190,10 +207,10 @@ WalProposerPoll(WalProposer *wp)
|
||||
Safekeeper *sk = NULL;
|
||||
int rc = 0;
|
||||
uint32 events = 0;
|
||||
TimestampTz now = wp->api.get_current_timestamp();
|
||||
TimestampTz now = wp->api.get_current_timestamp(wp);
|
||||
long timeout = TimeToReconnect(wp, now);
|
||||
|
||||
rc = wp->api.wait_event_set(timeout, &sk, &events);
|
||||
rc = wp->api.wait_event_set(wp, timeout, &sk, &events);
|
||||
|
||||
/* Exit loop if latch is set (we got new WAL) */
|
||||
if ((rc == 1 && events & WL_LATCH_SET))
|
||||
@@ -224,14 +241,14 @@ WalProposerPoll(WalProposer *wp)
|
||||
*/
|
||||
if (!wp->config->syncSafekeepers)
|
||||
{
|
||||
XLogRecPtr flushed = wp->api.get_flush_rec_ptr();
|
||||
XLogRecPtr flushed = wp->api.get_flush_rec_ptr(wp);
|
||||
|
||||
if (flushed > wp->availableLsn)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
now = wp->api.get_current_timestamp();
|
||||
now = wp->api.get_current_timestamp(wp);
|
||||
/* timeout expired: poll state */
|
||||
if (rc == 0 || TimeToReconnect(wp, now) <= 0)
|
||||
{
|
||||
@@ -249,7 +266,7 @@ WalProposerPoll(WalProposer *wp)
|
||||
/*
|
||||
* Abandon connection attempts which take too long.
|
||||
*/
|
||||
now = wp->api.get_current_timestamp();
|
||||
now = wp->api.get_current_timestamp(wp);
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
Safekeeper *sk = &wp->safekeeper[i];
|
||||
@@ -257,7 +274,7 @@ WalProposerPoll(WalProposer *wp)
|
||||
if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
|
||||
wp->config->safekeeper_connection_timeout))
|
||||
{
|
||||
elog(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
|
||||
walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
|
||||
ShutdownConnection(sk);
|
||||
}
|
||||
@@ -296,10 +313,10 @@ HackyRemoveWalProposerEvent(Safekeeper *to_remove)
|
||||
{
|
||||
WalProposer *wp = to_remove->wp;
|
||||
|
||||
/* Remove the existing event set */
|
||||
wp->api.free_event_set();
|
||||
/* Remove the existing event set, assign sk->eventPos = -1 */
|
||||
wp->api.free_event_set(wp);
|
||||
/* Re-initialize it without adding any safekeeper events */
|
||||
wp->api.init_event_set(wp->n_safekeepers);
|
||||
wp->api.init_event_set(wp);
|
||||
|
||||
/*
|
||||
* loop through the existing safekeepers. If they aren't the one we're
|
||||
@@ -311,13 +328,11 @@ HackyRemoveWalProposerEvent(Safekeeper *to_remove)
|
||||
uint32 desired_events = WL_NO_EVENTS;
|
||||
Safekeeper *sk = &wp->safekeeper[i];
|
||||
|
||||
sk->eventPos = -1;
|
||||
|
||||
if (sk == to_remove)
|
||||
continue;
|
||||
|
||||
/* If this safekeeper isn't offline, add an event for it! */
|
||||
if (sk->conn != NULL)
|
||||
if (sk->state != SS_OFFLINE)
|
||||
{
|
||||
desired_events = SafekeeperStateDesiredEvents(sk->state);
|
||||
/* will set sk->eventPos */
|
||||
@@ -330,9 +345,7 @@ HackyRemoveWalProposerEvent(Safekeeper *to_remove)
|
||||
static void
|
||||
ShutdownConnection(Safekeeper *sk)
|
||||
{
|
||||
if (sk->conn)
|
||||
sk->wp->api.conn_finish(sk->conn);
|
||||
sk->conn = NULL;
|
||||
sk->wp->api.conn_finish(sk);
|
||||
sk->state = SS_OFFLINE;
|
||||
sk->flushWrite = false;
|
||||
sk->streamingAt = InvalidXLogRecPtr;
|
||||
@@ -361,23 +374,16 @@ ResetConnection(Safekeeper *sk)
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to establish new connection
|
||||
* Try to establish new connection, it will update sk->conn.
|
||||
*/
|
||||
sk->conn = wp->api.conn_connect_start((char *) &sk->conninfo);
|
||||
|
||||
/*
|
||||
* "If the result is null, then libpq has been unable to allocate a new
|
||||
* PGconn structure"
|
||||
*/
|
||||
if (!sk->conn)
|
||||
elog(FATAL, "failed to allocate new PGconn object");
|
||||
wp->api.conn_connect_start(sk);
|
||||
|
||||
/*
|
||||
* PQconnectStart won't actually start connecting until we run
|
||||
* PQconnectPoll. Before we do that though, we need to check that it
|
||||
* didn't immediately fail.
|
||||
*/
|
||||
if (wp->api.conn_status(sk->conn) == WP_CONNECTION_BAD)
|
||||
if (wp->api.conn_status(sk) == WP_CONNECTION_BAD)
|
||||
{
|
||||
/*---
|
||||
* According to libpq docs:
|
||||
@@ -388,15 +394,14 @@ ResetConnection(Safekeeper *sk)
|
||||
*
|
||||
* https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
|
||||
*/
|
||||
elog(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk->conn));
|
||||
walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
|
||||
/*
|
||||
* Even though the connection failed, we still need to clean up the
|
||||
* object
|
||||
*/
|
||||
wp->api.conn_finish(sk->conn);
|
||||
sk->conn = NULL;
|
||||
wp->api.conn_finish(sk);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -413,10 +418,10 @@ ResetConnection(Safekeeper *sk)
|
||||
* (see libpqrcv_connect, defined in
|
||||
* src/backend/replication/libpqwalreceiver/libpqwalreceiver.c)
|
||||
*/
|
||||
elog(LOG, "connecting with node %s:%s", sk->host, sk->port);
|
||||
walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port);
|
||||
|
||||
sk->state = SS_CONNECTING_WRITE;
|
||||
sk->latestMsgReceivedAt = wp->api.get_current_timestamp();
|
||||
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
|
||||
|
||||
wp->api.add_safekeeper_event_set(sk, WL_SOCKET_WRITEABLE);
|
||||
return;
|
||||
@@ -447,7 +452,7 @@ TimeToReconnect(WalProposer *wp, TimestampTz now)
|
||||
static void
|
||||
ReconnectSafekeepers(WalProposer *wp)
|
||||
{
|
||||
TimestampTz now = wp->api.get_current_timestamp();
|
||||
TimestampTz now = wp->api.get_current_timestamp(wp);
|
||||
|
||||
if (TimeToReconnect(wp, now) == 0)
|
||||
{
|
||||
@@ -467,6 +472,8 @@ ReconnectSafekeepers(WalProposer *wp)
|
||||
static void
|
||||
AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
/*
|
||||
* Sanity check. We assume further down that the operations don't block
|
||||
* because the socket is ready.
|
||||
@@ -481,7 +488,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
* ResetConnection
|
||||
*/
|
||||
case SS_OFFLINE:
|
||||
elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
|
||||
walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
|
||||
sk->host, sk->port);
|
||||
break; /* actually unreachable, but prevents
|
||||
* -Wimplicit-fallthrough */
|
||||
@@ -517,7 +524,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
* requests.
|
||||
*/
|
||||
case SS_VOTING:
|
||||
elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state));
|
||||
ResetConnection(sk);
|
||||
return;
|
||||
@@ -546,7 +553,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
* Idle state for waiting votes from quorum.
|
||||
*/
|
||||
case SS_IDLE:
|
||||
elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state));
|
||||
ResetConnection(sk);
|
||||
return;
|
||||
@@ -564,7 +571,7 @@ static void
|
||||
HandleConnectionEvent(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
WalProposerConnectPollStatusType result = wp->api.conn_connect_poll(sk->conn);
|
||||
WalProposerConnectPollStatusType result = wp->api.conn_connect_poll(sk);
|
||||
|
||||
/* The new set of events we'll wait on, after updating */
|
||||
uint32 new_events = WL_NO_EVENTS;
|
||||
@@ -572,9 +579,9 @@ HandleConnectionEvent(Safekeeper *sk)
|
||||
switch (result)
|
||||
{
|
||||
case WP_CONN_POLLING_OK:
|
||||
elog(LOG, "connected with node %s:%s", sk->host,
|
||||
walprop_log(LOG, "connected with node %s:%s", sk->host,
|
||||
sk->port);
|
||||
sk->latestMsgReceivedAt = wp->api.get_current_timestamp();
|
||||
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
|
||||
|
||||
/*
|
||||
* We have to pick some event to update event set. We'll
|
||||
@@ -596,8 +603,8 @@ HandleConnectionEvent(Safekeeper *sk)
|
||||
break;
|
||||
|
||||
case WP_CONN_POLLING_FAILED:
|
||||
elog(WARNING, "failed to connect to node '%s:%s': %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk->conn));
|
||||
walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
|
||||
/*
|
||||
* If connecting failed, we don't want to restart the connection
|
||||
@@ -631,10 +638,10 @@ SendStartWALPush(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
if (!wp->api.conn_send_query(sk->conn, "START_WAL_PUSH"))
|
||||
if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
|
||||
{
|
||||
elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk->conn));
|
||||
walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
}
|
||||
@@ -647,7 +654,7 @@ RecvStartWALPushResult(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
switch (wp->api.conn_get_query_result(sk->conn))
|
||||
switch (wp->api.conn_get_query_result(sk))
|
||||
{
|
||||
/*
|
||||
* Successful result, move on to starting the handshake
|
||||
@@ -670,8 +677,8 @@ RecvStartWALPushResult(Safekeeper *sk)
|
||||
break;
|
||||
|
||||
case WP_EXEC_FAILED:
|
||||
elog(WARNING, "Failed to send query to safekeeper %s:%s: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk->conn));
|
||||
walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
|
||||
@@ -681,7 +688,7 @@ RecvStartWALPushResult(Safekeeper *sk)
|
||||
* wrong"
|
||||
*/
|
||||
case WP_EXEC_UNEXPECTED_SUCCESS:
|
||||
elog(WARNING, "Received bad response from safekeeper %s:%s query execution",
|
||||
walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
|
||||
sk->host, sk->port);
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
@@ -717,7 +724,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
|
||||
return;
|
||||
|
||||
elog(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
|
||||
walprop_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
|
||||
|
||||
/* Protocol is all good, move to voting. */
|
||||
sk->state = SS_VOTING;
|
||||
@@ -737,7 +744,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
if (wp->n_connected == wp->quorum)
|
||||
{
|
||||
wp->propTerm++;
|
||||
elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
|
||||
walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
|
||||
|
||||
wp->voteRequest = (VoteRequest)
|
||||
{
|
||||
@@ -750,7 +757,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
else if (sk->greetResponse.term > wp->propTerm)
|
||||
{
|
||||
/* Another compute with higher term is running. */
|
||||
elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
|
||||
walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
|
||||
sk->host, sk->port,
|
||||
sk->greetResponse.term, wp->propTerm);
|
||||
}
|
||||
@@ -792,7 +799,7 @@ SendVoteRequest(Safekeeper *sk)
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
/* We have quorum for voting, send our vote request */
|
||||
elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
|
||||
walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
|
||||
/* On failure, logging & resetting is handled */
|
||||
if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT))
|
||||
return;
|
||||
@@ -809,7 +816,7 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse))
|
||||
return;
|
||||
|
||||
elog(LOG,
|
||||
walprop_log(LOG,
|
||||
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
|
||||
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
|
||||
@@ -824,7 +831,7 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
if ((!sk->voteResponse.voteGiven) &&
|
||||
(sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
|
||||
{
|
||||
elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
|
||||
walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
|
||||
sk->host, sk->port,
|
||||
sk->voteResponse.term, wp->propTerm);
|
||||
}
|
||||
@@ -861,49 +868,27 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
static void
|
||||
HandleElectedProposer(WalProposer *wp)
|
||||
{
|
||||
FILE* f;
|
||||
XLogRecPtr lrRestartLsn;
|
||||
|
||||
DetermineEpochStartLsn(wp);
|
||||
|
||||
/*
|
||||
* If there are active logical replication subscriptions, we need
|
||||
* to provide enough WAL for their WAL senders based on the position
|
||||
* of their replication slots.
|
||||
*/
|
||||
f = fopen("restart.lsn", "rb");
|
||||
if (f != NULL && !wp->config->syncSafekeepers)
|
||||
{
|
||||
fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
|
||||
fclose(f);
|
||||
if (lrRestartLsn != InvalidXLogRecPtr)
|
||||
{
|
||||
elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
|
||||
/* start from the beginning of the segment to fetch page headers verified by XLogReader */
|
||||
lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
|
||||
wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if not all safekeepers are up-to-date, we need to download WAL
|
||||
* needed to synchronize them
|
||||
*/
|
||||
if (wp->truncateLsn < wp->propEpochStartLsn)
|
||||
{
|
||||
elog(LOG,
|
||||
walprop_log(LOG,
|
||||
"start recovery because truncateLsn=%X/%X is not "
|
||||
"equal to epochStartLsn=%X/%X",
|
||||
LSN_FORMAT_ARGS(wp->truncateLsn),
|
||||
LSN_FORMAT_ARGS(wp->propEpochStartLsn));
|
||||
/* Perform recovery */
|
||||
if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn))
|
||||
elog(FATAL, "Failed to recover state");
|
||||
walprop_log(FATAL, "Failed to recover state");
|
||||
}
|
||||
else if (wp->config->syncSafekeepers)
|
||||
{
|
||||
/* Sync is not needed: just exit */
|
||||
wp->api.finish_sync_safekeepers(wp->propEpochStartLsn);
|
||||
wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn);
|
||||
/* unreachable */
|
||||
}
|
||||
|
||||
@@ -1004,7 +989,7 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
if (wp->timelineStartLsn != InvalidXLogRecPtr &&
|
||||
wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
|
||||
{
|
||||
elog(WARNING,
|
||||
walprop_log(WARNING,
|
||||
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
|
||||
LSN_FORMAT_ARGS(wp->timelineStartLsn),
|
||||
LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
|
||||
@@ -1020,12 +1005,12 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
*/
|
||||
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
|
||||
{
|
||||
wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn();
|
||||
wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp);
|
||||
if (wp->timelineStartLsn == InvalidXLogRecPtr)
|
||||
{
|
||||
wp->timelineStartLsn = wp->api.get_redo_start_lsn();
|
||||
wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp);
|
||||
}
|
||||
elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
|
||||
walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1052,7 +1037,7 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
|
||||
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;
|
||||
|
||||
elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
|
||||
walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
|
||||
wp->quorum,
|
||||
wp->propTerm,
|
||||
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
|
||||
@@ -1066,7 +1051,7 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
*/
|
||||
if (!wp->config->syncSafekeepers)
|
||||
{
|
||||
WalproposerShmemState *walprop_shared = wp->api.get_shmem_state();
|
||||
WalproposerShmemState *walprop_shared = wp->api.get_shmem_state(wp);
|
||||
|
||||
/*
|
||||
* Basebackup LSN always points to the beginning of the record (not
|
||||
@@ -1074,7 +1059,7 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
* Safekeepers don't skip header as they need a continuous stream of
|
||||
* data, so correct LSN for comparison.
|
||||
*/
|
||||
if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn())
|
||||
if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
|
||||
{
|
||||
/*
|
||||
* However, allow to proceed if previously elected leader was me;
|
||||
@@ -1084,14 +1069,21 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
|
||||
walprop_shared->mineLastElectedTerm)))
|
||||
{
|
||||
elog(PANIC,
|
||||
walprop_log(PANIC,
|
||||
"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
|
||||
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
|
||||
LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn()));
|
||||
LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
|
||||
}
|
||||
}
|
||||
walprop_shared->mineLastElectedTerm = wp->propTerm;
|
||||
}
|
||||
|
||||
/*
|
||||
* WalProposer has just elected itself and initialized history, so
|
||||
* we can call election callback. Usually it updates truncateLsn to
|
||||
* fetch WAL for logical replication.
|
||||
*/
|
||||
wp->api.after_election(wp);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1162,7 +1154,7 @@ SendProposerElected(Safekeeper *sk)
|
||||
*/
|
||||
sk->startStreamingAt = wp->truncateLsn;
|
||||
|
||||
elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
|
||||
walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
|
||||
sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
|
||||
LSN_FORMAT_ARGS(sk->startStreamingAt));
|
||||
}
|
||||
@@ -1197,7 +1189,7 @@ SendProposerElected(Safekeeper *sk)
|
||||
msg.timelineStartLsn = wp->timelineStartLsn;
|
||||
|
||||
lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
|
||||
elog(LOG,
|
||||
walprop_log(LOG,
|
||||
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
|
||||
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
|
||||
|
||||
@@ -1362,13 +1354,12 @@ SendAppendRequests(Safekeeper *sk)
|
||||
req = &sk->appendRequest;
|
||||
PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);
|
||||
|
||||
ereport(DEBUG2,
|
||||
(errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
|
||||
walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
|
||||
req->endLsn - req->beginLsn,
|
||||
LSN_FORMAT_ARGS(req->beginLsn),
|
||||
LSN_FORMAT_ARGS(req->endLsn),
|
||||
LSN_FORMAT_ARGS(req->commitLsn),
|
||||
LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port)));
|
||||
LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
|
||||
|
||||
resetStringInfo(&sk->outbuf);
|
||||
|
||||
@@ -1378,13 +1369,13 @@ SendAppendRequests(Safekeeper *sk)
|
||||
/* write the WAL itself */
|
||||
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
|
||||
/* wal_read will raise error on failure */
|
||||
wp->api.wal_read(sk->xlogreader,
|
||||
wp->api.wal_read(sk,
|
||||
&sk->outbuf.data[sk->outbuf.len],
|
||||
req->beginLsn,
|
||||
req->endLsn - req->beginLsn);
|
||||
sk->outbuf.len += req->endLsn - req->beginLsn;
|
||||
|
||||
writeResult = wp->api.conn_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len);
|
||||
writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);
|
||||
|
||||
/* Mark current message as sent, whatever the result is */
|
||||
sk->streamingAt = endLsn;
|
||||
@@ -1406,9 +1397,9 @@ SendAppendRequests(Safekeeper *sk)
|
||||
return true;
|
||||
|
||||
case PG_ASYNC_WRITE_FAIL:
|
||||
elog(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
wp->api.conn_error_message(sk->conn));
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
@@ -1446,17 +1437,16 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse))
|
||||
break;
|
||||
|
||||
ereport(DEBUG2,
|
||||
(errmsg("received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
|
||||
walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
|
||||
sk->appendResponse.term,
|
||||
LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
|
||||
LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
|
||||
sk->host, sk->port)));
|
||||
sk->host, sk->port);
|
||||
|
||||
if (sk->appendResponse.term > wp->propTerm)
|
||||
{
|
||||
/* Another compute with higher term is running. */
|
||||
elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
|
||||
walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
|
||||
sk->host, sk->port,
|
||||
sk->appendResponse.term, wp->propTerm);
|
||||
}
|
||||
@@ -1484,7 +1474,7 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
|
||||
/* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */
|
||||
void
|
||||
ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)
|
||||
ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf)
|
||||
{
|
||||
uint8 nkeys;
|
||||
int i;
|
||||
@@ -1502,7 +1492,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->currentClusterSize = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
|
||||
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
|
||||
rf->currentClusterSize);
|
||||
}
|
||||
else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
|
||||
@@ -1510,7 +1500,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->last_received_lsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
|
||||
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->last_received_lsn));
|
||||
}
|
||||
else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
|
||||
@@ -1518,7 +1508,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
|
||||
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
|
||||
}
|
||||
else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
|
||||
@@ -1526,7 +1516,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
|
||||
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
|
||||
}
|
||||
else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
|
||||
@@ -1539,7 +1529,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)
|
||||
|
||||
/* Copy because timestamptz_to_str returns a static buffer */
|
||||
replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
|
||||
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
|
||||
rf->replytime, replyTimeStr);
|
||||
|
||||
pfree(replyTimeStr);
|
||||
@@ -1554,7 +1544,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)
|
||||
* Skip unknown keys to support backward compatible protocol
|
||||
* changes
|
||||
*/
|
||||
elog(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
|
||||
walprop_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
|
||||
pq_getmsgbytes(reply_message, len);
|
||||
};
|
||||
}
|
||||
@@ -1637,7 +1627,7 @@ HandleSafekeeperResponse(WalProposer *wp)
|
||||
* Advance the replication slot to free up old WAL files. Note that
|
||||
* slot doesn't exist if we are in syncSafekeepers mode.
|
||||
*/
|
||||
wp->api.confirm_wal_streamed(wp->truncateLsn);
|
||||
wp->api.confirm_wal_streamed(wp, wp->truncateLsn);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1684,7 +1674,7 @@ HandleSafekeeperResponse(WalProposer *wp)
|
||||
*/
|
||||
BroadcastAppendRequest(wp);
|
||||
|
||||
wp->api.finish_sync_safekeepers(wp->propEpochStartLsn);
|
||||
wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn);
|
||||
/* unreachable */
|
||||
}
|
||||
}
|
||||
@@ -1699,7 +1689,7 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
switch (wp->api.conn_async_read(sk->conn, buf, buf_size))
|
||||
switch (wp->api.conn_async_read(sk, buf, buf_size))
|
||||
{
|
||||
case PG_ASYNC_READ_SUCCESS:
|
||||
return true;
|
||||
@@ -1709,9 +1699,9 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
|
||||
return false;
|
||||
|
||||
case PG_ASYNC_READ_FAIL:
|
||||
elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
|
||||
walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state),
|
||||
wp->api.conn_error_message(sk->conn));
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
}
|
||||
@@ -1749,12 +1739,12 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
|
||||
tag = pq_getmsgint64_le(&s);
|
||||
if (tag != anymsg->tag)
|
||||
{
|
||||
elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
}
|
||||
sk->latestMsgReceivedAt = wp->api.get_current_timestamp();
|
||||
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
|
||||
switch (tag)
|
||||
{
|
||||
case 'g':
|
||||
@@ -1798,7 +1788,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
|
||||
msg->hs.xmin.value = pq_getmsgint64_le(&s);
|
||||
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
|
||||
if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE)
|
||||
ParsePageserverFeedbackMessage(&s, &msg->rf);
|
||||
ParsePageserverFeedbackMessage(wp, &s, &msg->rf);
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
@@ -1823,11 +1813,11 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
|
||||
WalProposer *wp = sk->wp;
|
||||
uint32 events;
|
||||
|
||||
if (!wp->api.conn_blocking_write(sk->conn, msg, msg_size))
|
||||
if (!wp->api.conn_blocking_write(sk, msg, msg_size))
|
||||
{
|
||||
elog(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
wp->api.conn_error_message(sk->conn));
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
}
|
||||
@@ -1857,7 +1847,7 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
switch (wp->api.conn_async_write(sk->conn, msg, msg_size))
|
||||
switch (wp->api.conn_async_write(sk, msg, msg_size))
|
||||
{
|
||||
case PG_ASYNC_WRITE_SUCCESS:
|
||||
return true;
|
||||
@@ -1872,9 +1862,9 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
|
||||
wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
|
||||
return false;
|
||||
case PG_ASYNC_WRITE_FAIL:
|
||||
elog(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
wp->api.conn_error_message(sk->conn));
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
@@ -1902,7 +1892,7 @@ AsyncFlush(Safekeeper *sk)
|
||||
* 1 if unable to send everything yet [call PQflush again]
|
||||
* -1 if it failed [emit an error]
|
||||
*/
|
||||
switch (wp->api.conn_flush(sk->conn))
|
||||
switch (wp->api.conn_flush(sk))
|
||||
{
|
||||
case 0:
|
||||
/* flush is done */
|
||||
@@ -1911,9 +1901,9 @@ AsyncFlush(Safekeeper *sk)
|
||||
/* Nothing to do; try again when the socket's ready */
|
||||
return false;
|
||||
case -1:
|
||||
elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
|
||||
walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
wp->api.conn_error_message(sk->conn));
|
||||
wp->api.conn_error_message(sk));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
@@ -1942,11 +1932,11 @@ CompareLsn(const void *a, const void *b)
|
||||
*
|
||||
* The strings are intended to be used as a prefix to "state", e.g.:
|
||||
*
|
||||
* elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
|
||||
* walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
|
||||
*
|
||||
* If this sort of phrasing doesn't fit the message, instead use something like:
|
||||
*
|
||||
* elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
|
||||
* walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
|
||||
*/
|
||||
static char *
|
||||
FormatSafekeeperState(SafekeeperState state)
|
||||
@@ -1994,6 +1984,7 @@ FormatSafekeeperState(SafekeeperState state)
|
||||
static void
|
||||
AssertEventsOkForState(uint32 events, Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
uint32 expected = SafekeeperStateDesiredEvents(sk->state);
|
||||
|
||||
/*
|
||||
@@ -2016,8 +2007,8 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
|
||||
* To give a descriptive message in the case of failure, we use elog
|
||||
* and then an assertion that's guaranteed to fail.
|
||||
*/
|
||||
elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
|
||||
FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state));
|
||||
walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
|
||||
FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
|
||||
Assert(events_ok_for_state);
|
||||
}
|
||||
}
|
||||
@@ -2090,7 +2081,7 @@ SafekeeperStateDesiredEvents(SafekeeperState state)
|
||||
* The string should not be freed. It should also not be expected to remain the same between
|
||||
* function calls. */
|
||||
static char *
|
||||
FormatEvents(uint32 events)
|
||||
FormatEvents(WalProposer *wp, uint32 events)
|
||||
{
|
||||
static char return_str[8];
|
||||
|
||||
@@ -2119,7 +2110,7 @@ FormatEvents(uint32 events)
|
||||
|
||||
if (events & (~all_flags))
|
||||
{
|
||||
elog(WARNING, "Event formatting found unexpected component %d",
|
||||
walprop_log(WARNING, "Event formatting found unexpected component %d",
|
||||
events & (~all_flags));
|
||||
return_str[6] = '*';
|
||||
return_str[7] = '\0';
|
||||
|
||||
@@ -333,24 +333,11 @@ typedef struct Safekeeper
|
||||
*/
|
||||
char conninfo[MAXCONNINFO];
|
||||
|
||||
/*
|
||||
* postgres protocol connection to the WAL acceptor
|
||||
*
|
||||
* Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we
|
||||
* reach SS_ACTIVE; not before.
|
||||
*/
|
||||
WalProposerConn *conn;
|
||||
|
||||
/*
|
||||
* Temporary buffer for the message being sent to the safekeeper.
|
||||
*/
|
||||
StringInfoData outbuf;
|
||||
|
||||
/*
|
||||
* WAL reader, allocated for each safekeeper.
|
||||
*/
|
||||
XLogReaderState *xlogreader;
|
||||
|
||||
/*
|
||||
* Streaming will start here; must be record boundary.
|
||||
*/
|
||||
@@ -361,13 +348,43 @@ typedef struct Safekeeper
|
||||
XLogRecPtr streamingAt; /* current streaming position */
|
||||
AppendRequestHeader appendRequest; /* request for sending to safekeeper */
|
||||
|
||||
int eventPos; /* position in wait event set. Equal to -1 if*
|
||||
* no event */
|
||||
SafekeeperState state; /* safekeeper state machine state */
|
||||
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
|
||||
AcceptorGreeting greetResponse; /* acceptor greeting */
|
||||
VoteResponse voteResponse; /* the vote */
|
||||
AppendResponse appendResponse; /* feedback for master */
|
||||
|
||||
|
||||
/* postgres-specific fields */
|
||||
#ifndef WALPROPOSER_LIB
|
||||
/*
|
||||
* postgres protocol connection to the WAL acceptor
|
||||
*
|
||||
* Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we
|
||||
* reach SS_ACTIVE; not before.
|
||||
*/
|
||||
WalProposerConn *conn;
|
||||
|
||||
/*
|
||||
* WAL reader, allocated for each safekeeper.
|
||||
*/
|
||||
XLogReaderState *xlogreader;
|
||||
|
||||
/*
|
||||
* Position in wait event set. Equal to -1 if no event
|
||||
*/
|
||||
int eventPos;
|
||||
#endif
|
||||
|
||||
|
||||
/* WalProposer library specifics */
|
||||
#ifdef WALPROPOSER_LIB
|
||||
/*
|
||||
* Buffer for incoming messages. Usually Rust vector is stored here.
|
||||
* Caller is responsible for freeing the buffer.
|
||||
*/
|
||||
StringInfoData inbuf;
|
||||
#endif
|
||||
} Safekeeper;
|
||||
|
||||
/* Re-exported PostgresPollingStatusType */
|
||||
@@ -433,7 +450,7 @@ typedef struct walproposer_api
|
||||
* Get WalproposerShmemState. This is used to store information about last
|
||||
* elected term.
|
||||
*/
|
||||
WalproposerShmemState *(*get_shmem_state) (void);
|
||||
WalproposerShmemState *(*get_shmem_state) (WalProposer *wp);
|
||||
|
||||
/*
|
||||
* Start receiving notifications about new WAL. This is an infinite loop
|
||||
@@ -443,61 +460,63 @@ typedef struct walproposer_api
|
||||
void (*start_streaming) (WalProposer *wp, XLogRecPtr startpos);
|
||||
|
||||
/* Get pointer to the latest available WAL. */
|
||||
XLogRecPtr (*get_flush_rec_ptr) (void);
|
||||
XLogRecPtr (*get_flush_rec_ptr) (WalProposer *wp);
|
||||
|
||||
/* Get current time. */
|
||||
TimestampTz (*get_current_timestamp) (void);
|
||||
|
||||
/* Get postgres timeline. */
|
||||
TimeLineID (*get_timeline_id) (void);
|
||||
TimestampTz (*get_current_timestamp) (WalProposer *wp);
|
||||
|
||||
/* Current error message, aka PQerrorMessage. */
|
||||
char *(*conn_error_message) (WalProposerConn *conn);
|
||||
char *(*conn_error_message) (Safekeeper *sk);
|
||||
|
||||
/* Connection status, aka PQstatus. */
|
||||
WalProposerConnStatusType (*conn_status) (WalProposerConn *conn);
|
||||
WalProposerConnStatusType (*conn_status) (Safekeeper *sk);
|
||||
|
||||
/* Start the connection, aka PQconnectStart. */
|
||||
WalProposerConn *(*conn_connect_start) (char *conninfo);
|
||||
void (*conn_connect_start) (Safekeeper *sk);
|
||||
|
||||
/* Poll an asynchronous connection, aka PQconnectPoll. */
|
||||
WalProposerConnectPollStatusType (*conn_connect_poll) (WalProposerConn *conn);
|
||||
WalProposerConnectPollStatusType (*conn_connect_poll) (Safekeeper *sk);
|
||||
|
||||
/* Send a blocking SQL query, aka PQsendQuery. */
|
||||
bool (*conn_send_query) (WalProposerConn *conn, char *query);
|
||||
bool (*conn_send_query) (Safekeeper *sk, char *query);
|
||||
|
||||
/* Read the query result, aka PQgetResult. */
|
||||
WalProposerExecStatusType (*conn_get_query_result) (WalProposerConn *conn);
|
||||
WalProposerExecStatusType (*conn_get_query_result) (Safekeeper *sk);
|
||||
|
||||
/* Flush buffer to the network, aka PQflush. */
|
||||
int (*conn_flush) (WalProposerConn *conn);
|
||||
int (*conn_flush) (Safekeeper *sk);
|
||||
|
||||
/* Close the connection, aka PQfinish. */
|
||||
void (*conn_finish) (WalProposerConn *conn);
|
||||
void (*conn_finish) (Safekeeper *sk);
|
||||
|
||||
/* Try to read CopyData message, aka PQgetCopyData. */
|
||||
PGAsyncReadResult (*conn_async_read) (WalProposerConn *conn, char **buf, int *amount);
|
||||
/*
|
||||
* Try to read CopyData message from the safekeeper, aka PQgetCopyData.
|
||||
*
|
||||
* On success, the data is placed in *buf. It is valid until the next call
|
||||
* to this function.
|
||||
*/
|
||||
PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount);
|
||||
|
||||
/* Try to write CopyData message, aka PQputCopyData. */
|
||||
PGAsyncWriteResult (*conn_async_write) (WalProposerConn *conn, void const *buf, size_t size);
|
||||
PGAsyncWriteResult (*conn_async_write) (Safekeeper *sk, void const *buf, size_t size);
|
||||
|
||||
/* Blocking CopyData write, aka PQputCopyData + PQflush. */
|
||||
bool (*conn_blocking_write) (WalProposerConn *conn, void const *buf, size_t size);
|
||||
bool (*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size);
|
||||
|
||||
/* Download WAL from startpos to endpos and make it available locally. */
|
||||
bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
|
||||
|
||||
/* Read WAL from disk to buf. */
|
||||
void (*wal_read) (XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count);
|
||||
void (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);
|
||||
|
||||
/* Allocate WAL reader. */
|
||||
XLogReaderState *(*wal_reader_allocate) (void);
|
||||
void (*wal_reader_allocate) (Safekeeper *sk);
|
||||
|
||||
/* Deallocate event set. */
|
||||
void (*free_event_set) (void);
|
||||
void (*free_event_set) (WalProposer *wp);
|
||||
|
||||
/* Initialize event set. */
|
||||
void (*init_event_set) (int n_safekeepers);
|
||||
void (*init_event_set) (WalProposer *wp);
|
||||
|
||||
/* Update events for an existing safekeeper connection. */
|
||||
void (*update_event_set) (Safekeeper *sk, uint32 events);
|
||||
@@ -513,22 +532,22 @@ typedef struct walproposer_api
|
||||
* events mask to indicate events and sets sk to the safekeeper which has
|
||||
* an event.
|
||||
*/
|
||||
int (*wait_event_set) (long timeout, Safekeeper **sk, uint32 *events);
|
||||
int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events);
|
||||
|
||||
/* Read random bytes. */
|
||||
bool (*strong_random) (void *buf, size_t len);
|
||||
bool (*strong_random) (WalProposer *wp, void *buf, size_t len);
|
||||
|
||||
/*
|
||||
* Get a basebackup LSN. Used to cross-validate with the latest available
|
||||
* LSN on the safekeepers.
|
||||
*/
|
||||
XLogRecPtr (*get_redo_start_lsn) (void);
|
||||
XLogRecPtr (*get_redo_start_lsn) (WalProposer *wp);
|
||||
|
||||
/*
|
||||
* Finish sync safekeepers with the given LSN. This function should not
|
||||
* return and should exit the program.
|
||||
*/
|
||||
void (*finish_sync_safekeepers) (XLogRecPtr lsn);
|
||||
void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn);
|
||||
|
||||
/*
|
||||
* Called after every new message from the safekeeper. Used to propagate
|
||||
@@ -541,7 +560,22 @@ typedef struct walproposer_api
|
||||
* Called on peer_horizon_lsn updates. Used to advance replication slot
|
||||
* and to free up disk space by deleting unnecessary WAL.
|
||||
*/
|
||||
void (*confirm_wal_streamed) (XLogRecPtr lsn);
|
||||
void (*confirm_wal_streamed) (WalProposer *wp, XLogRecPtr lsn);
|
||||
|
||||
/*
|
||||
* Write a log message to the internal log processor. This is used only
|
||||
* when walproposer is compiled as a library. Otherwise, all logging is
|
||||
* handled by elog().
|
||||
*/
|
||||
void (*log_internal) (WalProposer *wp, int level, const char *line);
|
||||
|
||||
/*
|
||||
* Called right after the proposer was elected, but before it started
|
||||
* recovery and sent ProposerElected message to the safekeepers.
|
||||
*
|
||||
* Used by logical replication to update truncateLsn.
|
||||
*/
|
||||
void (*after_election) (WalProposer *wp);
|
||||
} walproposer_api;
|
||||
|
||||
/*
|
||||
@@ -590,6 +624,13 @@ typedef struct WalProposerConfig
|
||||
|
||||
/* Will be passed to safekeepers in greet request. */
|
||||
uint64 systemId;
|
||||
|
||||
/* Will be passed to safekeepers in greet request. */
|
||||
TimeLineID pgTimeline;
|
||||
|
||||
#ifdef WALPROPOSER_LIB
|
||||
void *callback_data;
|
||||
#endif
|
||||
} WalProposerConfig;
|
||||
|
||||
|
||||
@@ -666,7 +707,16 @@ extern WalProposer *WalProposerCreate(WalProposerConfig *config, walproposer_api
|
||||
extern void WalProposerStart(WalProposer *wp);
|
||||
extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPtr endpos);
|
||||
extern void WalProposerPoll(WalProposer *wp);
|
||||
extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
|
||||
PageserverFeedback *rf);
|
||||
extern void WalProposerFree(WalProposer *wp);
|
||||
|
||||
|
||||
#define WPEVENT 1337 /* special log level for walproposer internal events */
|
||||
|
||||
#ifdef WALPROPOSER_LIB
|
||||
void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...);
|
||||
#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
|
||||
#else
|
||||
#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
|
||||
#endif
|
||||
|
||||
#endif /* __NEON_WALPROPOSER_H__ */
|
||||
|
||||
192
pgxn/neon/walproposer_compat.c
Normal file
192
pgxn/neon/walproposer_compat.c
Normal file
@@ -0,0 +1,192 @@
|
||||
/*
|
||||
* Contains copied/adapted functions from libpq and some internal postgres functions.
|
||||
* This is needed to avoid linking to full postgres server installation. This file
|
||||
* is compiled as a part of libwalproposer static library.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "walproposer.h"
|
||||
#include "utils/datetime.h"
|
||||
#include "miscadmin.h"
|
||||
|
||||
/*
 * ExceptionalCondition - report a failed internal check and terminate.
 *
 * Writes the failed condition and its source location (file and line) to
 * stderr, followed by an "aborting..." line, then ends the process with
 * exit status 1.  This function never returns to its caller.
 */
void
ExceptionalCondition(const char *conditionName,
					 const char *fileName, int lineNumber)
{
	fprintf(stderr, "ExceptionalCondition: %s:%d: %s\n",
			fileName, lineNumber, conditionName);
	fputs("aborting...\n", stderr);

	exit(1);
}
|
||||
|
||||
void
|
||||
pq_copymsgbytes(StringInfo msg, char *buf, int datalen)
|
||||
{
|
||||
if (datalen < 0 || datalen > (msg->len - msg->cursor))
|
||||
ExceptionalCondition("insufficient data left in message", __FILE__, __LINE__);
|
||||
memcpy(buf, &msg->data[msg->cursor], datalen);
|
||||
msg->cursor += datalen;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint - get a binary integer from a message buffer
|
||||
*
|
||||
* Values are treated as unsigned.
|
||||
* --------------------------------
|
||||
*/
|
||||
unsigned int
|
||||
pq_getmsgint(StringInfo msg, int b)
|
||||
{
|
||||
unsigned int result;
|
||||
unsigned char n8;
|
||||
uint16 n16;
|
||||
uint32 n32;
|
||||
|
||||
switch (b)
|
||||
{
|
||||
case 1:
|
||||
pq_copymsgbytes(msg, (char *) &n8, 1);
|
||||
result = n8;
|
||||
break;
|
||||
case 2:
|
||||
pq_copymsgbytes(msg, (char *) &n16, 2);
|
||||
result = pg_ntoh16(n16);
|
||||
break;
|
||||
case 4:
|
||||
pq_copymsgbytes(msg, (char *) &n32, 4);
|
||||
result = pg_ntoh32(n32);
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "unsupported integer size %d\n", b);
|
||||
ExceptionalCondition("unsupported integer size", __FILE__, __LINE__);
|
||||
result = 0; /* keep compiler quiet */
|
||||
break;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint64 - get a binary 8-byte int from a message buffer
|
||||
*
|
||||
* It is tempting to merge this with pq_getmsgint, but we'd have to make the
|
||||
* result int64 for all data widths --- that could be a big performance
|
||||
* hit on machines where int64 isn't efficient.
|
||||
* --------------------------------
|
||||
*/
|
||||
int64
|
||||
pq_getmsgint64(StringInfo msg)
|
||||
{
|
||||
uint64 n64;
|
||||
|
||||
pq_copymsgbytes(msg, (char *) &n64, sizeof(n64));
|
||||
|
||||
return pg_ntoh64(n64);
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgbyte - get a raw byte from a message buffer
|
||||
* --------------------------------
|
||||
*/
|
||||
int
|
||||
pq_getmsgbyte(StringInfo msg)
|
||||
{
|
||||
if (msg->cursor >= msg->len)
|
||||
ExceptionalCondition("no data left in message", __FILE__, __LINE__);
|
||||
return (unsigned char) msg->data[msg->cursor++];
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgbytes - get raw data from a message buffer
|
||||
*
|
||||
* Returns a pointer directly into the message buffer; note this
|
||||
* may not have any particular alignment.
|
||||
* --------------------------------
|
||||
*/
|
||||
const char *
|
||||
pq_getmsgbytes(StringInfo msg, int datalen)
|
||||
{
|
||||
const char *result;
|
||||
|
||||
if (datalen < 0 || datalen > (msg->len - msg->cursor))
|
||||
ExceptionalCondition("insufficient data left in message", __FILE__, __LINE__);
|
||||
result = &msg->data[msg->cursor];
|
||||
msg->cursor += datalen;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgstring - get a null-terminated text string (with conversion)
|
||||
*
|
||||
* May return a pointer directly into the message buffer, or a pointer
|
||||
* to a palloc'd conversion result.
|
||||
* --------------------------------
|
||||
*/
|
||||
const char *
|
||||
pq_getmsgstring(StringInfo msg)
|
||||
{
|
||||
char *str;
|
||||
int slen;
|
||||
|
||||
str = &msg->data[msg->cursor];
|
||||
|
||||
/*
|
||||
* It's safe to use strlen() here because a StringInfo is guaranteed to
|
||||
* have a trailing null byte. But check we found a null inside the
|
||||
* message.
|
||||
*/
|
||||
slen = strlen(str);
|
||||
if (msg->cursor + slen >= msg->len)
|
||||
ExceptionalCondition("invalid string in message", __FILE__, __LINE__);
|
||||
msg->cursor += slen + 1;
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgend - verify message fully consumed
|
||||
* --------------------------------
|
||||
*/
|
||||
void
|
||||
pq_getmsgend(StringInfo msg)
|
||||
{
|
||||
if (msg->cursor != msg->len)
|
||||
ExceptionalCondition("invalid msg format", __FILE__, __LINE__);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Produce a C-string representation of a TimestampTz.
|
||||
*
|
||||
* This is mostly for use in emitting messages.
|
||||
*/
|
||||
const char *
|
||||
timestamptz_to_str(TimestampTz t)
|
||||
{
|
||||
static char buf[MAXDATELEN + 1];
|
||||
|
||||
snprintf(buf, sizeof(buf), "TimestampTz(%ld)", t);
|
||||
return buf;
|
||||
}
|
||||
|
||||
bool
|
||||
TimestampDifferenceExceeds(TimestampTz start_time,
|
||||
TimestampTz stop_time,
|
||||
int msec)
|
||||
{
|
||||
TimestampTz diff = stop_time - start_time;
|
||||
return (diff >= msec * INT64CONST(1000));
|
||||
}
|
||||
|
||||
void
|
||||
WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...)
|
||||
{
|
||||
char buf[1024];
|
||||
va_list args;
|
||||
|
||||
fmt = _(fmt);
|
||||
|
||||
va_start(args, fmt);
|
||||
vsnprintf(buf, sizeof(buf), fmt, args);
|
||||
va_end(args);
|
||||
|
||||
wp->api.log_internal(wp, elevel, buf);
|
||||
}
|
||||
@@ -73,7 +73,8 @@ static void walprop_register_bgworker(void);
|
||||
static void walprop_pg_init_standalone_sync_safekeepers(void);
|
||||
static void walprop_pg_init_walsender(void);
|
||||
static void walprop_pg_init_bgworker(void);
|
||||
static TimestampTz walprop_pg_get_current_timestamp(void);
|
||||
static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp);
|
||||
static TimeLineID walprop_pg_get_timeline_id(void);
|
||||
static void walprop_pg_load_libpqwalreceiver(void);
|
||||
|
||||
static process_interrupts_callback_t PrevProcessInterruptsCallback;
|
||||
@@ -104,6 +105,7 @@ init_walprop_config(bool syncSafekeepers)
|
||||
walprop_config.systemId = GetSystemIdentifier();
|
||||
else
|
||||
walprop_config.systemId = 0;
|
||||
walprop_config.pgTimeline = walprop_pg_get_timeline_id();
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -136,7 +138,7 @@ WalProposerMain(Datum main_arg)
|
||||
walprop_pg_load_libpqwalreceiver();
|
||||
|
||||
wp = WalProposerCreate(&walprop_config, walprop_pg);
|
||||
wp->last_reconnect_attempt = walprop_pg_get_current_timestamp();
|
||||
wp->last_reconnect_attempt = walprop_pg_get_current_timestamp(wp);
|
||||
|
||||
walprop_pg_init_walsender();
|
||||
WalProposerStart(wp);
|
||||
@@ -379,7 +381,7 @@ nwp_shmem_startup_hook(void)
|
||||
}
|
||||
|
||||
static WalproposerShmemState *
|
||||
walprop_pg_get_shmem_state(void)
|
||||
walprop_pg_get_shmem_state(WalProposer *wp)
|
||||
{
|
||||
Assert(walprop_shared != NULL);
|
||||
return walprop_shared;
|
||||
@@ -505,7 +507,7 @@ walprop_pg_init_bgworker(void)
|
||||
}
|
||||
|
||||
static XLogRecPtr
|
||||
walprop_pg_get_flush_rec_ptr(void)
|
||||
walprop_pg_get_flush_rec_ptr(WalProposer *wp)
|
||||
{
|
||||
#if PG_MAJORVERSION_NUM < 15
|
||||
return GetFlushRecPtr();
|
||||
@@ -515,7 +517,7 @@ walprop_pg_get_flush_rec_ptr(void)
|
||||
}
|
||||
|
||||
static TimestampTz
|
||||
walprop_pg_get_current_timestamp(void)
|
||||
walprop_pg_get_current_timestamp(WalProposer *wp)
|
||||
{
|
||||
return GetCurrentTimestamp();
|
||||
}
|
||||
@@ -565,15 +567,15 @@ ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
|
||||
|
||||
/* Exported function definitions */
|
||||
static char *
|
||||
walprop_error_message(WalProposerConn *conn)
|
||||
walprop_error_message(Safekeeper *sk)
|
||||
{
|
||||
return PQerrorMessage(conn->pg_conn);
|
||||
return PQerrorMessage(sk->conn->pg_conn);
|
||||
}
|
||||
|
||||
static WalProposerConnStatusType
|
||||
walprop_status(WalProposerConn *conn)
|
||||
walprop_status(Safekeeper *sk)
|
||||
{
|
||||
switch (PQstatus(conn->pg_conn))
|
||||
switch (PQstatus(sk->conn->pg_conn))
|
||||
{
|
||||
case CONNECTION_OK:
|
||||
return WP_CONNECTION_OK;
|
||||
@@ -584,16 +586,17 @@ walprop_status(WalProposerConn *conn)
|
||||
}
|
||||
}
|
||||
|
||||
static WalProposerConn *
|
||||
walprop_connect_start(char *conninfo)
|
||||
static void
|
||||
walprop_connect_start(Safekeeper *sk)
|
||||
{
|
||||
WalProposerConn *conn;
|
||||
PGconn *pg_conn;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
char *password = neon_auth_token;
|
||||
|
||||
Assert(sk->conn == NULL);
|
||||
|
||||
/*
|
||||
* Connect using the given connection string. If the NEON_AUTH_TOKEN
|
||||
* environment variable was set, use that as the password.
|
||||
@@ -611,7 +614,7 @@ walprop_connect_start(char *conninfo)
|
||||
n++;
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
values[n] = conninfo;
|
||||
values[n] = sk->conninfo;
|
||||
n++;
|
||||
keywords[n] = NULL;
|
||||
values[n] = NULL;
|
||||
@@ -619,11 +622,11 @@ walprop_connect_start(char *conninfo)
|
||||
pg_conn = PQconnectStartParams(keywords, values, 1);
|
||||
|
||||
/*
|
||||
* Allocation of a PQconn can fail, and will return NULL. We want to fully
|
||||
* replicate the behavior of PQconnectStart here.
|
||||
* "If the result is null, then libpq has been unable to allocate a new
|
||||
* PGconn structure"
|
||||
*/
|
||||
if (!pg_conn)
|
||||
return NULL;
|
||||
elog(FATAL, "failed to allocate new PGconn object");
|
||||
|
||||
/*
|
||||
* And in theory this allocation can fail as well, but it's incredibly
|
||||
@@ -632,20 +635,19 @@ walprop_connect_start(char *conninfo)
|
||||
* palloc will exit on failure though, so there's not much we could do if
|
||||
* it *did* fail.
|
||||
*/
|
||||
conn = palloc(sizeof(WalProposerConn));
|
||||
conn->pg_conn = pg_conn;
|
||||
conn->is_nonblocking = false; /* connections always start in blocking
|
||||
sk->conn = palloc(sizeof(WalProposerConn));
|
||||
sk->conn->pg_conn = pg_conn;
|
||||
sk->conn->is_nonblocking = false; /* connections always start in blocking
|
||||
* mode */
|
||||
conn->recvbuf = NULL;
|
||||
return conn;
|
||||
sk->conn->recvbuf = NULL;
|
||||
}
|
||||
|
||||
static WalProposerConnectPollStatusType
|
||||
walprop_connect_poll(WalProposerConn *conn)
|
||||
walprop_connect_poll(Safekeeper *sk)
|
||||
{
|
||||
WalProposerConnectPollStatusType return_val;
|
||||
|
||||
switch (PQconnectPoll(conn->pg_conn))
|
||||
switch (PQconnectPoll(sk->conn->pg_conn))
|
||||
{
|
||||
case PGRES_POLLING_FAILED:
|
||||
return_val = WP_CONN_POLLING_FAILED;
|
||||
@@ -682,24 +684,24 @@ walprop_connect_poll(WalProposerConn *conn)
|
||||
}
|
||||
|
||||
static bool
|
||||
walprop_send_query(WalProposerConn *conn, char *query)
|
||||
walprop_send_query(Safekeeper *sk, char *query)
|
||||
{
|
||||
/*
|
||||
* We need to be in blocking mode for sending the query to run without
|
||||
* requiring a call to PQflush
|
||||
*/
|
||||
if (!ensure_nonblocking_status(conn, false))
|
||||
if (!ensure_nonblocking_status(sk->conn, false))
|
||||
return false;
|
||||
|
||||
/* PQsendQuery returns 1 on success, 0 on failure */
|
||||
if (!PQsendQuery(conn->pg_conn, query))
|
||||
if (!PQsendQuery(sk->conn->pg_conn, query))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static WalProposerExecStatusType
|
||||
walprop_get_query_result(WalProposerConn *conn)
|
||||
walprop_get_query_result(Safekeeper *sk)
|
||||
{
|
||||
PGresult *result;
|
||||
WalProposerExecStatusType return_val;
|
||||
@@ -708,14 +710,14 @@ walprop_get_query_result(WalProposerConn *conn)
|
||||
char *unexpected_success = NULL;
|
||||
|
||||
/* Consume any input that we might be missing */
|
||||
if (!PQconsumeInput(conn->pg_conn))
|
||||
if (!PQconsumeInput(sk->conn->pg_conn))
|
||||
return WP_EXEC_FAILED;
|
||||
|
||||
if (PQisBusy(conn->pg_conn))
|
||||
if (PQisBusy(sk->conn->pg_conn))
|
||||
return WP_EXEC_NEEDS_INPUT;
|
||||
|
||||
|
||||
result = PQgetResult(conn->pg_conn);
|
||||
result = PQgetResult(sk->conn->pg_conn);
|
||||
|
||||
/*
|
||||
* PQgetResult returns NULL only if getting the result was successful &
|
||||
@@ -777,24 +779,28 @@ walprop_get_query_result(WalProposerConn *conn)
|
||||
}
|
||||
|
||||
static pgsocket
|
||||
walprop_socket(WalProposerConn *conn)
|
||||
walprop_socket(Safekeeper *sk)
|
||||
{
|
||||
return PQsocket(conn->pg_conn);
|
||||
return PQsocket(sk->conn->pg_conn);
|
||||
}
|
||||
|
||||
static int
|
||||
walprop_flush(WalProposerConn *conn)
|
||||
walprop_flush(Safekeeper *sk)
|
||||
{
|
||||
return (PQflush(conn->pg_conn));
|
||||
return (PQflush(sk->conn->pg_conn));
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_finish(WalProposerConn *conn)
|
||||
walprop_finish(Safekeeper *sk)
|
||||
{
|
||||
if (conn->recvbuf != NULL)
|
||||
PQfreemem(conn->recvbuf);
|
||||
PQfinish(conn->pg_conn);
|
||||
pfree(conn);
|
||||
if (!sk->conn)
|
||||
return;
|
||||
|
||||
if (sk->conn->recvbuf != NULL)
|
||||
PQfreemem(sk->conn->recvbuf);
|
||||
PQfinish(sk->conn->pg_conn);
|
||||
pfree(sk->conn);
|
||||
sk->conn = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -804,18 +810,18 @@ walprop_finish(WalProposerConn *conn)
|
||||
* to this function.
|
||||
*/
|
||||
static PGAsyncReadResult
|
||||
walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
walprop_async_read(Safekeeper *sk, char **buf, int *amount)
|
||||
{
|
||||
int result;
|
||||
|
||||
if (conn->recvbuf != NULL)
|
||||
if (sk->conn->recvbuf != NULL)
|
||||
{
|
||||
PQfreemem(conn->recvbuf);
|
||||
conn->recvbuf = NULL;
|
||||
PQfreemem(sk->conn->recvbuf);
|
||||
sk->conn->recvbuf = NULL;
|
||||
}
|
||||
|
||||
/* Call PQconsumeInput so that we have the data we need */
|
||||
if (!PQconsumeInput(conn->pg_conn))
|
||||
if (!PQconsumeInput(sk->conn->pg_conn))
|
||||
{
|
||||
*amount = 0;
|
||||
*buf = NULL;
|
||||
@@ -833,7 +839,7 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
* sometimes be triggered by the server returning an ErrorResponse (which
|
||||
* also happens to have the effect that the copy is done).
|
||||
*/
|
||||
switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
|
||||
switch (result = PQgetCopyData(sk->conn->pg_conn, &sk->conn->recvbuf, true))
|
||||
{
|
||||
case 0:
|
||||
*amount = 0;
|
||||
@@ -848,7 +854,7 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
* We can check PQgetResult to make sure that the server
|
||||
* failed; it'll always result in PGRES_FATAL_ERROR
|
||||
*/
|
||||
ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
|
||||
ExecStatusType status = PQresultStatus(PQgetResult(sk->conn->pg_conn));
|
||||
|
||||
if (status != PGRES_FATAL_ERROR)
|
||||
elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
|
||||
@@ -869,18 +875,18 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
default:
|
||||
/* Positive values indicate the size of the returned result */
|
||||
*amount = result;
|
||||
*buf = conn->recvbuf;
|
||||
*buf = sk->conn->recvbuf;
|
||||
return PG_ASYNC_READ_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
static PGAsyncWriteResult
|
||||
walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
|
||||
walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
|
||||
{
|
||||
int result;
|
||||
|
||||
/* If we aren't in non-blocking mode, switch to it. */
|
||||
if (!ensure_nonblocking_status(conn, true))
|
||||
if (!ensure_nonblocking_status(sk->conn, true))
|
||||
return PG_ASYNC_WRITE_FAIL;
|
||||
|
||||
/*
|
||||
@@ -888,7 +894,7 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
|
||||
* queued, 0 if it was not queued because of full buffers, or -1 if an
|
||||
* error occurred
|
||||
*/
|
||||
result = PQputCopyData(conn->pg_conn, buf, size);
|
||||
result = PQputCopyData(sk->conn->pg_conn, buf, size);
|
||||
|
||||
/*
|
||||
* We won't get a result of zero because walproposer always empties the
|
||||
@@ -916,7 +922,7 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
|
||||
* successful, 1 if it was unable to send all the data in the send queue
|
||||
* yet -1 if it failed for some reason
|
||||
*/
|
||||
switch (result = PQflush(conn->pg_conn))
|
||||
switch (result = PQflush(sk->conn->pg_conn))
|
||||
{
|
||||
case 0:
|
||||
return PG_ASYNC_WRITE_SUCCESS;
|
||||
@@ -934,22 +940,22 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
|
||||
* information, refer to the comments there.
|
||||
*/
|
||||
static bool
|
||||
walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
|
||||
walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size)
|
||||
{
|
||||
int result;
|
||||
|
||||
/* If we are in non-blocking mode, switch out of it. */
|
||||
if (!ensure_nonblocking_status(conn, false))
|
||||
if (!ensure_nonblocking_status(sk->conn, false))
|
||||
return false;
|
||||
|
||||
if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
|
||||
if ((result = PQputCopyData(sk->conn->pg_conn, buf, size)) == -1)
|
||||
return false;
|
||||
|
||||
Assert(result == 1);
|
||||
|
||||
/* Because the connection is non-blocking, flushing returns 0 or -1 */
|
||||
|
||||
if ((result = PQflush(conn->pg_conn)) == -1)
|
||||
if ((result = PQflush(sk->conn->pg_conn)) == -1)
|
||||
return false;
|
||||
|
||||
Assert(result == 0);
|
||||
@@ -1381,11 +1387,11 @@ XLogWalPropClose(XLogRecPtr recptr)
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_wal_read(XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count)
|
||||
walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count)
|
||||
{
|
||||
WALReadError errinfo;
|
||||
|
||||
if (!WALRead(state,
|
||||
if (!WALRead(sk->xlogreader,
|
||||
buf,
|
||||
startptr,
|
||||
count,
|
||||
@@ -1396,31 +1402,38 @@ walprop_pg_wal_read(XLogReaderState *state, char *buf, XLogRecPtr startptr, Size
|
||||
}
|
||||
}
|
||||
|
||||
static XLogReaderState *
|
||||
walprop_pg_wal_reader_allocate(void)
|
||||
static void
|
||||
walprop_pg_wal_reader_allocate(Safekeeper *sk)
|
||||
{
|
||||
return XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
|
||||
sk->xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
|
||||
if (sk->xlogreader == NULL)
|
||||
elog(FATAL, "Failed to allocate xlog reader");
|
||||
}
|
||||
|
||||
static WaitEventSet *waitEvents;
|
||||
|
||||
static void
|
||||
walprop_pg_free_event_set(void)
|
||||
walprop_pg_free_event_set(WalProposer *wp)
|
||||
{
|
||||
if (waitEvents)
|
||||
{
|
||||
FreeWaitEventSet(waitEvents);
|
||||
waitEvents = NULL;
|
||||
}
|
||||
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
wp->safekeeper[i].eventPos = -1;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_init_event_set(int n_safekeepers)
|
||||
walprop_pg_init_event_set(WalProposer *wp)
|
||||
{
|
||||
if (waitEvents)
|
||||
elog(FATAL, "double-initialization of event set");
|
||||
|
||||
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers);
|
||||
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + wp->n_safekeepers);
|
||||
AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
@@ -1439,11 +1452,11 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events)
|
||||
static void
|
||||
walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk->conn), NULL, sk);
|
||||
sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
|
||||
}
|
||||
|
||||
static int
|
||||
walprop_pg_wait_event_set(long timeout, Safekeeper **sk, uint32 *events)
|
||||
walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events)
|
||||
{
|
||||
WaitEvent event = {0};
|
||||
int rc = 0;
|
||||
@@ -1499,7 +1512,7 @@ walprop_pg_wait_event_set(long timeout, Safekeeper **sk, uint32 *events)
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_finish_sync_safekeepers(XLogRecPtr lsn)
|
||||
walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
|
||||
{
|
||||
fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(lsn));
|
||||
exit(0);
|
||||
@@ -1611,7 +1624,7 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
* pageserver.
|
||||
*/
|
||||
quorumFeedback.rf.disk_consistent_lsn,
|
||||
walprop_pg_get_current_timestamp(), false);
|
||||
walprop_pg_get_current_timestamp(wp), false);
|
||||
}
|
||||
|
||||
CombineHotStanbyFeedbacks(&hsFeedback, wp);
|
||||
@@ -1628,18 +1641,65 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_confirm_wal_streamed(XLogRecPtr lsn)
|
||||
walprop_pg_confirm_wal_streamed(WalProposer *wp, XLogRecPtr lsn)
|
||||
{
|
||||
if (MyReplicationSlot)
|
||||
PhysicalConfirmReceivedLocation(lsn);
|
||||
}
|
||||
|
||||
static XLogRecPtr
|
||||
walprop_pg_get_redo_start_lsn(WalProposer *wp)
|
||||
{
|
||||
return GetRedoStartLsn();
|
||||
}
|
||||
|
||||
static bool
|
||||
walprop_pg_strong_random(WalProposer *wp, void *buf, size_t len)
|
||||
{
|
||||
return pg_strong_random(buf, len);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
|
||||
{
|
||||
elog(FATAL, "unexpected log_internal message at level %d: %s", level, line);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_after_election(WalProposer *wp)
|
||||
{
|
||||
FILE* f;
|
||||
XLogRecPtr lrRestartLsn;
|
||||
|
||||
/* We don't need to do anything in syncSafekeepers mode.*/
|
||||
if (wp->config->syncSafekeepers)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If there are active logical replication subscriptions we need
|
||||
* to provide enough WAL for their WAL senders based on the position
|
||||
* of their replication slots.
|
||||
*/
|
||||
f = fopen("restart.lsn", "rb");
|
||||
if (f != NULL && !wp->config->syncSafekeepers)
|
||||
{
|
||||
fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
|
||||
fclose(f);
|
||||
if (lrRestartLsn != InvalidXLogRecPtr)
|
||||
{
|
||||
elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
|
||||
/* start from the beginning of the segment to fetch page headers verified by XLogReader */
|
||||
lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
|
||||
wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const walproposer_api walprop_pg = {
|
||||
.get_shmem_state = walprop_pg_get_shmem_state,
|
||||
.start_streaming = walprop_pg_start_streaming,
|
||||
.get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr,
|
||||
.get_current_timestamp = walprop_pg_get_current_timestamp,
|
||||
.get_timeline_id = walprop_pg_get_timeline_id,
|
||||
.conn_error_message = walprop_error_message,
|
||||
.conn_status = walprop_status,
|
||||
.conn_connect_start = walprop_connect_start,
|
||||
@@ -1659,9 +1719,11 @@ static const walproposer_api walprop_pg = {
|
||||
.update_event_set = walprop_pg_update_event_set,
|
||||
.add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set,
|
||||
.wait_event_set = walprop_pg_wait_event_set,
|
||||
.strong_random = pg_strong_random,
|
||||
.get_redo_start_lsn = GetRedoStartLsn,
|
||||
.strong_random = walprop_pg_strong_random,
|
||||
.get_redo_start_lsn = walprop_pg_get_redo_start_lsn,
|
||||
.finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers,
|
||||
.process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback,
|
||||
.confirm_wal_streamed = walprop_pg_confirm_wal_streamed,
|
||||
.log_internal = walprop_pg_log_internal,
|
||||
.after_election = walprop_pg_after_election,
|
||||
};
|
||||
|
||||
@@ -6,6 +6,7 @@ pub use link::LinkAuthError;
|
||||
|
||||
use crate::{
|
||||
auth::{self, ClientCredentials},
|
||||
config::AuthenticationConfig,
|
||||
console::{
|
||||
self,
|
||||
provider::{CachedNodeInfo, ConsoleReqExtra},
|
||||
@@ -124,6 +125,7 @@ async fn auth_quirks(
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
// If there's no project so far, that entails that client doesn't
|
||||
// support SNI or other means of passing the endpoint (project) name.
|
||||
@@ -145,7 +147,7 @@ async fn auth_quirks(
|
||||
}
|
||||
|
||||
// Finally, proceed with the main auth flow (SCRAM-based).
|
||||
classic::authenticate(api, extra, creds, client).await
|
||||
classic::authenticate(api, extra, creds, client, config).await
|
||||
}
|
||||
|
||||
impl BackendType<'_, ClientCredentials<'_>> {
|
||||
@@ -180,6 +182,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
use BackendType::*;
|
||||
|
||||
@@ -192,7 +195,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
);
|
||||
|
||||
let api = api.as_ref();
|
||||
auth_quirks(api, extra, creds, client, allow_cleartext).await?
|
||||
auth_quirks(api, extra, creds, client, allow_cleartext, config).await?
|
||||
}
|
||||
Postgres(api, creds) => {
|
||||
info!(
|
||||
@@ -202,7 +205,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
);
|
||||
|
||||
let api = api.as_ref();
|
||||
auth_quirks(api, extra, creds, client, allow_cleartext).await?
|
||||
auth_quirks(api, extra, creds, client, allow_cleartext, config).await?
|
||||
}
|
||||
// NOTE: this auth backend doesn't use client credentials.
|
||||
Link(url) => {
|
||||
|
||||
@@ -4,6 +4,7 @@ use super::AuthSuccess;
|
||||
use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
compute,
|
||||
config::AuthenticationConfig,
|
||||
console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
|
||||
proxy::{handle_try_wake, retry_after},
|
||||
sasl, scram,
|
||||
@@ -17,6 +18,7 @@ pub(super) async fn authenticate(
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials<'_>,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
info!("fetching user's authentication info");
|
||||
let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
|
||||
@@ -42,7 +44,16 @@ pub(super) async fn authenticate(
|
||||
error
|
||||
})?;
|
||||
|
||||
let auth_outcome = auth_flow.authenticate().await.map_err(|error| {
|
||||
let auth_outcome = tokio::time::timeout(
|
||||
config.scram_protocol_timeout,
|
||||
auth_flow.authenticate(),
|
||||
)
|
||||
.await
|
||||
.map_err(|error| {
|
||||
warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs());
|
||||
auth::io::Error::new(auth::io::ErrorKind::TimedOut, error)
|
||||
})?
|
||||
.map_err(|error| {
|
||||
warn!(?error, "error processing scram messages");
|
||||
error
|
||||
})?;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use futures::future::Either;
|
||||
use proxy::auth;
|
||||
use proxy::config::AuthenticationConfig;
|
||||
use proxy::config::HttpConfig;
|
||||
use proxy::console;
|
||||
use proxy::http;
|
||||
@@ -83,7 +84,9 @@ struct ProxyCliArgs {
|
||||
/// timeout for http connections
|
||||
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
|
||||
sql_over_http_timeout: tokio::time::Duration,
|
||||
|
||||
/// timeout for scram authentication protocol
|
||||
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
|
||||
scram_protocol_timeout: tokio::time::Duration,
|
||||
/// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
require_client_ip: bool,
|
||||
@@ -231,12 +234,16 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
let http_config = HttpConfig {
|
||||
sql_over_http_timeout: args.sql_over_http_timeout,
|
||||
};
|
||||
let authentication_config = AuthenticationConfig {
|
||||
scram_protocol_timeout: args.scram_protocol_timeout,
|
||||
};
|
||||
let config = Box::leak(Box::new(ProxyConfig {
|
||||
tls_config,
|
||||
auth_backend,
|
||||
metric_collection,
|
||||
allow_self_signed_compute: args.allow_self_signed_compute,
|
||||
http_config,
|
||||
authentication_config,
|
||||
require_client_ip: args.require_client_ip,
|
||||
}));
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use anyhow::{anyhow, Context};
|
||||
use hashbrown::HashMap;
|
||||
use anyhow::{bail, Context};
|
||||
use dashmap::DashMap;
|
||||
use pq_proto::CancelKeyData;
|
||||
use std::net::SocketAddr;
|
||||
use tokio::net::TcpStream;
|
||||
@@ -8,7 +8,7 @@ use tracing::info;
|
||||
|
||||
/// Enables serving `CancelRequest`s.
|
||||
#[derive(Default)]
|
||||
pub struct CancelMap(parking_lot::RwLock<HashMap<CancelKeyData, Option<CancelClosure>>>);
|
||||
pub struct CancelMap(DashMap<CancelKeyData, Option<CancelClosure>>);
|
||||
|
||||
impl CancelMap {
|
||||
/// Cancel a running query for the corresponding connection.
|
||||
@@ -16,7 +16,6 @@ impl CancelMap {
|
||||
// NB: we should immediately release the lock after cloning the token.
|
||||
let cancel_closure = self
|
||||
.0
|
||||
.read()
|
||||
.get(&key)
|
||||
.and_then(|x| x.clone())
|
||||
.with_context(|| format!("query cancellation key not found: {key}"))?;
|
||||
@@ -40,15 +39,19 @@ impl CancelMap {
|
||||
|
||||
// Random key collisions are unlikely to happen here, but they're still possible,
|
||||
// which is why we have to take care not to rewrite an existing key.
|
||||
self.0
|
||||
.write()
|
||||
.try_insert(key, None)
|
||||
.map_err(|_| anyhow!("query cancellation key already exists: {key}"))?;
|
||||
match self.0.entry(key) {
|
||||
dashmap::mapref::entry::Entry::Occupied(_) => {
|
||||
bail!("query cancellation key already exists: {key}")
|
||||
}
|
||||
dashmap::mapref::entry::Entry::Vacant(e) => {
|
||||
e.insert(None);
|
||||
}
|
||||
}
|
||||
|
||||
// This will guarantee that the session gets dropped
|
||||
// as soon as the future is finished.
|
||||
scopeguard::defer! {
|
||||
self.0.write().remove(&key);
|
||||
self.0.remove(&key);
|
||||
info!("dropped query cancellation key {key}");
|
||||
}
|
||||
|
||||
@@ -59,12 +62,12 @@ impl CancelMap {
|
||||
|
||||
#[cfg(test)]
|
||||
fn contains(&self, session: &Session) -> bool {
|
||||
self.0.read().contains_key(&session.key)
|
||||
self.0.contains_key(&session.key)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn is_empty(&self) -> bool {
|
||||
self.0.read().is_empty()
|
||||
self.0.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -113,10 +116,7 @@ impl Session<'_> {
|
||||
/// This enables query cancellation in `crate::proxy::prepare_client_connection`.
|
||||
pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
|
||||
info!("enabling query cancellation for this session");
|
||||
self.cancel_map
|
||||
.0
|
||||
.write()
|
||||
.insert(self.key, Some(cancel_closure));
|
||||
self.cancel_map.0.insert(self.key, Some(cancel_closure));
|
||||
|
||||
self.key
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@ pub struct ProxyConfig {
|
||||
pub metric_collection: Option<MetricCollectionConfig>,
|
||||
pub allow_self_signed_compute: bool,
|
||||
pub http_config: HttpConfig,
|
||||
pub authentication_config: AuthenticationConfig,
|
||||
pub require_client_ip: bool,
|
||||
}
|
||||
|
||||
@@ -32,6 +33,10 @@ pub struct HttpConfig {
|
||||
pub sql_over_http_timeout: tokio::time::Duration,
|
||||
}
|
||||
|
||||
pub struct AuthenticationConfig {
|
||||
pub scram_protocol_timeout: tokio::time::Duration,
|
||||
}
|
||||
|
||||
impl TlsConfig {
|
||||
pub fn to_server_config(&self) -> Arc<rustls::ServerConfig> {
|
||||
self.config.clone()
|
||||
|
||||
@@ -194,9 +194,10 @@ impl GlobalConnPool {
|
||||
info!("pool: cached connection '{conn_info}' is closed, opening a new one");
|
||||
connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
|
||||
} else {
|
||||
latency_timer.pool_hit();
|
||||
info!("pool: reusing connection '{conn_info}'");
|
||||
client.session.send(session_id)?;
|
||||
latency_timer.pool_hit();
|
||||
latency_timer.success();
|
||||
return Ok(Client {
|
||||
inner: Some(client),
|
||||
span: Span::current(),
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::{
|
||||
auth::{self, backend::AuthSuccess},
|
||||
cancellation::{self, CancelMap},
|
||||
compute::{self, PostgresConnection},
|
||||
config::{ProxyConfig, TlsConfig},
|
||||
config::{AuthenticationConfig, ProxyConfig, TlsConfig},
|
||||
console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
|
||||
http::StatusCode,
|
||||
metrics::{Ids, USAGE_METRICS},
|
||||
@@ -96,7 +96,9 @@ static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"proxy_compute_connection_latency_seconds",
|
||||
"Time it took for proxy to establish a connection to the compute endpoint",
|
||||
&["protocol", "cache_miss", "pool_miss"],
|
||||
// http/ws/tcp, true/false, true/false, success/failed
|
||||
// 3 * 2 * 2 * 2 = 24 counters
|
||||
&["protocol", "cache_miss", "pool_miss", "outcome"],
|
||||
// largest bucket = 2^15 * 0.5ms ≈ 16.4s (16 buckets: 0.5ms * 2^0 ..= 0.5ms * 2^15)
|
||||
exponential_buckets(0.0005, 2.0, 16).unwrap(),
|
||||
)
|
||||
@@ -105,19 +107,22 @@ static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
|
||||
pub struct LatencyTimer {
|
||||
start: Instant,
|
||||
pool_miss: bool,
|
||||
cache_miss: bool,
|
||||
protocol: &'static str,
|
||||
cache_miss: bool,
|
||||
pool_miss: bool,
|
||||
outcome: &'static str,
|
||||
}
|
||||
|
||||
impl LatencyTimer {
|
||||
pub fn new(protocol: &'static str) -> Self {
|
||||
Self {
|
||||
start: Instant::now(),
|
||||
protocol,
|
||||
cache_miss: false,
|
||||
// by default we don't do pooling
|
||||
pool_miss: true,
|
||||
protocol,
|
||||
// assume failed unless otherwise specified
|
||||
outcome: "failed",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -128,6 +133,10 @@ impl LatencyTimer {
|
||||
pub fn pool_hit(&mut self) {
|
||||
self.pool_miss = false;
|
||||
}
|
||||
|
||||
pub fn success(mut self) {
|
||||
self.outcome = "success";
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for LatencyTimer {
|
||||
@@ -138,6 +147,7 @@ impl Drop for LatencyTimer {
|
||||
self.protocol,
|
||||
bool_to_str(self.cache_miss),
|
||||
bool_to_str(self.pool_miss),
|
||||
self.outcome,
|
||||
])
|
||||
.observe(duration)
|
||||
}
|
||||
@@ -340,7 +350,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
mode.allow_self_signed_compute(config),
|
||||
);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(session, mode))
|
||||
.with_session(|session| client.connect_to_db(session, mode, &config.authentication_config))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -547,7 +557,10 @@ where
|
||||
|
||||
// try once
|
||||
let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
|
||||
Ok(res) => return Ok(res),
|
||||
Ok(res) => {
|
||||
latency_timer.success();
|
||||
return Ok(res);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(error = ?e, "could not connect to compute node");
|
||||
(invalidate_cache(node_info), e)
|
||||
@@ -601,7 +614,10 @@ where
|
||||
info!("wake_compute success. attempting to connect");
|
||||
loop {
|
||||
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
|
||||
Ok(res) => return Ok(res),
|
||||
Ok(res) => {
|
||||
latency_timer.success();
|
||||
return Ok(res);
|
||||
}
|
||||
Err(e) => {
|
||||
let retriable = e.should_retry(num_retries);
|
||||
if !retriable {
|
||||
@@ -818,6 +834,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
self,
|
||||
session: cancellation::Session<'_>,
|
||||
mode: ClientMode,
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> anyhow::Result<()> {
|
||||
let Self {
|
||||
mut stream,
|
||||
@@ -835,7 +852,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
let latency_timer = LatencyTimer::new(mode.protocol_label());
|
||||
|
||||
let auth_result = match creds
|
||||
.authenticate(&extra, &mut stream, mode.allow_cleartext())
|
||||
.authenticate(&extra, &mut stream, mode.allow_cleartext(), config)
|
||||
.await
|
||||
{
|
||||
Ok(auth_result) => auth_result,
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
//
|
||||
use anyhow::{bail, Context, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::Parser;
|
||||
use clap::{ArgAction, Parser};
|
||||
use futures::future::BoxFuture;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{FutureExt, StreamExt};
|
||||
@@ -105,6 +105,9 @@ struct Args {
|
||||
/// it during this period passed as a human readable duration.
|
||||
#[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT, verbatim_doc_comment)]
|
||||
heartbeat_timeout: Duration,
|
||||
/// Enable/disable peer recovery.
|
||||
#[arg(long, default_value = "false", action=ArgAction::Set)]
|
||||
peer_recovery: bool,
|
||||
/// Remote storage configuration for WAL backup (offloading to s3) as TOML
|
||||
/// inline table, e.g.
|
||||
/// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "<BUCKETNAME>", "bucket_region":"<REGION>", "concurrency_limit": 119}
|
||||
@@ -265,6 +268,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
broker_endpoint: args.broker_endpoint,
|
||||
broker_keepalive_interval: args.broker_keepalive_interval,
|
||||
heartbeat_timeout: args.heartbeat_timeout,
|
||||
peer_recovery_enabled: args.peer_recovery,
|
||||
remote_storage: args.remote_storage,
|
||||
max_offloader_lag_bytes: args.max_offloader_lag,
|
||||
wal_backup_enabled: !args.disable_wal_backup,
|
||||
|
||||
@@ -372,6 +372,13 @@ impl SafekeeperPostgresHandler {
|
||||
/// from a walproposer recovery function. This connection gets a special handling:
|
||||
/// safekeeper must stream all local WAL till the flush_lsn, whether committed or not.
|
||||
pub fn is_walproposer_recovery(&self) -> bool {
|
||||
self.appname == Some("wal_proposer_recovery".to_string())
|
||||
match &self.appname {
|
||||
None => false,
|
||||
Some(appname) => {
|
||||
appname == "wal_proposer_recovery" ||
|
||||
// set by safekeeper peer recovery
|
||||
appname.starts_with("safekeeper")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,8 +16,8 @@ use tokio::io::AsyncReadExt;
|
||||
use utils::http::endpoint::request_span;
|
||||
|
||||
use crate::receive_wal::WalReceiverState;
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::safekeeper::Term;
|
||||
use crate::safekeeper::{ServerInfo, TermLsn};
|
||||
use crate::send_wal::WalSenderState;
|
||||
use crate::timeline::PeerInfo;
|
||||
use crate::{debug_dump, pull_timeline};
|
||||
@@ -60,16 +60,25 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
|
||||
.as_ref()
|
||||
}
|
||||
|
||||
/// Same as TermSwitchEntry, but serializes LSN using display serializer
|
||||
/// Same as TermLsn, but serializes LSN using display serializer
|
||||
/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct TermSwitchApiEntry {
|
||||
pub term: Term,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
impl From<TermSwitchApiEntry> for TermLsn {
|
||||
fn from(api_val: TermSwitchApiEntry) -> Self {
|
||||
TermLsn {
|
||||
term: api_val.term,
|
||||
lsn: api_val.lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Augment AcceptorState with epoch for convenience
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct AcceptorStateStatus {
|
||||
|
||||
@@ -62,6 +62,7 @@ pub struct SafeKeeperConf {
|
||||
pub broker_endpoint: Uri,
|
||||
pub broker_keepalive_interval: Duration,
|
||||
pub heartbeat_timeout: Duration,
|
||||
pub peer_recovery_enabled: bool,
|
||||
pub remote_storage: Option<RemoteStorageConfig>,
|
||||
pub max_offloader_lag_bytes: u64,
|
||||
pub backup_parallel_jobs: usize,
|
||||
@@ -100,6 +101,7 @@ impl SafeKeeperConf {
|
||||
.parse()
|
||||
.expect("failed to parse default broker endpoint"),
|
||||
broker_keepalive_interval: Duration::from_secs(5),
|
||||
peer_recovery_enabled: true,
|
||||
wal_backup_enabled: true,
|
||||
backup_parallel_jobs: 1,
|
||||
pg_auth: None,
|
||||
|
||||
@@ -55,9 +55,12 @@ impl WalReceivers {
|
||||
|
||||
/// Register new walreceiver. Returned guard provides access to the slot and
|
||||
/// automatically deregisters in Drop.
|
||||
pub fn register(self: &Arc<WalReceivers>) -> WalReceiverGuard {
|
||||
pub fn register(self: &Arc<WalReceivers>, conn_id: Option<ConnectionId>) -> WalReceiverGuard {
|
||||
let slots = &mut self.mutex.lock().slots;
|
||||
let walreceiver = WalReceiverState::Voting;
|
||||
let walreceiver = WalReceiverState {
|
||||
conn_id,
|
||||
status: WalReceiverStatus::Voting,
|
||||
};
|
||||
// find empty slot or create new one
|
||||
let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
|
||||
slots[pos] = Some(walreceiver);
|
||||
@@ -96,6 +99,18 @@ impl WalReceivers {
|
||||
self.mutex.lock().slots.iter().flatten().cloned().collect()
|
||||
}
|
||||
|
||||
/// Get number of streaming walreceivers (normally 0 or 1) from compute.
|
||||
pub fn get_num_streaming(self: &Arc<WalReceivers>) -> usize {
|
||||
self.mutex
|
||||
.lock()
|
||||
.slots
|
||||
.iter()
|
||||
.flatten()
|
||||
// conn_id.is_none skips recovery which also registers here
|
||||
.filter(|s| s.conn_id.is_some() && matches!(s.status, WalReceiverStatus::Streaming))
|
||||
.count()
|
||||
}
|
||||
|
||||
/// Unregister walreceiver.
|
||||
fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
|
||||
let mut shared = self.mutex.lock();
|
||||
@@ -108,10 +123,17 @@ struct WalReceiversShared {
|
||||
slots: Vec<Option<WalReceiverState>>,
|
||||
}
|
||||
|
||||
/// State kept for a registered walreceiver: the connection it belongs to
/// (if any) and its current status.
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalReceiverState {
|
||||
/// None means it is recovery initiated by us (this safekeeper).
|
||||
pub conn_id: Option<ConnectionId>,
|
||||
/// Whether the walreceiver is still voting or is already streaming.
pub status: WalReceiverStatus,
|
||||
}
|
||||
|
||||
/// Walreceiver status. Currently only whether it passed voting stage and
|
||||
/// started receiving the stream, but it is easy to add more if needed.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum WalReceiverState {
|
||||
pub enum WalReceiverStatus {
|
||||
Voting,
|
||||
Streaming,
|
||||
}
|
||||
@@ -136,8 +158,8 @@ impl Drop for WalReceiverGuard {
|
||||
}
|
||||
}
|
||||
|
||||
const MSG_QUEUE_SIZE: usize = 256;
|
||||
const REPLY_QUEUE_SIZE: usize = 16;
|
||||
pub const MSG_QUEUE_SIZE: usize = 256;
|
||||
pub const REPLY_QUEUE_SIZE: usize = 16;
|
||||
|
||||
impl SafekeeperPostgresHandler {
|
||||
/// Wrapper around handle_start_wal_push_guts handling result. Error is
|
||||
@@ -261,7 +283,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
tli.clone(),
|
||||
msg_rx,
|
||||
reply_tx,
|
||||
self.conn_id,
|
||||
Some(self.conn_id),
|
||||
));
|
||||
|
||||
// Forward all messages to WalAcceptor
|
||||
@@ -317,31 +339,41 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
// even when it writes a steady stream of messages.
|
||||
const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
|
||||
|
||||
/// Takes messages from msg_rx, processes and pushes replies to reply_tx.
|
||||
struct WalAcceptor {
|
||||
/// Encapsulates a task which takes messages from msg_rx, processes and pushes
|
||||
/// replies to reply_tx; reading from socket and writing to disk in parallel is
|
||||
/// beneficial for performance, this struct provides writing to disk part.
|
||||
pub struct WalAcceptor {
|
||||
tli: Arc<Timeline>,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
conn_id: Option<ConnectionId>,
|
||||
}
|
||||
|
||||
impl WalAcceptor {
|
||||
/// Spawn thread with WalAcceptor running, return handle to it.
|
||||
fn spawn(
|
||||
/// Spawn task with WalAcceptor running, return handle to it. Task returns
|
||||
/// Ok(()) if either of channels has closed, and Err if any error during
|
||||
/// message processing is encountered.
|
||||
///
|
||||
/// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper.
|
||||
pub fn spawn(
|
||||
tli: Arc<Timeline>,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
conn_id: ConnectionId,
|
||||
conn_id: Option<ConnectionId>,
|
||||
) -> JoinHandle<anyhow::Result<()>> {
|
||||
task::spawn(async move {
|
||||
let mut wa = WalAcceptor {
|
||||
tli,
|
||||
msg_rx,
|
||||
reply_tx,
|
||||
conn_id,
|
||||
};
|
||||
|
||||
let span_ttid = wa.tli.ttid; // satisfy borrow checker
|
||||
wa.run()
|
||||
.instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid))
|
||||
.instrument(
|
||||
info_span!("WAL acceptor", cid = %conn_id.unwrap_or(0), ttid = %span_ttid),
|
||||
)
|
||||
.await
|
||||
})
|
||||
}
|
||||
@@ -355,7 +387,7 @@ impl WalAcceptor {
|
||||
let _compute_conn_guard = ComputeConnectionGuard {
|
||||
timeline: Arc::clone(&self.tli),
|
||||
};
|
||||
let walreceiver_guard = self.tli.get_walreceivers().register();
|
||||
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
|
||||
self.tli.update_status_notify().await?;
|
||||
|
||||
// After this timestamp we will stop processing AppendRequests and send a response
|
||||
@@ -372,7 +404,7 @@ impl WalAcceptor {
|
||||
|
||||
// Update walreceiver state in shmem for reporting.
|
||||
if let ProposerAcceptorMessage::Elected(_) = &next_msg {
|
||||
*walreceiver_guard.get() = WalReceiverState::Streaming;
|
||||
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
|
||||
}
|
||||
|
||||
let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
|
||||
|
||||
@@ -1,17 +1,41 @@
|
||||
//! This module implements pulling WAL from peer safekeepers if compute can't
|
||||
//! provide it, i.e. safekeeper lags too much.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use std::{fmt, pin::pin, sync::Arc};
|
||||
|
||||
use tokio::{select, time::sleep, time::Duration};
|
||||
use tracing::{info, instrument};
|
||||
use anyhow::{bail, Context};
|
||||
use futures::StreamExt;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use tokio::sync::mpsc::{channel, Receiver, Sender};
|
||||
use tokio::time::timeout;
|
||||
use tokio::{
|
||||
select,
|
||||
time::sleep,
|
||||
time::{self, Duration},
|
||||
};
|
||||
use tokio_postgres::replication::ReplicationStream;
|
||||
use tokio_postgres::types::PgLsn;
|
||||
use tracing::*;
|
||||
use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config};
|
||||
|
||||
use crate::{timeline::Timeline, SafeKeeperConf};
|
||||
use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
|
||||
use crate::safekeeper::{AppendRequest, AppendRequestHeader};
|
||||
use crate::{
|
||||
http::routes::TimelineStatus,
|
||||
receive_wal::MSG_QUEUE_SIZE,
|
||||
safekeeper::{
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
|
||||
TermLsn, VoteRequest,
|
||||
},
|
||||
timeline::{PeerInfo, Timeline},
|
||||
SafeKeeperConf,
|
||||
};
|
||||
|
||||
/// Entrypoint for per timeline task which always runs, checking whether
|
||||
/// recovery for this safekeeper is needed and starting it if so.
|
||||
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn recovery_main(tli: Arc<Timeline>, _conf: SafeKeeperConf) {
|
||||
pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
info!("started");
|
||||
let mut cancellation_rx = match tli.get_cancellation_rx() {
|
||||
Ok(rx) => rx,
|
||||
@@ -22,19 +46,387 @@ pub async fn recovery_main(tli: Arc<Timeline>, _conf: SafeKeeperConf) {
|
||||
};
|
||||
|
||||
select! {
|
||||
_ = recovery_main_loop(tli) => { unreachable!() }
|
||||
_ = recovery_main_loop(tli, conf) => { unreachable!() }
|
||||
_ = cancellation_rx.changed() => {
|
||||
info!("stopped");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and
|
||||
/// fields to explain the choice.
|
||||
#[derive(Debug)]
|
||||
pub struct RecoveryNeededInfo {
|
||||
/// my term
|
||||
pub term: Term,
|
||||
/// my last_log_term
|
||||
pub last_log_term: Term,
|
||||
/// my flush_lsn
|
||||
pub flush_lsn: Lsn,
|
||||
/// peers from which we can fetch WAL, for observability.
|
||||
pub peers: Vec<PeerInfo>,
|
||||
/// for observability
|
||||
pub num_streaming_computes: usize,
|
||||
/// Safekeepers we can recover from. The recovery loop takes the first
/// entry; an empty vec means recovery is not needed or not possible.
pub donors: Vec<Donor>,
|
||||
}
|
||||
|
||||
// Custom to omit not important fields from PeerInfo.
|
||||
impl fmt::Display for RecoveryNeededInfo {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "{{")?;
|
||||
write!(
|
||||
f,
|
||||
"term: {}, last_log_term: {}, flush_lsn: {}, peers: {{",
|
||||
self.term, self.last_log_term, self.flush_lsn
|
||||
)?;
|
||||
for p in self.peers.iter() {
|
||||
write!(
|
||||
f,
|
||||
"PeerInfo {{ sk_id: {}, term: {}, last_log_term: {}, flush_lsn: {} }}, ",
|
||||
p.sk_id, p.term, p.last_log_term, p.flush_lsn
|
||||
)?;
|
||||
}
|
||||
write!(
|
||||
f,
|
||||
"}} num_streaming_computes: {}, donors: {:?}",
|
||||
self.num_streaming_computes, self.donors
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Peer safekeeper from which WAL can be pulled during recovery; built from
/// the `PeerInfo` of that peer.
#[derive(Clone, Debug)]
|
||||
pub struct Donor {
|
||||
/// node id of the donor safekeeper
pub sk_id: NodeId,
|
||||
/// equals to last_log_term
|
||||
pub term: Term,
|
||||
/// flush_lsn reported by the donor
pub flush_lsn: Lsn,
|
||||
/// connection string used to open the WAL streaming connection
pub pg_connstr: String,
|
||||
/// host:port of the donor HTTP API, used to fetch its timeline status
pub http_connstr: String,
|
||||
}
|
||||
|
||||
/// Builds a `Donor` out of peer info, copying the id, term, flush_lsn and
/// both connection strings.
impl From<&PeerInfo> for Donor {
    fn from(p: &PeerInfo) -> Self {
        let pg_connstr = p.pg_connstr.clone();
        let http_connstr = p.http_connstr.clone();
        Self {
            sk_id: p.sk_id,
            term: p.term,
            flush_lsn: p.flush_lsn,
            pg_connstr,
            http_connstr,
        }
    }
}
|
||||
|
||||
/// Pause between consecutive iterations of the recovery check loop, in
/// milliseconds.
const CHECK_INTERVAL_MS: u64 = 2000;
|
||||
|
||||
/// Check regularly whether we need to start recovery.
|
||||
async fn recovery_main_loop(_tli: Arc<Timeline>) {
|
||||
async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
|
||||
loop {
|
||||
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
|
||||
match recovery_needed_info.donors.first() {
|
||||
Some(donor) => {
|
||||
info!(
|
||||
"starting recovery from donor {}: {}",
|
||||
donor.sk_id, recovery_needed_info
|
||||
);
|
||||
match recover(tli.clone(), donor, &conf).await {
|
||||
// Note: 'write_wal rewrites WAL written before' error is
|
||||
// expected here and might happen if compute and recovery
|
||||
// concurrently write the same data. Eventually compute
|
||||
// should win.
|
||||
Err(e) => warn!("recovery failed: {:#}", e),
|
||||
Ok(msg) => info!("recovery finished: {}", msg),
|
||||
}
|
||||
}
|
||||
None => {
|
||||
trace!(
|
||||
"recovery not needed or not possible: {}",
|
||||
recovery_needed_info
|
||||
);
|
||||
}
|
||||
}
|
||||
sleep(check_duration).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Recover from the specified donor. Returns message explaining normal finish
|
||||
/// reason or error.
|
||||
async fn recover(
|
||||
tli: Arc<Timeline>,
|
||||
donor: &Donor,
|
||||
conf: &SafeKeeperConf,
|
||||
) -> anyhow::Result<String> {
|
||||
// Learn donor term switch history to figure out starting point.
|
||||
let client = reqwest::Client::new();
|
||||
let timeline_info: TimelineStatus = client
|
||||
.get(format!(
|
||||
"http://{}/v1/tenant/{}/timeline/{}",
|
||||
donor.http_connstr, tli.ttid.tenant_id, tli.ttid.timeline_id
|
||||
))
|
||||
.send()
|
||||
.await?
|
||||
.json()
|
||||
.await?;
|
||||
if timeline_info.acceptor_state.term != donor.term {
|
||||
bail!(
|
||||
"donor term changed from {} to {}",
|
||||
donor.term,
|
||||
timeline_info.acceptor_state.term
|
||||
);
|
||||
}
|
||||
// convert from API TermSwitchApiEntry into TermLsn.
|
||||
let donor_th = TermHistory(
|
||||
timeline_info
|
||||
.acceptor_state
|
||||
.term_history
|
||||
.iter()
|
||||
.map(|tl| Into::<TermLsn>::into(*tl))
|
||||
.collect(),
|
||||
);
|
||||
|
||||
// Now understand our term history.
|
||||
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
|
||||
let vote_response = match tli
|
||||
.process_msg(&vote_request)
|
||||
.await
|
||||
.context("VoteRequest handling")?
|
||||
{
|
||||
Some(AcceptorProposerMessage::VoteResponse(vr)) => vr,
|
||||
_ => {
|
||||
bail!("unexpected VoteRequest response"); // unreachable
|
||||
}
|
||||
};
|
||||
if vote_response.term != donor.term {
|
||||
bail!(
|
||||
"our term changed from {} to {}",
|
||||
donor.term,
|
||||
vote_response.term
|
||||
);
|
||||
}
|
||||
|
||||
let last_common_point = match TermHistory::find_highest_common_point(
|
||||
&donor_th,
|
||||
&vote_response.term_history,
|
||||
vote_response.flush_lsn,
|
||||
) {
|
||||
None => bail!(
|
||||
"couldn't find common point in histories, donor {:?}, sk {:?}",
|
||||
donor_th,
|
||||
vote_response.term_history,
|
||||
),
|
||||
Some(lcp) => lcp,
|
||||
};
|
||||
info!("found last common point at {:?}", last_common_point);
|
||||
|
||||
// truncate WAL locally
|
||||
let pe = ProposerAcceptorMessage::Elected(ProposerElected {
|
||||
term: donor.term,
|
||||
start_streaming_at: last_common_point.lsn,
|
||||
term_history: donor_th,
|
||||
timeline_start_lsn: Lsn::INVALID,
|
||||
});
|
||||
// Successful ProposerElected handling always returns None. If term changed,
|
||||
// we'll find out that during the streaming. Note: it is expected to get
|
||||
// 'refusing to overwrite correct WAL' here if walproposer reconnected
|
||||
// concurrently, restart helps here.
|
||||
tli.process_msg(&pe)
|
||||
.await
|
||||
.context("ProposerElected handling")?;
|
||||
|
||||
recovery_stream(tli, donor, last_common_point.lsn, conf).await
|
||||
}
|
||||
|
||||
// Pull WAL from donor, assuming handshake is already done.
|
||||
async fn recovery_stream(
|
||||
tli: Arc<Timeline>,
|
||||
donor: &Donor,
|
||||
start_streaming_at: Lsn,
|
||||
conf: &SafeKeeperConf,
|
||||
) -> anyhow::Result<String> {
|
||||
// TODO: pass auth token
|
||||
let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?;
|
||||
let mut cfg = cfg.to_tokio_postgres_config();
|
||||
// It will make safekeeper give out not committed WAL (up to flush_lsn).
|
||||
cfg.application_name(&format!("safekeeper_{}", conf.my_id));
|
||||
cfg.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
|
||||
|
||||
let connect_timeout = Duration::from_millis(10000);
|
||||
let (client, connection) = match time::timeout(connect_timeout, cfg.connect(postgres::NoTls))
|
||||
.await
|
||||
{
|
||||
Ok(client_and_conn) => client_and_conn?,
|
||||
Err(_elapsed) => {
|
||||
bail!("timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open");
|
||||
}
|
||||
};
|
||||
trace!("connected to {:?}", donor);
|
||||
|
||||
// The connection object performs the actual communication with the
|
||||
// server, spawn it off to run on its own.
|
||||
let ttid = tli.ttid;
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection
|
||||
.instrument(info_span!("recovery task connection poll", ttid = %ttid))
|
||||
.await
|
||||
{
|
||||
// This logging isn't very useful as error is anyway forwarded to client.
|
||||
trace!(
|
||||
"tokio_postgres connection object finished with error: {}",
|
||||
e
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
let query = format!(
|
||||
"START_REPLICATION PHYSICAL {} (term='{}')",
|
||||
start_streaming_at, donor.term
|
||||
);
|
||||
|
||||
let copy_stream = client.copy_both_simple(&query).await?;
|
||||
let physical_stream = ReplicationStream::new(copy_stream);
|
||||
|
||||
// As in normal walreceiver, do networking and writing to disk in parallel.
|
||||
let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE);
|
||||
let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE);
|
||||
let wa = WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, None);
|
||||
|
||||
let res = tokio::select! {
|
||||
r = network_io(physical_stream, msg_tx, donor.clone(), tli.clone(), conf.clone()) => r,
|
||||
r = read_replies(reply_rx, donor.term) => r.map(|()| None),
|
||||
};
|
||||
|
||||
// Join the spawned WalAcceptor. At this point chans to/from it passed to
|
||||
// network routines are dropped, so it will exit as soon as it touches them.
|
||||
match wa.await {
|
||||
Ok(Ok(())) => {
|
||||
// WalAcceptor finished normally, termination reason is different
|
||||
match res {
|
||||
Ok(Some(success_desc)) => Ok(success_desc),
|
||||
Ok(None) => bail!("unexpected recovery end without error/success"), // can't happen
|
||||
Err(e) => Err(e), // network error or term change
|
||||
}
|
||||
}
|
||||
Ok(Err(e)) => Err(e), // error while processing message
|
||||
Err(e) => bail!("WalAcceptor panicked: {}", e),
|
||||
}
|
||||
}
|
||||
|
||||
// Perform network part of streaming: read data and push it to msg_tx, send KA
|
||||
// to make sender hear from us. If there is nothing coming for a while, check
|
||||
// for termination.
|
||||
// Returns
|
||||
// - Ok(None) if channel to WalAcceptor closed -- its task should return error.
|
||||
// - Ok(Some(String)) if recovery successfully completed.
|
||||
// - Err if error happened while reading/writing to socket.
|
||||
async fn network_io(
|
||||
physical_stream: ReplicationStream,
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
donor: Donor,
|
||||
tli: Arc<Timeline>,
|
||||
conf: SafeKeeperConf,
|
||||
) -> anyhow::Result<Option<String>> {
|
||||
let mut physical_stream = pin!(physical_stream);
|
||||
let mut last_received_lsn = Lsn::INVALID;
|
||||
// tear down connection if no data arrives withing this period
|
||||
let no_data_timeout = Duration::from_millis(30000);
|
||||
|
||||
loop {
|
||||
let msg = match timeout(no_data_timeout, physical_stream.next()).await {
|
||||
Ok(next) => match next {
|
||||
None => bail!("unexpected end of replication stream"),
|
||||
Some(msg) => msg.context("get replication message")?,
|
||||
},
|
||||
Err(_) => bail!("no message received within {:?}", no_data_timeout),
|
||||
};
|
||||
|
||||
match msg {
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
let ar_hdr = AppendRequestHeader {
|
||||
term: donor.term,
|
||||
epoch_start_lsn: Lsn::INVALID, // unused
|
||||
begin_lsn: Lsn(xlog_data.wal_start()),
|
||||
end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
|
||||
commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
|
||||
truncate_lsn: Lsn::INVALID, // do not attempt to advance
|
||||
proposer_uuid: [0; 16],
|
||||
};
|
||||
let ar = AppendRequest {
|
||||
h: ar_hdr,
|
||||
wal_data: xlog_data.into_data(),
|
||||
};
|
||||
trace!(
|
||||
"processing AppendRequest {}-{}, len {}",
|
||||
ar.h.begin_lsn,
|
||||
ar.h.end_lsn,
|
||||
ar.wal_data.len()
|
||||
);
|
||||
last_received_lsn = ar.h.end_lsn;
|
||||
if msg_tx
|
||||
.send(ProposerAcceptorMessage::AppendRequest(ar))
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
return Ok(None); // chan closed, WalAcceptor terminated
|
||||
}
|
||||
}
|
||||
ReplicationMessage::PrimaryKeepAlive(_) => {
|
||||
// keepalive means nothing is being streamed for a while. Check whether we need to stop.
|
||||
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
|
||||
// do current donors still contain one we currently connected to?
|
||||
if !recovery_needed_info
|
||||
.donors
|
||||
.iter()
|
||||
.any(|d| d.sk_id == donor.sk_id)
|
||||
{
|
||||
// Most likely it means we are caughtup.
|
||||
// note: just exiting makes tokio_postgres send CopyFail to the far end.
|
||||
return Ok(Some(format!(
|
||||
"terminating at {} as connected safekeeper {} with term {} is not a donor anymore: {}",
|
||||
last_received_lsn, donor.sk_id, donor.term, recovery_needed_info
|
||||
)));
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
// Send reply to each message to keep connection alive. Ideally we
|
||||
// should do that once in a while instead, but this again requires
|
||||
// stream split or similar workaround, and recovery is anyway not that
|
||||
// performance critical.
|
||||
//
|
||||
// We do not know here real write/flush LSNs (need to take mutex again
|
||||
// or check replies which are read in different future), but neither
|
||||
// sender much cares about them, so just send last received.
|
||||
physical_stream
|
||||
.as_mut()
|
||||
.standby_status_update(
|
||||
PgLsn::from(last_received_lsn.0),
|
||||
PgLsn::from(last_received_lsn.0),
|
||||
PgLsn::from(last_received_lsn.0),
|
||||
SystemTime::now(),
|
||||
0,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
// Read replies from WalAcceptor. We are not interested much in sending them to
|
||||
// donor safekeeper, so don't route them anywhere. However, we should check if
|
||||
// term changes and exit if it does.
|
||||
// Returns Ok(()) if channel closed, Err in case of term change.
|
||||
async fn read_replies(
|
||||
mut reply_rx: Receiver<AcceptorProposerMessage>,
|
||||
donor_term: Term,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
match reply_rx.recv().await {
|
||||
Some(msg) => {
|
||||
if let AcceptorProposerMessage::AppendResponse(ar) = msg {
|
||||
if ar.term != donor_term {
|
||||
bail!("donor term changed from {} to {}", donor_term, ar.term);
|
||||
}
|
||||
}
|
||||
}
|
||||
None => return Ok(()), // chan closed, WalAcceptor terminated
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,6 +91,69 @@ impl TermHistory {
|
||||
}
|
||||
TermHistory(res)
|
||||
}
|
||||
|
||||
/// Find point of divergence between leader (walproposer) term history and
|
||||
/// safekeeper. Arguments are not symmetrics as proposer history ends at
|
||||
/// +infinity while safekeeper at flush_lsn.
|
||||
/// C version is at walproposer SendProposerElected.
|
||||
pub fn find_highest_common_point(
|
||||
prop_th: &TermHistory,
|
||||
sk_th: &TermHistory,
|
||||
sk_wal_end: Lsn,
|
||||
) -> Option<TermLsn> {
|
||||
let (prop_th, sk_th) = (&prop_th.0, &sk_th.0); // avoid .0 below
|
||||
|
||||
if let Some(sk_th_last) = sk_th.last() {
|
||||
assert!(
|
||||
sk_th_last.lsn <= sk_wal_end,
|
||||
"safekeeper term history end {:?} LSN is higher than WAL end {:?}",
|
||||
sk_th_last,
|
||||
sk_wal_end
|
||||
);
|
||||
}
|
||||
|
||||
// find last common term, if any...
|
||||
let mut last_common_idx = None;
|
||||
for i in 0..min(sk_th.len(), prop_th.len()) {
|
||||
if prop_th[i].term != sk_th[i].term {
|
||||
break;
|
||||
}
|
||||
// If term is the same, LSN must be equal as well.
|
||||
assert!(
|
||||
prop_th[i].lsn == sk_th[i].lsn,
|
||||
"same term {} has different start LSNs: prop {}, sk {}",
|
||||
prop_th[i].term,
|
||||
prop_th[i].lsn,
|
||||
sk_th[i].lsn
|
||||
);
|
||||
last_common_idx = Some(i);
|
||||
}
|
||||
let last_common_idx = match last_common_idx {
|
||||
None => return None, // no common point
|
||||
Some(lci) => lci,
|
||||
};
|
||||
// Now find where it ends at both prop and sk and take min. End of
|
||||
// (common) term is the start of the next except it is the last one;
|
||||
// there it is flush_lsn in case of safekeeper or, in case of proposer
|
||||
// +infinity, so we just take flush_lsn then.
|
||||
if last_common_idx == prop_th.len() - 1 {
|
||||
Some(TermLsn {
|
||||
term: prop_th[last_common_idx].term,
|
||||
lsn: sk_wal_end,
|
||||
})
|
||||
} else {
|
||||
let prop_common_term_end = prop_th[last_common_idx + 1].lsn;
|
||||
let sk_common_term_end = if last_common_idx + 1 < sk_th.len() {
|
||||
sk_th[last_common_idx + 1].lsn
|
||||
} else {
|
||||
sk_wal_end
|
||||
};
|
||||
Some(TermLsn {
|
||||
term: prop_th[last_common_idx].term,
|
||||
lsn: min(prop_common_term_end, sk_common_term_end),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Display only latest entries for Debug.
|
||||
@@ -305,19 +368,19 @@ pub struct AcceptorGreeting {
|
||||
/// Vote request sent from proposer to safekeepers
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct VoteRequest {
|
||||
term: Term,
|
||||
pub term: Term,
|
||||
}
|
||||
|
||||
/// Vote itself, sent from safekeeper to proposer
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct VoteResponse {
|
||||
term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
|
||||
pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
|
||||
vote_given: u64, // fixme u64 due to padding
|
||||
// Safekeeper flush_lsn (end of WAL) + history of term switches allow
|
||||
// proposer to choose the most advanced one.
|
||||
flush_lsn: Lsn,
|
||||
pub flush_lsn: Lsn,
|
||||
truncate_lsn: Lsn,
|
||||
term_history: TermHistory,
|
||||
pub term_history: TermHistory,
|
||||
timeline_start_lsn: Lsn,
|
||||
}
|
||||
|
||||
@@ -344,7 +407,8 @@ pub struct AppendRequest {
|
||||
pub struct AppendRequestHeader {
|
||||
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
|
||||
pub term: Term,
|
||||
// LSN since the proposer appends WAL; determines epoch switch point.
|
||||
// TODO: remove this field, it is unused -- LSN of term switch can be taken
|
||||
// from ProposerElected (as well as from term history).
|
||||
pub epoch_start_lsn: Lsn,
|
||||
/// start position of message in WAL
|
||||
pub begin_lsn: Lsn,
|
||||
@@ -759,7 +823,7 @@ where
|
||||
bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
|
||||
msg.term, self.flush_lsn(), msg.start_streaming_at)
|
||||
}
|
||||
// Otherwise this shouldn't happen.
|
||||
// Otherwise we must never attempt to truncate committed data.
|
||||
assert!(
|
||||
msg.start_streaming_at >= self.inmem.commit_lsn,
|
||||
"attempt to truncate committed data: start_streaming_at={}, commit_lsn={}",
|
||||
@@ -810,6 +874,14 @@ where
|
||||
|
||||
info!("start receiving WAL since {:?}", msg.start_streaming_at);
|
||||
|
||||
// Cache LSN where term starts to immediately fsync control file with
|
||||
// commit_lsn once we reach it -- sync-safekeepers finishes when
|
||||
// persisted commit_lsn on majority of safekeepers aligns.
|
||||
self.epoch_start_lsn = match msg.term_history.0.last() {
|
||||
None => bail!("proposer elected with empty term history"),
|
||||
Some(term_lsn_start) => term_lsn_start.lsn,
|
||||
};
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
@@ -835,10 +907,7 @@ where
|
||||
// file: walproposer in sync mode is very interested when this
|
||||
// happens. Note: this is for sync-safekeepers mode only, as
|
||||
// otherwise commit_lsn might jump over epoch_start_lsn.
|
||||
// Also note that commit_lsn can reach epoch_start_lsn earlier
|
||||
// that we receive new epoch_start_lsn, and we still need to sync
|
||||
// control file in this case.
|
||||
if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn {
|
||||
if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn {
|
||||
self.persist_control_file(self.state.clone()).await?;
|
||||
}
|
||||
|
||||
@@ -902,7 +971,6 @@ where
|
||||
// Now we know that we are in the same term as the proposer,
|
||||
// processing the message.
|
||||
|
||||
self.epoch_start_lsn = msg.h.epoch_start_lsn;
|
||||
self.inmem.proposer_uuid = msg.h.proposer_uuid;
|
||||
|
||||
// do the job
|
||||
@@ -1185,4 +1253,65 @@ mod tests {
|
||||
sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
|
||||
assert_eq!(sk.get_epoch(), 1);
|
||||
}
|
||||
|
||||
// Histories which differ already in the first term have no common point.
#[test]
fn test_find_highest_common_point_none() {
    let prop_th = TermHistory(vec![(0, Lsn(1)).into()]);
    let sk_th = TermHistory(vec![(1, Lsn(1)).into(), (2, Lsn(2)).into()]);
    let found = TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(3));
    assert_eq!(found, None);
}
|
||||
|
||||
// Both histories continue after the last common term; the earlier end wins.
#[test]
fn test_find_highest_common_point_middle() {
    let prop_th = TermHistory(vec![
        (1, Lsn(10)).into(),
        (2, Lsn(20)).into(),
        (4, Lsn(40)).into(),
    ]);
    let sk_th = TermHistory(vec![
        (1, Lsn(10)).into(),
        (2, Lsn(20)).into(),
        (3, Lsn(30)).into(), // sk ends last common term 2 at 30
    ]);
    let found = TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(40));
    let expected = TermLsn {
        term: 2,
        lsn: Lsn(30),
    };
    assert_eq!(found, Some(expected));
}
|
||||
|
||||
// Safekeeper history ends with the last common term, so its WAL end bounds it.
#[test]
fn test_find_highest_common_point_sk_end() {
    let prop_th = TermHistory(vec![
        (1, Lsn(10)).into(),
        (2, Lsn(20)).into(), // last common term 2, sk will end it at 32 sk_end_lsn
        (4, Lsn(40)).into(),
    ]);
    let sk_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
    let found = TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(32));
    let expected = TermLsn {
        term: 2,
        lsn: Lsn(32),
    };
    assert_eq!(found, Some(expected));
}
|
||||
|
||||
// Identical histories: the common point is the safekeeper WAL end.
#[test]
fn test_find_highest_common_point_walprop() {
    let prop_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
    let sk_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
    let found = TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(32));
    let expected = TermLsn {
        term: 2,
        lsn: Lsn(32),
    };
    assert_eq!(found, Some(expected));
}
|
||||
}
|
||||
|
||||
@@ -418,10 +418,11 @@ impl SafekeeperPostgresHandler {
|
||||
}
|
||||
|
||||
info!(
|
||||
"starting streaming from {:?}, available WAL ends at {}, recovery={}",
|
||||
"starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}",
|
||||
start_pos,
|
||||
end_pos,
|
||||
matches!(end_watch, EndWatch::Flush(_))
|
||||
matches!(end_watch, EndWatch::Flush(_)),
|
||||
appname
|
||||
);
|
||||
|
||||
// switch to copy
|
||||
|
||||
@@ -11,6 +11,7 @@ use tokio::fs;
|
||||
use serde_with::DisplayFromStr;
|
||||
use std::cmp::max;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::{Mutex, MutexGuard};
|
||||
use tokio::{
|
||||
sync::{mpsc::Sender, watch},
|
||||
@@ -27,7 +28,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::recovery::recovery_main;
|
||||
use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo};
|
||||
use crate::safekeeper::{
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
|
||||
SafekeeperMemState, ServerInfo, Term, TermLsn, INVALID_TERM,
|
||||
@@ -45,11 +46,12 @@ use crate::{debug_dump, wal_storage};
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PeerInfo {
|
||||
pub sk_id: NodeId,
|
||||
pub term: Term,
|
||||
/// Term of the last entry.
|
||||
_last_log_term: Term,
|
||||
pub last_log_term: Term,
|
||||
/// LSN of the last record.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
_flush_lsn: Lsn,
|
||||
pub flush_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub commit_lsn: Lsn,
|
||||
/// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
|
||||
@@ -61,16 +63,21 @@ pub struct PeerInfo {
|
||||
#[serde(skip)]
|
||||
#[serde(default = "Instant::now")]
|
||||
ts: Instant,
|
||||
pub pg_connstr: String,
|
||||
pub http_connstr: String,
|
||||
}
|
||||
|
||||
impl PeerInfo {
|
||||
fn from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo {
|
||||
PeerInfo {
|
||||
sk_id: NodeId(sk_info.safekeeper_id),
|
||||
_last_log_term: sk_info.last_log_term,
|
||||
_flush_lsn: Lsn(sk_info.flush_lsn),
|
||||
term: sk_info.term,
|
||||
last_log_term: sk_info.last_log_term,
|
||||
flush_lsn: Lsn(sk_info.flush_lsn),
|
||||
commit_lsn: Lsn(sk_info.commit_lsn),
|
||||
local_start_lsn: Lsn(sk_info.local_start_lsn),
|
||||
pg_connstr: sk_info.safekeeper_connstr.clone(),
|
||||
http_connstr: sk_info.http_connstr.clone(),
|
||||
ts,
|
||||
}
|
||||
}
|
||||
@@ -262,6 +269,20 @@ impl SharedState {
|
||||
availability_zone: conf.availability_zone.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get our latest view of alive peers status on the timeline.
|
||||
/// We pass our own info through the broker as well, so when we don't have connection
|
||||
/// to the broker returned vec is empty.
|
||||
fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
|
||||
let now = Instant::now();
|
||||
self.peers_info
|
||||
.0
|
||||
.iter()
|
||||
// Regard peer as absent if we haven't heard from it within heartbeat_timeout.
|
||||
.filter(|p| now.duration_since(p.ts) <= heartbeat_timeout)
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -443,7 +464,9 @@ impl Timeline {
|
||||
/// Bootstrap new or existing timeline, starting background tasks.
|
||||
pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
|
||||
// Start recovery task which always runs on the timeline.
|
||||
tokio::spawn(recovery_main(self.clone(), conf.clone()));
|
||||
if conf.peer_recovery_enabled {
|
||||
tokio::spawn(recovery_main(self.clone(), conf.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
/// Delete timeline from disk completely, by removing timeline directory. Background
|
||||
@@ -677,20 +700,88 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get our latest view of alive peers status on the timeline.
|
||||
/// We pass our own info through the broker as well, so when we don't have connection
|
||||
/// to the broker returned vec is empty.
|
||||
pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
|
||||
let shared_state = self.write_shared_state().await;
|
||||
let now = Instant::now();
|
||||
shared_state
|
||||
.peers_info
|
||||
.0
|
||||
.iter()
|
||||
// Regard peer as absent if we haven't heard from it within heartbeat_timeout.
|
||||
.filter(|p| now.duration_since(p.ts) <= conf.heartbeat_timeout)
|
||||
.cloned()
|
||||
.collect()
|
||||
shared_state.get_peers(conf.heartbeat_timeout)
|
||||
}
|
||||
|
||||
/// Should we start fetching WAL from a peer safekeeper, and if yes, from
|
||||
/// which? Answer is yes, i.e. .donors is not empty if 1) there is something
|
||||
/// to fetch, and we can do that without running elections; 2) there is no
|
||||
/// actively streaming compute, as we don't want to compete with it.
|
||||
///
|
||||
/// If donor(s) are chosen, each one's term is guaranteed to be equal
|
||||
/// to its last_log_term so we are sure such a leader ever had been elected.
|
||||
///
|
||||
/// All possible donors are returned so that we could keep connection to the
|
||||
/// current one if it is good even if it slightly lags behind.
|
||||
///
|
||||
/// Note that term conditions above might be not met, but safekeepers are
|
||||
/// still not aligned on last flush_lsn. Generally in this case until
|
||||
/// elections are run it is not possible to say which safekeeper should
|
||||
/// recover from which one -- history which would be committed is different
|
||||
/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
|
||||
/// Thus we don't try to predict it here.
|
||||
pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
|
||||
let ss = self.write_shared_state().await;
|
||||
let term = ss.sk.state.acceptor_state.term;
|
||||
let last_log_term = ss.sk.get_epoch();
|
||||
let flush_lsn = ss.sk.flush_lsn();
|
||||
// note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
|
||||
let mut peers = ss.get_peers(heartbeat_timeout);
|
||||
// Sort by <last log term, lsn> pairs.
|
||||
peers.sort_by(|p1, p2| {
|
||||
let tl1 = TermLsn {
|
||||
term: p1.last_log_term,
|
||||
lsn: p1.flush_lsn,
|
||||
};
|
||||
let tl2 = TermLsn {
|
||||
term: p2.last_log_term,
|
||||
lsn: p2.flush_lsn,
|
||||
};
|
||||
tl2.cmp(&tl1) // desc
|
||||
});
|
||||
let num_streaming_computes = self.walreceivers.get_num_streaming();
|
||||
let donors = if num_streaming_computes > 0 {
|
||||
vec![] // If there is a streaming compute, don't try to recover to not intervene.
|
||||
} else {
|
||||
peers
|
||||
.iter()
|
||||
.filter_map(|candidate| {
|
||||
// Are we interested in this candidate?
|
||||
let candidate_tl = TermLsn {
|
||||
term: candidate.last_log_term,
|
||||
lsn: candidate.flush_lsn,
|
||||
};
|
||||
let my_tl = TermLsn {
|
||||
term: last_log_term,
|
||||
lsn: flush_lsn,
|
||||
};
|
||||
if my_tl < candidate_tl {
|
||||
// Yes, we are interested. Can we pull from it without
|
||||
// (re)running elections? It is possible if 1) his term
|
||||
// is equal to his last_log_term so we could act on
|
||||
// behalf of leader of this term (we must be sure he was
|
||||
// ever elected) and 2) our term is not higher, or we'll refuse data.
|
||||
if candidate.term == candidate.last_log_term && candidate.term >= term {
|
||||
Some(Donor::from(candidate))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
RecoveryNeededInfo {
|
||||
term,
|
||||
last_log_term,
|
||||
flush_lsn,
|
||||
peers,
|
||||
num_streaming_computes,
|
||||
donors,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_walsenders(&self) -> &Arc<WalSenders> {
|
||||
|
||||
@@ -60,7 +60,6 @@ from fixtures.utils import (
|
||||
allure_attach_from_dir,
|
||||
get_self_dir,
|
||||
subprocess_capture,
|
||||
wait_until,
|
||||
)
|
||||
|
||||
"""
|
||||
@@ -1632,7 +1631,7 @@ class NeonPageserver(PgProtocol):
|
||||
".*took more than expected to complete.*",
|
||||
# these can happen during shutdown, but it should not be a reason to fail a test
|
||||
".*completed, took longer than expected.*",
|
||||
'.*registered custom resource manager "neon".*',
|
||||
'.*registered custom resource manager \\\\"neon\\\\".*',
|
||||
# AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these
|
||||
# and it is not a failure of our code when it happens.
|
||||
".*DeleteObjects.*We encountered an internal error. Please try again.*",
|
||||
@@ -1681,41 +1680,6 @@ class NeonPageserver(PgProtocol):
|
||||
self.running = False
|
||||
return self
|
||||
|
||||
def restart(self, immediate: bool = False):
|
||||
"""
|
||||
High level wrapper for restart: restarts the process, and waits for
|
||||
tenant state to stabilize.
|
||||
"""
|
||||
self.stop(immediate=immediate)
|
||||
self.start()
|
||||
self.quiesce_tenants()
|
||||
|
||||
def quiesce_tenants(self):
|
||||
"""
|
||||
Wait for all tenants to enter a stable state (Active or Broken)
|
||||
|
||||
Call this after restarting the pageserver, or after attaching a tenant,
|
||||
to ensure that it is ready for use.
|
||||
"""
|
||||
|
||||
stable_states = {"Active", "Broken"}
|
||||
|
||||
client = self.http_client()
|
||||
|
||||
def complete():
|
||||
log.info("Checking tenants...")
|
||||
tenants = client.tenant_list()
|
||||
tenants = client.tenant_list()
|
||||
log.info(f"Tenant list: {tenants}...")
|
||||
any_unstable = any((t["state"]["slug"] not in stable_states) for t in tenants)
|
||||
if any_unstable:
|
||||
for t in tenants:
|
||||
log.info(f"Waiting for tenant {t['id']} in state {t['state']['slug']}")
|
||||
log.info(f"any_unstable={any_unstable}")
|
||||
assert not any_unstable
|
||||
|
||||
wait_until(20, 0.5, complete)
|
||||
|
||||
def __enter__(self) -> "NeonPageserver":
|
||||
return self
|
||||
|
||||
@@ -2794,6 +2758,20 @@ class Safekeeper:
|
||||
def data_dir(self) -> str:
|
||||
return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")
|
||||
|
||||
def timeline_dir(self, tenant_id, timeline_id) -> str:
|
||||
return os.path.join(self.data_dir(), str(tenant_id), str(timeline_id))
|
||||
|
||||
def list_segments(self, tenant_id, timeline_id) -> List[str]:
|
||||
"""
|
||||
Get list of segment names of the given timeline.
|
||||
"""
|
||||
tli_dir = self.timeline_dir(tenant_id, timeline_id)
|
||||
segments = []
|
||||
for _, _, filenames in os.walk(tli_dir):
|
||||
segments.extend([f for f in filenames if f != "safekeeper.control"])
|
||||
segments.sort()
|
||||
return segments
|
||||
|
||||
|
||||
@dataclass
|
||||
class SafekeeperTimelineStatus:
|
||||
|
||||
@@ -157,7 +157,7 @@ def wait_for_last_record_lsn(
|
||||
lsn: Lsn,
|
||||
) -> Lsn:
|
||||
"""waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
|
||||
for i in range(100):
|
||||
for i in range(1000000):
|
||||
current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
|
||||
if current_lsn >= lsn:
|
||||
return current_lsn
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
import pytest
|
||||
import os
|
||||
import shutil
|
||||
from contextlib import closing
|
||||
from fixtures.log_helper import log
|
||||
|
||||
from fixtures.compare_fixtures import NeonCompare, PgCompare
|
||||
from fixtures.pg_version import PgVersion
|
||||
@@ -18,6 +21,9 @@ from fixtures.pg_version import PgVersion
|
||||
def test_bulk_insert(neon_with_baseline: PgCompare):
|
||||
env = neon_with_baseline
|
||||
|
||||
# Number of times to run the write query. One run creates 350MB of wal.
|
||||
n_writes = 10
|
||||
|
||||
with closing(env.pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("create table huge (i int, j int);")
|
||||
@@ -25,7 +31,10 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
|
||||
# Run INSERT, recording the time and I/O it takes
|
||||
with env.record_pageserver_writes("pageserver_writes"):
|
||||
with env.record_duration("insert"):
|
||||
cur.execute("insert into huge values (generate_series(1, 5000000), 0);")
|
||||
for i in range(n_writes):
|
||||
if n_writes > 1:
|
||||
log.info(f"running query {i}/{n_writes}")
|
||||
cur.execute("insert into huge values (generate_series(1, 5000000), 0);")
|
||||
env.flush()
|
||||
|
||||
env.report_peak_memory_use()
|
||||
@@ -39,7 +48,9 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
|
||||
|
||||
|
||||
def measure_recovery_time(env: NeonCompare):
|
||||
client = env.env.pageserver.http_client()
|
||||
# Hmm why is pageserver less ready to respond to http when the datadir is large?
|
||||
from urllib3.util.retry import Retry
|
||||
client = env.env.pageserver.http_client(retries=Retry(1000))
|
||||
pg_version = PgVersion(client.timeline_detail(env.tenant, env.timeline)["pg_version"])
|
||||
|
||||
# Stop pageserver and remove tenant data
|
||||
@@ -57,3 +68,13 @@ def measure_recovery_time(env: NeonCompare):
|
||||
|
||||
# Flush, which will also wait for lsn to catch up
|
||||
env.flush()
|
||||
|
||||
|
||||
# This test is meant for local iteration only. The use case is when you want to re-run
|
||||
# the measure_recovery_time part of test_bulk_insert, but without running the setup.
|
||||
# It allows you to iterate on results 2x faster while trying to improve wal ingestion
|
||||
# performance.
|
||||
@pytest.mark.skip("this is a convenience test for local dev only")
|
||||
def test_recovery(neon_env_builder):
|
||||
env = neon_env_builder.init_start()
|
||||
measure_recovery_time(env)
|
||||
|
||||
592
test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
generated
592
test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -9,8 +9,8 @@ publish = false
|
||||
[dependencies]
|
||||
native-tls = "0.2.11"
|
||||
postgres-native-tls = "0.5.0"
|
||||
tokio = { version = "1.28", features=["rt", "macros"] }
|
||||
tokio-postgres = "0.7.8"
|
||||
tokio = { version = "1.33", features=["rt", "macros"] }
|
||||
tokio-postgres = "0.7.10"
|
||||
|
||||
|
||||
# This is not part of the main 'neon' workspace
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM rust:1.70
|
||||
FROM rust:1.73
|
||||
WORKDIR /source
|
||||
|
||||
COPY . .
|
||||
|
||||
@@ -333,30 +333,16 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
|
||||
".*Failed to load index_part from remote storage.*",
|
||||
# On a fast restart, there may be an initdb still running in a basebackup...__temp directory
|
||||
".*Failed to purge.*Directory not empty.*",
|
||||
]
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
|
||||
)
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
# pause all uploads
|
||||
ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
|
||||
ps_http.tenant_create(env.initial_tenant)
|
||||
|
||||
# Create a timeline whose creation will succeed. The tenant will need at least one
|
||||
# timeline to be loadable.
|
||||
success_timeline = TimelineId.generate()
|
||||
log.info(f"Creating timeline {success_timeline}")
|
||||
ps_http.timeline_create(env.pg_version, env.initial_tenant, success_timeline, timeout=60)
|
||||
|
||||
# Create a timeline whose upload to remote storage will be blocked
|
||||
ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
|
||||
|
||||
def start_creating_timeline():
|
||||
log.info(f"Creating (expect failure) timeline {env.initial_timeline}")
|
||||
with pytest.raises(RequestException):
|
||||
ps_http.timeline_create(
|
||||
env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
|
||||
@@ -380,9 +366,6 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N
|
||||
with pytest.raises(PageserverApiException, match="not found"):
|
||||
ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
|
||||
|
||||
# The one successfully created timeline should still be there.
|
||||
assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1
|
||||
|
||||
|
||||
def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
|
||||
@@ -15,7 +15,7 @@ from fixtures.types import TenantId, TimelineId
|
||||
|
||||
# Test restarting page server, while safekeeper and compute node keep
|
||||
# running.
|
||||
def test_local_corruption(neon_env_builder: NeonEnvBuilder):
|
||||
def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
@@ -69,19 +69,24 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
env.pageserver.start()
|
||||
|
||||
# Un-damaged tenant works
|
||||
# Tenant 0 should still work
|
||||
pg0.start()
|
||||
assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100
|
||||
|
||||
# Tenant with corrupt local metadata works: remote storage is authoritative for metadata
|
||||
pg1.start()
|
||||
assert pg1.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100
|
||||
# But all others are broken
|
||||
|
||||
# First timeline would not get loaded into pageserver due to corrupt metadata file
|
||||
with pytest.raises(
|
||||
Exception, match=f"Tenant {tenant1} will not become active. Current state: Broken"
|
||||
) as err:
|
||||
pg1.start()
|
||||
log.info(
|
||||
f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
|
||||
)
|
||||
|
||||
# Second timeline will fail during basebackup, because the local layer file is corrupt.
|
||||
# It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
|
||||
# (We don't check layer file contents on startup, when loading the timeline)
|
||||
#
|
||||
# This will change when we implement checksums for layers
|
||||
with pytest.raises(Exception, match="Failed to load delta layer") as err:
|
||||
pg2.start()
|
||||
log.info(
|
||||
@@ -128,7 +133,8 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
|
||||
_ = env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id)
|
||||
|
||||
# Restart the page server
|
||||
env.pageserver.restart(immediate=True)
|
||||
env.pageserver.stop(immediate=True)
|
||||
env.pageserver.start()
|
||||
|
||||
# Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
|
||||
new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
|
||||
|
||||
@@ -157,6 +157,8 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant()
|
||||
endpoint = env.endpoints.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id)
|
||||
|
||||
# insert something to force sk -> ps message
|
||||
endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
|
||||
# Wait to make sure that we get a latest WAL receiver data.
|
||||
# We need to wait here because it's possible that we don't have access to
|
||||
# the latest WAL yet, when the `timeline_detail` API is first called.
|
||||
@@ -168,7 +170,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
|
||||
)
|
||||
|
||||
# Make a DB modification then expect getting a new WAL receiver's data.
|
||||
endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
|
||||
endpoint.safe_psql("INSERT INTO t VALUES (1, 'hey')")
|
||||
wait_until(
|
||||
number_of_iterations=5,
|
||||
interval=1,
|
||||
|
||||
@@ -62,14 +62,14 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
|
||||
tenant_load_delay_ms = 5000
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start(
|
||||
extra_env_vars={"FAILPOINTS": f"before-attaching-tenant=return({tenant_load_delay_ms})"}
|
||||
extra_env_vars={"FAILPOINTS": f"before-loading-tenant=return({tenant_load_delay_ms})"}
|
||||
)
|
||||
|
||||
# Check that it's in Attaching state
|
||||
# Check that it's in Loading state
|
||||
client = env.pageserver.http_client()
|
||||
tenant_status = client.tenant_status(env.initial_tenant)
|
||||
log.info("Tenant status : %s", tenant_status)
|
||||
assert tenant_status["state"]["slug"] == "Attaching"
|
||||
assert tenant_status["state"]["slug"] == "Loading"
|
||||
|
||||
# Try to read. This waits until the loading finishes, and then return normally.
|
||||
cur.execute("SELECT count(*) FROM foo")
|
||||
|
||||
@@ -241,7 +241,8 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
assert reason.endswith(f"failpoint: {failpoint}"), reason
|
||||
|
||||
if check is Check.RETRY_WITH_RESTART:
|
||||
env.pageserver.restart()
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
if failpoint in (
|
||||
"tenant-delete-before-shutdown",
|
||||
|
||||
@@ -66,6 +66,10 @@ def test_tenant_reattach(
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
|
||||
)
|
||||
# That's because of UnreliableWrapper's injected failures
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*"
|
||||
)
|
||||
|
||||
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
|
||||
with endpoint.cursor() as cur:
|
||||
@@ -112,7 +116,7 @@ def test_tenant_reattach(
|
||||
assert query_scalar(cur, "SELECT count(*) FROM t") == 100000
|
||||
|
||||
# Check that we had to retry the downloads
|
||||
assert env.pageserver.log_contains(".*list timelines.*failed, will retry.*")
|
||||
assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*")
|
||||
assert env.pageserver.log_contains(".*download.*failed, will retry.*")
|
||||
|
||||
|
||||
@@ -639,6 +643,47 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder
|
||||
ensure_test_data(data_id, data_secret, endpoint)
|
||||
|
||||
|
||||
# Tests that it's possible to `load` broken tenants:
|
||||
# * `ignore` a tenant
|
||||
# * removes its `metadata` file locally
|
||||
# * `load` the same tenant
|
||||
# * ensure that its status is `Broken`
|
||||
def test_ignored_tenant_stays_broken_without_metadata(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
env.endpoints.create_start("main")
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
# Attempts to connect from compute to pageserver while the tenant is
|
||||
# temporarily detached produces these errors in the pageserver log.
|
||||
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*Tenant {tenant_id} will not become active\\. Current state: (Broken|Stopping).*"
|
||||
)
|
||||
|
||||
# ignore the tenant and remove its metadata
|
||||
pageserver_http.tenant_ignore(tenant_id)
|
||||
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
|
||||
metadata_removed = False
|
||||
for dir_entry in timeline_dir.iterdir():
|
||||
if dir_entry.name == "metadata":
|
||||
# This is the timeline's metadata file. Remove it
|
||||
dir_entry.unlink()
|
||||
metadata_removed = True
|
||||
assert metadata_removed, f"Failed to find metadata file in {timeline_dir}"
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*{tenant_id}.*: load failed.*: failed to load metadata.*"
|
||||
)
|
||||
|
||||
# now, load it from the local files and expect it to be broken due to inability to load tenant files into memory
|
||||
pageserver_http.tenant_load(tenant_id=tenant_id)
|
||||
wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 5)
|
||||
|
||||
|
||||
# Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally
|
||||
# Similarly, tests that it's not possible to schedule a `load` for a tenant that's not ignored.
|
||||
def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
|
||||
@@ -733,8 +778,7 @@ def test_ignore_while_attaching(
|
||||
tenants_before_ignore
|
||||
), "Only ignored tenant should be missing"
|
||||
|
||||
# Calling load will bring the tenant back online
|
||||
pageserver_http.configure_failpoints([("attach-before-activate", "off")])
|
||||
# But can load it from local files, that will restore attach.
|
||||
pageserver_http.tenant_load(tenant_id)
|
||||
|
||||
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import os
|
||||
import shutil
|
||||
import time
|
||||
from contextlib import closing
|
||||
from datetime import datetime
|
||||
@@ -19,7 +20,7 @@ from fixtures.neon_fixtures import (
|
||||
)
|
||||
from fixtures.pageserver.utils import timeline_delete_wait_completed
|
||||
from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
|
||||
from fixtures.types import Lsn, TenantId
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import wait_until
|
||||
from prometheus_client.samples import Sample
|
||||
|
||||
@@ -297,8 +298,13 @@ def test_pageserver_with_empty_tenants(
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
tenant_with_empty_timelines = env.initial_tenant
|
||||
timeline_delete_wait_completed(client, tenant_with_empty_timelines, env.initial_timeline)
|
||||
tenant_with_empty_timelines = TenantId.generate()
|
||||
client.tenant_create(tenant_with_empty_timelines)
|
||||
temp_timelines = client.timeline_list(tenant_with_empty_timelines)
|
||||
for temp_timeline in temp_timelines:
|
||||
timeline_delete_wait_completed(
|
||||
client, tenant_with_empty_timelines, TimelineId(temp_timeline["timeline_id"])
|
||||
)
|
||||
|
||||
files_in_timelines_dir = sum(
|
||||
1 for _p in Path.iterdir(env.pageserver.timeline_dir(tenant_with_empty_timelines))
|
||||
@@ -311,19 +317,34 @@ def test_pageserver_with_empty_tenants(
|
||||
env.endpoints.stop_all()
|
||||
env.pageserver.stop()
|
||||
|
||||
tenant_without_timelines_dir = env.initial_tenant
|
||||
shutil.rmtree(env.pageserver.timeline_dir(tenant_without_timelines_dir))
|
||||
|
||||
env.pageserver.start()
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
def not_attaching():
|
||||
def not_loading():
|
||||
tenants = client.tenant_list()
|
||||
assert len(tenants) == 1
|
||||
assert all(t["state"]["slug"] != "Attaching" for t in tenants)
|
||||
assert len(tenants) == 2
|
||||
assert all(t["state"]["slug"] != "Loading" for t in tenants)
|
||||
|
||||
wait_until(10, 0.2, not_attaching)
|
||||
wait_until(10, 0.2, not_loading)
|
||||
|
||||
tenants = client.tenant_list()
|
||||
|
||||
[broken_tenant] = [t for t in tenants if t["id"] == str(tenant_without_timelines_dir)]
|
||||
assert (
|
||||
broken_tenant["state"]["slug"] == "Broken"
|
||||
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
|
||||
|
||||
broken_tenant_status = client.tenant_status(tenant_without_timelines_dir)
|
||||
assert (
|
||||
broken_tenant_status["state"]["slug"] == "Broken"
|
||||
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
|
||||
|
||||
assert env.pageserver.log_contains(".*load failed, setting tenant state to Broken:.*")
|
||||
|
||||
[loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines)]
|
||||
assert (
|
||||
loaded_tenant["state"]["slug"] == "Active"
|
||||
@@ -337,6 +358,9 @@ def test_pageserver_with_empty_tenants(
|
||||
time.sleep(1) # to allow metrics propagation
|
||||
|
||||
ps_metrics = client.get_metrics()
|
||||
broken_tenants_metric_filter = {
|
||||
"tenant_id": str(tenant_without_timelines_dir),
|
||||
}
|
||||
active_tenants_metric_filter = {
|
||||
"state": "Active",
|
||||
}
|
||||
@@ -350,3 +374,13 @@ def test_pageserver_with_empty_tenants(
|
||||
assert (
|
||||
tenant_active_count == 1
|
||||
), f"Tenant {tenant_with_empty_timelines} should have metric as active"
|
||||
|
||||
tenant_broken_count = int(
|
||||
ps_metrics.query_one(
|
||||
"pageserver_broken_tenants_count", filter=broken_tenants_metric_filter
|
||||
).value
|
||||
)
|
||||
|
||||
assert (
|
||||
tenant_broken_count == 1
|
||||
), f"Tenant {tenant_without_timelines_dir} should have metric as broken"
|
||||
|
||||
@@ -70,7 +70,8 @@ def test_threshold_based_eviction(
|
||||
}
|
||||
|
||||
# restart because changing tenant config is not instant
|
||||
env.pageserver.restart()
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == {
|
||||
"kind": "LayerAccessThreshold",
|
||||
|
||||
@@ -277,6 +277,13 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
|
||||
if failpoint == "timeline-delete-after-index-delete":
|
||||
m = ps_http.get_metrics()
|
||||
assert (
|
||||
m.query_one(
|
||||
"remote_storage_s3_request_seconds_count",
|
||||
filter={"request_type": "get_object", "result": "err"},
|
||||
).value
|
||||
== 2 # One is missing tenant deletion mark, second is missing index part
|
||||
)
|
||||
assert (
|
||||
m.query_one(
|
||||
"remote_storage_s3_request_seconds_count",
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import filecmp
|
||||
import os
|
||||
import pathlib
|
||||
import random
|
||||
@@ -400,8 +401,11 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
def wait(f, desc, timeout=30, wait_f=None):
|
||||
started_at = time.time()
|
||||
while True:
|
||||
if f():
|
||||
break
|
||||
try:
|
||||
if f():
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
elapsed = time.time() - started_at
|
||||
if elapsed > timeout:
|
||||
raise RuntimeError(f"timed out waiting {elapsed:.0f}s for {desc}")
|
||||
@@ -984,6 +988,141 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
|
||||
endpoint.start()
|
||||
|
||||
|
||||
# Test that we can create timeline with one safekeeper down and initialize it
|
||||
# later when some data already had been written.
|
||||
def test_late_init(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
sk1 = env.safekeepers[0]
|
||||
sk1.stop()
|
||||
|
||||
# create and insert something while safekeeper is down...
|
||||
env.neon_cli.create_branch("test_late_init")
|
||||
endpoint = env.endpoints.create_start("test_late_init")
|
||||
endpoint.safe_psql("create table t(key int, value text)")
|
||||
endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
|
||||
log.info("insert with safekeeper down done")
|
||||
endpoint.stop() # stop compute
|
||||
|
||||
# stop another safekeeper, and start one which missed timeline creation
|
||||
sk2 = env.safekeepers[1]
|
||||
sk2.stop()
|
||||
sk1.start()
|
||||
|
||||
# insert some more
|
||||
endpoint = env.endpoints.create_start("test_late_init")
|
||||
endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'")
|
||||
|
||||
|
||||
# is timeline flush_lsn equal on provided safekeepers?
|
||||
def is_flush_lsn_aligned(sk1_http_cli, sk2_http_cli, tenant_id, timeline_id):
|
||||
status1 = sk1_http_cli.timeline_status(tenant_id, timeline_id)
|
||||
status2 = sk2_http_cli.timeline_status(tenant_id, timeline_id)
|
||||
log.info(
|
||||
f"waiting for flush_lsn alignment, sk1.flush_lsn={status1.flush_lsn}, sk2.flush_lsn={status2.flush_lsn}"
|
||||
)
|
||||
return status1.flush_lsn == status2.flush_lsn
|
||||
|
||||
|
||||
# Test behaviour with one safekeeper down and missing a lot of WAL. Namely, that
|
||||
# 1) walproposer can't recover node if it misses WAL written by previous computes, but
|
||||
# still starts up and functions normally if two other sks are ok.
|
||||
# 2) walproposer doesn't keep WAL after some threshold (pg_wal bloat is limited), but functions
|
||||
# normally if two other sks are ok.
|
||||
# 3) Lagged safekeeper can still recover by peer recovery.
|
||||
def test_one_sk_down(neon_env_builder: NeonEnvBuilder):
|
||||
pass
|
||||
|
||||
|
||||
# Smaller version of test_one_sk_down testing peer recovery in isolation: that
|
||||
# it works without compute at all.
|
||||
def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
    """Check that a lagging safekeeper catches up through peer recovery.

    Scenario:
      1. Start three safekeepers and wait until the timeline exists on sk1
         (its flush_lsn matches sk2).
      2. Stop sk1, insert enough data to leave it about a segment behind,
         then stop the compute.
      3. Restart sk1 with ``--peer-recovery=false`` and check that it lags,
         and keeps lagging after a short wait.
      4. Restart sk1 with ``--peer-recovery=true``, wait for flush_lsn to
         align with sk2 and check that the listed segment files are identical.
      5. Stop the third safekeeper and check that an insert still succeeds.
    """
    # Local import so this block does not depend on a module-level `import time`.
    import time

    # Lag asserted while peer recovery is disabled; the inserts below are
    # sized to roughly fill one segment.
    min_expected_lag = 16 * 1024 * 1024

    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    tenant_id = env.initial_tenant
    timeline_id = env.neon_cli.create_branch("test_peer_recovery")
    endpoint = env.endpoints.create_start("test_peer_recovery")

    endpoint.safe_psql("create table t(key int, value text)")
    sk1 = env.safekeepers[0]
    sk2 = env.safekeepers[1]
    sk1_http_cli = sk1.http_client()
    sk2_http_cli = sk2.http_client()

    def assert_sk1_lags(stage):
        # Log both flush_lsns and require sk1 to be behind sk2 by at least
        # `min_expected_lag` bytes.
        sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id)
        sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id)
        log.info(
            f"flush_lsns {stage}: sk1={sk1_tli_status.flush_lsn}, sk2={sk2_tli_status.flush_lsn}"
        )
        assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= min_expected_lag

    # ensure tli gets created on sk1, peer recovery won't do that
    wait(
        partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id),
        "flush_lsn to get aligned",
    )

    sk1.stop()

    # roughly fills one segment
    endpoint.safe_psql("insert into t select generate_series(1,250000), 'payload'")

    endpoint.stop()  # stop compute

    # now start safekeeper, but with peer recovery disabled; it should lag for about a segment
    sk1.start(extra_opts=["--peer-recovery=false"])
    assert_sk1_lags("after insertion")

    # wait a bit, lsns shouldn't change: without an actual pause the second
    # check would only repeat the first one.
    time.sleep(5)
    assert_sk1_lags("after waiting")

    # now restart safekeeper with peer recovery enabled and wait for recovery
    sk1.stop().start(extra_opts=["--peer-recovery=true"])
    wait(
        partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id),
        "flush_lsn to get aligned",
    )

    # check that WALs are identical after recovery
    segs = sk1.list_segments(tenant_id, timeline_id)
    log.info(f"segs are {segs}")

    sk1_dir = sk1.timeline_dir(tenant_id, timeline_id)
    sk2_dir = sk2.timeline_dir(tenant_id, timeline_id)
    (_, mismatch, not_regular) = filecmp.cmpfiles(
        sk1_dir,
        sk2_dir,
        segs,
        shallow=False,
    )
    log.info(
        f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
    )

    # For every differing segment leave `<file>.hex` dumps next to both copies
    # and a `<sk2 file>.filediff` with their diff, to help debugging.
    for f in mismatch:
        f1 = os.path.join(sk1_dir, f)
        f2 = os.path.join(sk2_dir, f)
        try:
            # Argument lists instead of shell strings: paths are passed
            # verbatim, so spaces or shell metacharacters cannot break them.
            for path in (f1, f2):
                with open(f"{path}.hex", "w") as hex_f:
                    subprocess.run(["xxd", path], stdout=hex_f)
            with open(f"{f2}.filediff", "w") as diff_f:
                # diff exits non-zero when files differ, so the return code
                # is deliberately not checked.
                subprocess.run(["diff", f"{f1}.hex", f"{f2}.hex"], stdout=diff_f)
        except OSError as e:
            # The dump is only a debugging aid; the assertion below is what
            # reports the mismatch, so do not let a missing tool mask it.
            log.info(f"could not produce hex diff for {f}: {e}")

    assert (mismatch, not_regular) == ([], [])

    # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit
    env.safekeepers[2].stop()
    endpoint = env.endpoints.create_start("test_peer_recovery")
    endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'")
|
||||
|
||||
|
||||
class SafekeeperEnv:
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
2
vendor/postgres-v14
vendored
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: ebcca9e9eb...6669a672ee
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: 23f2d41102...ab67ab9635
2
vendor/postgres-v16
vendored
2
vendor/postgres-v16
vendored
Submodule vendor/postgres-v16 updated: e5e255d2da...550ffa6495
6
vendor/revisions.json
vendored
6
vendor/revisions.json
vendored
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"postgres-v16": "e5e255d2da05bc5f884b871c042014030a114a9b",
|
||||
"postgres-v15": "23f2d411020a739375b32895ce1362ded2962084",
|
||||
"postgres-v14": "ebcca9e9eb49621b5b17247833b59e836337e8aa"
|
||||
"postgres-v16": "550ffa6495a5dc62fccc3a8b449386633758680b",
|
||||
"postgres-v15": "ab67ab96355d61e9d0218630be4aa7db53bf83e7",
|
||||
"postgres-v14": "6669a672ee14ab2c09d44c4552f9a13fad3afc10"
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user