mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-03 13:30:38 +00:00
Compare commits
11 Commits
erik/segme
...
erik/batch
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4308ffe5c0 | ||
|
|
d5435b1a81 | ||
|
|
080d585b22 | ||
|
|
7595d3afe6 | ||
|
|
1ff5333a1b | ||
|
|
d8f5d43549 | ||
|
|
2256a5727a | ||
|
|
3f80af8b1d | ||
|
|
a61d81bbc7 | ||
|
|
05381a48f0 | ||
|
|
cef165818c |
7
Cargo.lock
generated
7
Cargo.lock
generated
@@ -994,9 +994,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
version = "1.5.0"
|
||||
version = "1.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223"
|
||||
checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
@@ -1229,12 +1229,15 @@ dependencies = [
|
||||
"flate2",
|
||||
"futures",
|
||||
"hyper 0.14.30",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
"notify",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"opentelemetry_sdk",
|
||||
"postgres",
|
||||
"prometheus",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest 0.12.4",
|
||||
|
||||
@@ -1 +1 @@
|
||||
SELECT neon.backpressure_throttling_time()::float8 / 1000 AS throttled;
|
||||
SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;
|
||||
|
||||
@@ -18,9 +18,11 @@ clap.workspace = true
|
||||
flate2.workspace = true
|
||||
futures.workspace = true
|
||||
hyper0 = { workspace = true, features = ["full"] }
|
||||
metrics.workspace = true
|
||||
nix.workspace = true
|
||||
notify.workspace = true
|
||||
num_cpus.workspace = true
|
||||
once_cell.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
opentelemetry_sdk.workspace = true
|
||||
postgres.workspace = true
|
||||
@@ -39,6 +41,7 @@ tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
thiserror.workspace = true
|
||||
url.workspace = true
|
||||
prometheus.workspace = true
|
||||
|
||||
compute_api.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -9,6 +9,7 @@ use crate::catalog::SchemaDumpError;
|
||||
use crate::catalog::{get_database_schema, get_dbs_and_roles};
|
||||
use crate::compute::forward_termination_signal;
|
||||
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use crate::installed_extensions;
|
||||
use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
|
||||
use compute_api::responses::{
|
||||
ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
|
||||
@@ -19,6 +20,8 @@ use anyhow::Result;
|
||||
use hyper::header::CONTENT_TYPE;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use metrics::Encoder;
|
||||
use metrics::TextEncoder;
|
||||
use tokio::task;
|
||||
use tracing::{debug, error, info, warn};
|
||||
use tracing_utils::http::OtelName;
|
||||
@@ -65,6 +68,28 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
|
||||
}
|
||||
|
||||
// Prometheus metrics
|
||||
(&Method::GET, "/metrics") => {
|
||||
debug!("serving /metrics GET request");
|
||||
|
||||
let mut buffer = vec![];
|
||||
let metrics = installed_extensions::collect();
|
||||
let encoder = TextEncoder::new();
|
||||
encoder.encode(&metrics, &mut buffer).unwrap();
|
||||
|
||||
match Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(CONTENT_TYPE, encoder.format_type())
|
||||
.body(Body::from(buffer))
|
||||
{
|
||||
Ok(response) => response,
|
||||
Err(err) => {
|
||||
let msg = format!("error handling /metrics request: {err}");
|
||||
error!(msg);
|
||||
render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
// Collect Postgres current usage insights
|
||||
(&Method::GET, "/insights") => {
|
||||
info!("serving /insights GET request");
|
||||
|
||||
@@ -37,6 +37,21 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeMetrics"
|
||||
|
||||
/metrics
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
summary: Get compute node metrics in text format.
|
||||
description: ""
|
||||
operationId: getComputeMetrics
|
||||
responses:
|
||||
200:
|
||||
description: ComputeMetrics
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Metrics in text format.
|
||||
/insights:
|
||||
get:
|
||||
tags:
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use compute_api::responses::{InstalledExtension, InstalledExtensions};
|
||||
use metrics::proto::MetricFamily;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use tracing::info;
|
||||
@@ -8,6 +9,10 @@ use anyhow::Result;
|
||||
use postgres::{Client, NoTls};
|
||||
use tokio::task;
|
||||
|
||||
use metrics::core::Collector;
|
||||
use metrics::{register_uint_gauge_vec, UIntGaugeVec};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
/// We don't reuse get_existing_dbs() just for code clarity
|
||||
/// and to make database listing query here more explicit.
|
||||
///
|
||||
@@ -59,6 +64,12 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension
|
||||
|
||||
for (extname, v) in extensions.iter() {
|
||||
let version = v.to_string();
|
||||
|
||||
// increment the number of databases where the version of extension is installed
|
||||
INSTALLED_EXTENSIONS
|
||||
.with_label_values(&[extname, &version])
|
||||
.inc();
|
||||
|
||||
extensions_map
|
||||
.entry(extname.to_string())
|
||||
.and_modify(|e| {
|
||||
@@ -74,9 +85,11 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension
|
||||
}
|
||||
}
|
||||
|
||||
Ok(InstalledExtensions {
|
||||
let res = InstalledExtensions {
|
||||
extensions: extensions_map.values().cloned().collect(),
|
||||
})
|
||||
};
|
||||
|
||||
Ok(res)
|
||||
})
|
||||
.await?
|
||||
}
|
||||
@@ -97,6 +110,18 @@ pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
|
||||
"[NEON_EXT_STAT] {}",
|
||||
serde_json::to_string(&result).expect("failed to serialize extensions list")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"installed_extensions",
|
||||
"Number of databases where the version of extension is installed",
|
||||
&["extension_name", "version"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub fn collect() -> Vec<MetricFamily> {
|
||||
INSTALLED_EXTENSIONS.collect()
|
||||
}
|
||||
|
||||
@@ -80,18 +80,18 @@ impl NeonWalRecord {
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn wal_clear() -> Self {
|
||||
pub fn wal_clear(s: impl AsRef<str>) -> Self {
|
||||
Self::Test {
|
||||
append: "".to_string(),
|
||||
append: s.as_ref().to_string(),
|
||||
clear: true,
|
||||
will_init: false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn wal_init() -> Self {
|
||||
pub fn wal_init(s: impl AsRef<str>) -> Self {
|
||||
Self::Test {
|
||||
append: "".to_string(),
|
||||
append: s.as_ref().to_string(),
|
||||
clear: true,
|
||||
will_init: true,
|
||||
}
|
||||
|
||||
@@ -123,15 +123,27 @@ pub async fn fsync_async_opt(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Like postgres' durable_rename, renames file issuing fsyncs do make it
|
||||
/// durable. After return, file and rename are guaranteed to be persisted.
|
||||
/// Like postgres' durable_rename, renames a file and issues fsyncs to make it durable. After
|
||||
/// returning, both the file and rename are guaranteed to be persisted. Both paths must be on the
|
||||
/// same file system.
|
||||
///
|
||||
/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make
|
||||
/// contents durable; 2) its directory entry to make rename durable 3) again to
|
||||
/// already renamed file, which is not required by standards but postgres does
|
||||
/// it, let's stick to that. Postgres additionally fsyncs newpath *before*
|
||||
/// rename if it exists to ensure that at least one of the files survives, but
|
||||
/// current callers don't need that.
|
||||
/// Unlike postgres, it only fsyncs 1) the file to make contents durable, and 2) the directory to
|
||||
/// make the rename durable. This sequence ensures the target file will never be incomplete.
|
||||
///
|
||||
/// Postgres also:
|
||||
///
|
||||
/// * Fsyncs the target file, if it exists, before the rename, to ensure either the new or existing
|
||||
/// file survives a crash. Current callers don't need this as it should already be fsynced if
|
||||
/// durability is needed.
|
||||
///
|
||||
/// * Fsyncs the file after the rename. This can be required with certain OSes or file systems (e.g.
|
||||
/// NFS), but not on Linux with most common file systems like ext4 (which we currently use).
|
||||
///
|
||||
/// An audit of 8 other databases found that none fsynced the file after a rename:
|
||||
/// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837180535>
|
||||
///
|
||||
/// eBPF probes confirmed that this is sufficient with ext4, XFS, and ZFS, but possibly not Btrfs:
|
||||
/// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837926218>
|
||||
///
|
||||
/// virtual_file.rs has similar code, but it doesn't use vfs.
|
||||
///
|
||||
@@ -149,9 +161,6 @@ pub async fn durable_rename(
|
||||
// Time to do the real deal.
|
||||
tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;
|
||||
|
||||
// Postgres'ish fsync of renamed file.
|
||||
fsync_async_opt(new_path.as_ref(), do_fsync).await?;
|
||||
|
||||
// Now fsync the parent
|
||||
let parent = match new_path.as_ref().parent() {
|
||||
Some(p) => p,
|
||||
|
||||
@@ -7757,13 +7757,13 @@ mod tests {
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_clear()),
|
||||
Value::WalRecord(NeonWalRecord::wal_clear("c")),
|
||||
),
|
||||
(get_key(4), Lsn(0x10), Value::Image("0x10".into())),
|
||||
(
|
||||
get_key(4),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_init()),
|
||||
Value::WalRecord(NeonWalRecord::wal_init("i")),
|
||||
),
|
||||
];
|
||||
let image1 = vec![(get_key(1), "0x10".into())];
|
||||
@@ -7912,8 +7912,30 @@ mod tests {
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
|
||||
async fn test_simple_bottom_most_compaction_deltas_1() -> anyhow::Result<()> {
|
||||
test_simple_bottom_most_compaction_deltas_helper(
|
||||
"test_simple_bottom_most_compaction_deltas_1",
|
||||
false,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_deltas_2() -> anyhow::Result<()> {
|
||||
test_simple_bottom_most_compaction_deltas_helper(
|
||||
"test_simple_bottom_most_compaction_deltas_2",
|
||||
true,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
async fn test_simple_bottom_most_compaction_deltas_helper(
|
||||
test_name: &'static str,
|
||||
use_delta_bottom_layer: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create(test_name).await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
@@ -7944,6 +7966,16 @@ mod tests {
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
// or, delta layer at 0x10 if `use_delta_bottom_layer` is true
|
||||
let delta4 = (0..10)
|
||||
.map(|id| {
|
||||
(
|
||||
get_key(id),
|
||||
Lsn(0x08),
|
||||
Value::WalRecord(NeonWalRecord::wal_init(format!("value {id}@0x10"))),
|
||||
)
|
||||
})
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
@@ -7997,21 +8029,61 @@ mod tests {
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
|
||||
], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
let tline = if use_delta_bottom_layer {
|
||||
tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x08),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x08)..Lsn(0x10),
|
||||
delta4,
|
||||
),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x20)..Lsn(0x48),
|
||||
delta1,
|
||||
),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x20)..Lsn(0x48),
|
||||
delta2,
|
||||
),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x48)..Lsn(0x50),
|
||||
delta3,
|
||||
),
|
||||
], // delta layers
|
||||
vec![], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?
|
||||
} else {
|
||||
tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x10)..Lsn(0x48),
|
||||
delta1,
|
||||
),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x10)..Lsn(0x48),
|
||||
delta2,
|
||||
),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x48)..Lsn(0x50),
|
||||
delta3,
|
||||
),
|
||||
], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?
|
||||
};
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
@@ -8121,7 +8193,7 @@ mod tests {
|
||||
(
|
||||
key,
|
||||
Lsn(0x10),
|
||||
Value::Image(Bytes::copy_from_slice(b"0x10")),
|
||||
Value::WalRecord(NeonWalRecord::wal_init("0x10")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
@@ -8183,7 +8255,7 @@ mod tests {
|
||||
Lsn(0x20),
|
||||
KeyLogAtLsn(vec![(
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::copy_from_slice(b"0x10;0x20")),
|
||||
Value::Image(Bytes::from_static(b"0x10;0x20")),
|
||||
)]),
|
||||
),
|
||||
(
|
||||
@@ -9165,7 +9237,7 @@ mod tests {
|
||||
|
||||
let will_init = will_init_keys.contains(&i);
|
||||
if will_init {
|
||||
delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
|
||||
delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));
|
||||
|
||||
expected_key_values.insert(key, "".to_string());
|
||||
} else {
|
||||
|
||||
@@ -562,7 +562,7 @@ mod tests {
|
||||
(
|
||||
get_key(0),
|
||||
Lsn(0x10),
|
||||
Value::WalRecord(NeonWalRecord::wal_init()),
|
||||
Value::WalRecord(NeonWalRecord::wal_init("")),
|
||||
),
|
||||
(
|
||||
get_key(0),
|
||||
@@ -572,7 +572,7 @@ mod tests {
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x10),
|
||||
Value::WalRecord(NeonWalRecord::wal_init()),
|
||||
Value::WalRecord(NeonWalRecord::wal_init("")),
|
||||
),
|
||||
(
|
||||
get_key(5),
|
||||
|
||||
@@ -253,6 +253,10 @@ pub(crate) fn apply_in_neon(
|
||||
use bytes::BufMut;
|
||||
if *will_init {
|
||||
assert!(*clear, "init record must be clear to ensure correctness");
|
||||
assert!(
|
||||
page.is_empty(),
|
||||
"init record must be the first entry to ensure correctness"
|
||||
);
|
||||
}
|
||||
if *clear {
|
||||
page.clear();
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#include <dirent.h>
|
||||
#include <limits.h>
|
||||
#include <string.h>
|
||||
#include <dirent.h>
|
||||
#include <signal.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
@@ -21,17 +22,35 @@
|
||||
|
||||
static int logical_replication_max_snap_files = 300;
|
||||
|
||||
/*
|
||||
* According to Chi (shyzh), the pageserver _should_ be good with 10 MB worth of
|
||||
* snapshot files. Let's use 8 MB since 8 is a power of 2.
|
||||
*/
|
||||
static int logical_replication_max_logicalsnapdir_size = 8000;
|
||||
|
||||
/*
|
||||
* A primitive description of a logical snapshot file including the LSN of the
|
||||
* file and its size.
|
||||
*/
|
||||
typedef struct SnapDesc {
|
||||
XLogRecPtr lsn;
|
||||
off_t sz;
|
||||
} SnapDesc;
|
||||
|
||||
PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
|
||||
|
||||
/*
|
||||
* Sorts an array of snapshot descriptors by their LSN.
|
||||
*/
|
||||
static int
|
||||
LsnDescComparator(const void *a, const void *b)
|
||||
SnapDescComparator(const void *a, const void *b)
|
||||
{
|
||||
XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
|
||||
XLogRecPtr lsn2 = *((const XLogRecPtr *) b);
|
||||
const SnapDesc *desc1 = a;
|
||||
const SnapDesc *desc2 = b;
|
||||
|
||||
if (lsn1 < lsn2)
|
||||
if (desc1->lsn < desc2->lsn)
|
||||
return 1;
|
||||
else if (lsn1 == lsn2)
|
||||
else if (desc1->lsn == desc2->lsn)
|
||||
return 0;
|
||||
else
|
||||
return -1;
|
||||
@@ -43,28 +62,39 @@ LsnDescComparator(const void *a, const void *b)
|
||||
* slots having lower restart_lsn should be dropped.
|
||||
*/
|
||||
static XLogRecPtr
|
||||
get_num_snap_files_lsn_threshold(void)
|
||||
get_snapshots_cutoff_lsn(void)
|
||||
{
|
||||
DIR *dirdesc;
|
||||
struct dirent *de;
|
||||
char *snap_path = "pg_logical/snapshots/";
|
||||
int lsns_allocated = 1024;
|
||||
int lsns_num = 0;
|
||||
XLogRecPtr *lsns;
|
||||
XLogRecPtr cutoff;
|
||||
/* PG 18 has a constant defined for this, PG_LOGICAL_SNAPSHOTS_DIR */
|
||||
#define SNAPDIR "pg_logical/snapshots"
|
||||
|
||||
if (logical_replication_max_snap_files < 0)
|
||||
DIR *dirdesc;
|
||||
int dirdesc_fd;
|
||||
struct dirent *de;
|
||||
size_t snapshot_index = 0;
|
||||
SnapDesc *snapshot_descriptors;
|
||||
size_t descriptors_allocated = 1024;
|
||||
XLogRecPtr cutoff = 0;
|
||||
off_t logicalsnapdir_size = 0;
|
||||
const int logical_replication_max_logicalsnapdir_size_bytes = logical_replication_max_logicalsnapdir_size * 1000;
|
||||
|
||||
if (logical_replication_max_snap_files < 0 && logical_replication_max_logicalsnapdir_size < 0)
|
||||
return 0;
|
||||
|
||||
lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
|
||||
snapshot_descriptors = palloc(sizeof(*snapshot_descriptors) * descriptors_allocated);
|
||||
|
||||
dirdesc = AllocateDir(SNAPDIR);
|
||||
dirdesc_fd = dirfd(dirdesc);
|
||||
if (dirdesc_fd == -1)
|
||||
ereport(ERROR, errmsg("failed to get a file descriptor for " SNAPDIR ": %m"));
|
||||
|
||||
/* find all .snap files and get their lsns */
|
||||
dirdesc = AllocateDir(snap_path);
|
||||
while ((de = ReadDir(dirdesc, snap_path)) != NULL)
|
||||
while ((de = ReadDir(dirdesc, SNAPDIR)) != NULL)
|
||||
{
|
||||
XLogRecPtr lsn;
|
||||
uint32 hi;
|
||||
uint32 lo;
|
||||
struct stat st;
|
||||
XLogRecPtr lsn;
|
||||
SnapDesc *desc;
|
||||
|
||||
if (strcmp(de->d_name, ".") == 0 ||
|
||||
strcmp(de->d_name, "..") == 0)
|
||||
@@ -79,28 +109,69 @@ get_num_snap_files_lsn_threshold(void)
|
||||
|
||||
lsn = ((uint64) hi) << 32 | lo;
|
||||
elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
|
||||
if (lsns_allocated == lsns_num)
|
||||
|
||||
if (fstatat(dirdesc_fd, de->d_name, &st, 0) == -1)
|
||||
ereport(ERROR, errmsg("failed to get the size of " SNAPDIR "/%s: %m", de->d_name));
|
||||
|
||||
if (descriptors_allocated == snapshot_index)
|
||||
{
|
||||
lsns_allocated *= 2;
|
||||
lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
|
||||
descriptors_allocated *= 2;
|
||||
snapshot_descriptors = repalloc(snapshot_descriptors, sizeof(*snapshot_descriptors) * descriptors_allocated);
|
||||
}
|
||||
lsns[lsns_num++] = lsn;
|
||||
|
||||
desc = &snapshot_descriptors[snapshot_index++];
|
||||
desc->lsn = lsn;
|
||||
desc->sz = st.st_size;
|
||||
}
|
||||
/* sort by lsn desc */
|
||||
qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
|
||||
/* and take cutoff at logical_replication_max_snap_files */
|
||||
if (logical_replication_max_snap_files > lsns_num)
|
||||
cutoff = 0;
|
||||
/* have less files than cutoff */
|
||||
else
|
||||
|
||||
qsort(snapshot_descriptors, snapshot_index, sizeof(*snapshot_descriptors), SnapDescComparator);
|
||||
|
||||
/* Are there more snapshot files than specified? */
|
||||
if (logical_replication_max_snap_files <= snapshot_index)
|
||||
{
|
||||
cutoff = lsns[logical_replication_max_snap_files - 1];
|
||||
elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
|
||||
LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
|
||||
cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn;
|
||||
elog(LOG,
|
||||
"ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d",
|
||||
LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files);
|
||||
}
|
||||
pfree(lsns);
|
||||
|
||||
/* Is the size of the logical snapshots directory larger than specified?
|
||||
*
|
||||
* It's possible we could hit both thresholds, so remove any extra files
|
||||
* first, and then truncate based on size of the remaining files.
|
||||
*/
|
||||
if (logicalsnapdir_size > logical_replication_max_logicalsnapdir_size_bytes)
|
||||
{
|
||||
/* Unfortunately, iterating the directory does not guarantee any order
|
||||
* so we can't cache an index in the preceding loop.
|
||||
*/
|
||||
|
||||
off_t sz;
|
||||
const XLogRecPtr original = cutoff;
|
||||
|
||||
sz = snapshot_descriptors[0].sz;
|
||||
for (size_t i = 1; i < logical_replication_max_snap_files; ++i)
|
||||
{
|
||||
if (sz > logical_replication_max_logicalsnapdir_size_bytes)
|
||||
{
|
||||
cutoff = snapshot_descriptors[i - 1].lsn;
|
||||
break;
|
||||
}
|
||||
|
||||
sz += snapshot_descriptors[i].sz;
|
||||
}
|
||||
|
||||
if (cutoff != original)
|
||||
elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB",
|
||||
LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size);
|
||||
}
|
||||
|
||||
pfree(snapshot_descriptors);
|
||||
FreeDir(dirdesc);
|
||||
|
||||
return cutoff;
|
||||
|
||||
#undef SNAPDIR
|
||||
}
|
||||
|
||||
void
|
||||
@@ -118,6 +189,16 @@ InitLogicalReplicationMonitor(void)
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomIntVariable(
|
||||
"neon.logical_replication_max_logicalsnapdir_size",
|
||||
"Maximum allowed size of the pg_logical/snapshots directory (KB). When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
|
||||
NULL,
|
||||
&logical_replication_max_logicalsnapdir_size,
|
||||
8000, -1, INT_MAX,
|
||||
PGC_SIGHUP,
|
||||
GUC_UNIT_KB,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
memset(&bgw, 0, sizeof(bgw));
|
||||
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
|
||||
bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
|
||||
@@ -162,7 +243,7 @@ LogicalSlotsMonitorMain(Datum main_arg)
|
||||
* If there are too many .snap files, just drop all logical slots to
|
||||
* prevent aux files bloat.
|
||||
*/
|
||||
cutoff_lsn = get_num_snap_files_lsn_threshold();
|
||||
cutoff_lsn = get_snapshots_cutoff_lsn();
|
||||
if (cutoff_lsn > 0)
|
||||
{
|
||||
for (int i = 0; i < max_replication_slots; i++)
|
||||
|
||||
@@ -262,7 +262,7 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) {
|
||||
|
||||
// Send requests.
|
||||
for req in reqgen {
|
||||
_ = reply_rx.try_recv(); // discard any replies, to avoid blocking
|
||||
while reply_rx.try_recv().is_ok() {} // discard replies, to avoid blocking
|
||||
let msg = ProposerAcceptorMessage::AppendRequest(req);
|
||||
msg_tx.send(msg).await.expect("send failed");
|
||||
}
|
||||
|
||||
@@ -217,7 +217,8 @@ pub static WAL_RECEIVER_QUEUE_DEPTH: Lazy<Histogram> = Lazy::new(|| {
|
||||
let mut buckets = pow2_buckets(1, MSG_QUEUE_SIZE);
|
||||
buckets.insert(0, 0.0);
|
||||
buckets.insert(buckets.len() - 1, (MSG_QUEUE_SIZE - 1) as f64);
|
||||
assert!(buckets.len() <= 12, "too many histogram buckets");
|
||||
// TODO: tweak this.
|
||||
assert!(buckets.len() <= 16, "too many histogram buckets");
|
||||
|
||||
register_histogram!(
|
||||
"safekeeper_wal_receiver_queue_depth",
|
||||
|
||||
@@ -7,14 +7,15 @@ use crate::metrics::{
|
||||
WAL_RECEIVERS, WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL,
|
||||
WAL_RECEIVER_QUEUE_SIZE_TOTAL,
|
||||
};
|
||||
use crate::safekeeper::AcceptorProposerMessage;
|
||||
use crate::safekeeper::ProposerAcceptorMessage;
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::safekeeper::{
|
||||
AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage,
|
||||
ServerInfo,
|
||||
};
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::wal_service::ConnectionId;
|
||||
use crate::GlobalTimelines;
|
||||
use anyhow::{anyhow, Context};
|
||||
use bytes::BytesMut;
|
||||
use bytes::{BufMut as _, Bytes, BytesMut};
|
||||
use parking_lot::MappedMutexGuard;
|
||||
use parking_lot::Mutex;
|
||||
use parking_lot::MutexGuard;
|
||||
@@ -206,7 +207,8 @@ impl Drop for WalReceiverGuard {
|
||||
}
|
||||
}
|
||||
|
||||
pub const MSG_QUEUE_SIZE: usize = 256;
|
||||
// TODO: reconsider this.
|
||||
pub const MSG_QUEUE_SIZE: usize = 4096;
|
||||
pub const REPLY_QUEUE_SIZE: usize = 16;
|
||||
|
||||
impl SafekeeperPostgresHandler {
|
||||
@@ -484,6 +486,9 @@ const FLUSH_INTERVAL: Duration = Duration::from_secs(1);
|
||||
/// every 5 seconds, for 12 samples per poll. This will give a count of up to 12x active timelines.
|
||||
const METRICS_INTERVAL: Duration = Duration::from_secs(5);
|
||||
|
||||
/// The AppendRequest buffer size.
|
||||
const APPEND_BUFFER_SIZE: usize = 1024 * 1024;
|
||||
|
||||
/// Encapsulates a task which takes messages from msg_rx, processes and pushes
|
||||
/// replies to reply_tx.
|
||||
///
|
||||
@@ -530,6 +535,9 @@ impl WalAcceptor {
|
||||
async fn run(&mut self) -> anyhow::Result<()> {
|
||||
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
|
||||
|
||||
// Buffer AppendRequests to submit them as a single large write.
|
||||
let mut append_buf = BufferedAppendRequest::new(APPEND_BUFFER_SIZE);
|
||||
|
||||
// Periodically flush the WAL and compute metrics.
|
||||
let mut flush_ticker = tokio::time::interval(FLUSH_INTERVAL);
|
||||
flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
|
||||
@@ -546,7 +554,7 @@ impl WalAcceptor {
|
||||
// Process inbound message.
|
||||
msg = self.msg_rx.recv() => {
|
||||
// If disconnected, break to flush WAL and return.
|
||||
let Some(mut msg) = msg else {
|
||||
let Some(msg) = msg else {
|
||||
break;
|
||||
};
|
||||
|
||||
@@ -563,11 +571,44 @@ impl WalAcceptor {
|
||||
// This batches multiple appends per fsync. If the channel is empty after
|
||||
// sending the reply, we'll schedule an immediate flush.
|
||||
if let ProposerAcceptorMessage::AppendRequest(append_request) = msg {
|
||||
msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
|
||||
dirty = true;
|
||||
}
|
||||
// Try to batch multiple messages into a single large write.
|
||||
if !append_buf.is_empty() || !self.msg_rx.is_empty() {
|
||||
if append_buf.add(&append_request) {
|
||||
continue; // message buffered, go get next message
|
||||
}
|
||||
|
||||
self.tli.process_msg(&msg).await?
|
||||
// Full buffer, write it and buffer this message for next iteration.
|
||||
dirty = true;
|
||||
let buf_req = append_buf.take().expect("empty buffer");
|
||||
let buf_msg = ProposerAcceptorMessage::NoFlushAppendRequest(buf_req);
|
||||
let reply = self.tli.process_msg(&buf_msg).await?;
|
||||
drop(buf_msg); // allow reusing buffer for add
|
||||
assert!(append_buf.add(&append_request), "empty buffer rejected msg");
|
||||
reply
|
||||
} else {
|
||||
dirty = true;
|
||||
let msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
|
||||
self.tli.process_msg(&msg).await?
|
||||
}
|
||||
} else {
|
||||
self.tli.process_msg(&msg).await?
|
||||
}
|
||||
}
|
||||
|
||||
// If there are no pending messages, write the append buffer.
|
||||
//
|
||||
// NB: we don't also flush the WAL here. Otherwise we can get into a regime where we
|
||||
// quickly drain msg_rx and fsync before the sender is able to repopulate msg_rx.
|
||||
// This happens consistently due to Tokio scheduling, leading to overeager fsyncing.
|
||||
// Instead, we perform the write without fsyncing and give the sender a chance to
|
||||
// get scheduled and populate msg_rx for the next iteration. If there are no further
|
||||
// messages, the next iteration will flush the WAL.
|
||||
_ = future::ready(()), if self.msg_rx.is_empty() && !append_buf.is_empty() => {
|
||||
dirty = true;
|
||||
let buf_req = append_buf.take().expect("empty buffer");
|
||||
self.tli
|
||||
.process_msg(&ProposerAcceptorMessage::NoFlushAppendRequest(buf_req))
|
||||
.await?
|
||||
}
|
||||
|
||||
// While receiving AppendRequests, flush the WAL periodically and respond with an
|
||||
@@ -579,11 +620,11 @@ impl WalAcceptor {
|
||||
.await?
|
||||
}
|
||||
|
||||
// If there are no pending messages, flush the WAL immediately.
|
||||
// If there are no pending messages, flush the WAL and append buffer immediately.
|
||||
//
|
||||
// TODO: this should be done via flush_ticker.reset_immediately(), but that's always
|
||||
// delayed by 1ms due to this bug: https://github.com/tokio-rs/tokio/issues/6866.
|
||||
_ = future::ready(()), if dirty && self.msg_rx.is_empty() => {
|
||||
_ = future::ready(()), if self.msg_rx.is_empty() && dirty => {
|
||||
dirty = false;
|
||||
flush_ticker.reset();
|
||||
self.tli
|
||||
@@ -627,3 +668,115 @@ impl Drop for WalAcceptor {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Buffers WAL data for multiple AppendRequests, to submit them as a single write.
|
||||
struct BufferedAppendRequest {
|
||||
/// The buffer capacity.
|
||||
capacity: usize,
|
||||
/// The buffered header and WAL data.
|
||||
buf: Option<(AppendRequestHeader, BytesMut)>,
|
||||
/// A previous buffer that can be reused when the returned message is dropped.
|
||||
reuse_buf: Option<Bytes>,
|
||||
/// If an AppendRequest is larger than the buffer capacity (when empty), just stash it here to
|
||||
/// avoid growing the buffer and copying it. This will be returned as-is.
|
||||
large: Option<AppendRequest>,
|
||||
}
|
||||
|
||||
impl BufferedAppendRequest {
|
||||
/// Creates a new append request buffer with the given capacity.
|
||||
fn new(capacity: usize) -> Self {
|
||||
Self {
|
||||
capacity,
|
||||
buf: None,
|
||||
reuse_buf: None,
|
||||
large: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Adds the given append request to the buffer, if possible. Returns `false` if the message
|
||||
/// can't be buffered, leaving self unmodified. An empty buffer will always accept a message.
|
||||
///
|
||||
/// If the buffer is not empty, the message must have the same term and proposer and contiguous
|
||||
/// `begin_lsn` and `end_lsn`. The buffer must have available capacity for the entire
|
||||
/// `wal_data`. If the message is greater than an empty buffer's capacity, it is accepted but
|
||||
/// simply stashed away in `large` without growing the buffer.
|
||||
pub fn add(&mut self, msg: &AppendRequest) -> bool {
|
||||
// If there is a stashed large message, reject further messages.
|
||||
if self.large.is_some() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// If there is no existing buffer, initialize one with the message.
|
||||
let Some((ref mut h, ref mut wal_data)) = self.buf else {
|
||||
// If the message is larger than the buffer capacity, just stash it instead of growing.
|
||||
if msg.wal_data.len() > self.capacity {
|
||||
assert!(self.large.is_none());
|
||||
self.large = Some(msg.clone()); // clone is cheap with Bytes
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reuse a previous buffer, if any, or allocate a new one.
|
||||
//
|
||||
// TODO: try_into_mut() is essentially runtime borrow checking. If AppendRequest used a
|
||||
// normal Vec<u8> we could do compile-time borrow checking instead and avoid panic.
|
||||
let mut wal_data = match self.reuse_buf.take() {
|
||||
Some(reuse_buf) => match reuse_buf.try_into_mut() {
|
||||
Ok(mut reuse_buf) => {
|
||||
assert_eq!(reuse_buf.capacity(), self.capacity);
|
||||
reuse_buf.clear();
|
||||
reuse_buf
|
||||
}
|
||||
Err(_) => panic!("couldn't reuse buffer, still in use"),
|
||||
},
|
||||
None => BytesMut::with_capacity(self.capacity),
|
||||
};
|
||||
// Copy the append request into the buffer.
|
||||
wal_data.put_slice(&msg.wal_data);
|
||||
self.buf = Some((msg.h, wal_data));
|
||||
return true;
|
||||
};
|
||||
|
||||
// The messages must have the same term and proposer.
|
||||
if h.term != msg.h.term || h.proposer_uuid != msg.h.proposer_uuid {
|
||||
return false;
|
||||
}
|
||||
// The messages must be contiguous.
|
||||
if h.end_lsn != msg.h.begin_lsn {
|
||||
return false;
|
||||
}
|
||||
// The message must fit in the buffer.
|
||||
if wal_data.len() + msg.wal_data.len() > self.capacity {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Add the message to the buffer, bumping the commit and truncate LSNs. We assume that later
|
||||
// messages have later commit/truncate LSNs.
|
||||
h.end_lsn = msg.h.end_lsn;
|
||||
h.commit_lsn = msg.h.commit_lsn;
|
||||
h.truncate_lsn = msg.h.truncate_lsn;
|
||||
wal_data.put_slice(&msg.wal_data);
|
||||
true
|
||||
}
|
||||
|
||||
/// Returns true if there is no buffered message.
|
||||
fn is_empty(&self) -> bool {
|
||||
self.buf.is_none() && self.large.is_none()
|
||||
}
|
||||
|
||||
/// Takes the buffered AppendRequest (if any), leaving a None in its place.
|
||||
///
|
||||
/// NB: The returned `wal_data` Bytes must be dropped before the next call to `add()`, in order
|
||||
/// to reuse the buffer. This is basically runtime borrow checking, because of Bytes.
|
||||
fn take(&mut self) -> Option<AppendRequest> {
|
||||
// If there is a stashed large message, return it.
|
||||
if let Some(large) = self.large.take() {
|
||||
assert!(self.buf.is_none(), "both buf and large are set");
|
||||
return Some(large);
|
||||
}
|
||||
|
||||
let (h, wal_data) = self.buf.take()?;
|
||||
let wal_data = wal_data.freeze();
|
||||
self.reuse_buf = Some(wal_data.clone()); // keep a reference to the buffer
|
||||
Some(AppendRequest { h, wal_data })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -296,12 +296,13 @@ pub struct ProposerElected {
|
||||
|
||||
/// Request with WAL message sent from proposer to safekeeper. Along the way it
|
||||
/// communicates commit_lsn.
|
||||
#[derive(Debug)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct AppendRequest {
|
||||
pub h: AppendRequestHeader,
|
||||
pub wal_data: Bytes,
|
||||
}
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
|
||||
#[derive(Debug, Clone, Copy, Deserialize)]
|
||||
pub struct AppendRequestHeader {
|
||||
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
|
||||
pub term: Term,
|
||||
@@ -1166,7 +1167,7 @@ mod tests {
|
||||
proposer_uuid: [0; 16],
|
||||
};
|
||||
let mut append_request = AppendRequest {
|
||||
h: ar_hdr.clone(),
|
||||
h: ar_hdr,
|
||||
wal_data: Bytes::from_static(b"b"),
|
||||
};
|
||||
|
||||
@@ -1240,7 +1241,7 @@ mod tests {
|
||||
proposer_uuid: [0; 16],
|
||||
};
|
||||
let append_request = AppendRequest {
|
||||
h: ar_hdr.clone(),
|
||||
h: ar_hdr,
|
||||
wal_data: Bytes::from_static(b"b"),
|
||||
};
|
||||
|
||||
@@ -1248,7 +1249,7 @@ mod tests {
|
||||
sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
|
||||
.await
|
||||
.unwrap();
|
||||
let mut ar_hrd2 = ar_hdr.clone();
|
||||
let mut ar_hrd2 = ar_hdr;
|
||||
ar_hrd2.begin_lsn = Lsn(4);
|
||||
ar_hrd2.end_lsn = Lsn(5);
|
||||
let append_request = AppendRequest {
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
use std::{cmp::max, ops::Deref};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use safekeeper_api::models::TimelineTermBumpResponse;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::{
|
||||
@@ -144,7 +145,7 @@ impl TimelinePersistentState {
|
||||
ServerInfo {
|
||||
pg_version: 170000, /* Postgres server version (major * 10000) */
|
||||
system_id: 0, /* Postgres system identifier */
|
||||
wal_seg_size: 16 * 1024 * 1024,
|
||||
wal_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
},
|
||||
vec![],
|
||||
Lsn::INVALID,
|
||||
|
||||
@@ -46,3 +46,8 @@ class EndpointHttpClient(requests.Session):
|
||||
)
|
||||
res.raise_for_status()
|
||||
return res.json()
|
||||
|
||||
def metrics(self) -> str:
|
||||
res = self.get(f"http://localhost:{self.port}/metrics")
|
||||
res.raise_for_status()
|
||||
return res.text
|
||||
|
||||
@@ -1065,6 +1065,9 @@ class NeonEnv:
|
||||
"http_auth_type": http_auth_type,
|
||||
# Default which can be overriden with `NeonEnvBuilder.pageserver_config_override`
|
||||
"availability_zone": "us-east-2a",
|
||||
# Disable pageserver disk syncs in tests: when running tests concurrently, this avoids
|
||||
# the pageserver taking a long time to start up due to syncfs flushing other tests' data
|
||||
"no_sync": True,
|
||||
}
|
||||
if self.pageserver_virtual_file_io_engine is not None:
|
||||
ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
|
||||
|
||||
@@ -149,12 +149,16 @@ def test_subscriber_lag(
|
||||
check_pgbench_still_running(pub_workload, "pub")
|
||||
check_pgbench_still_running(sub_workload, "sub")
|
||||
|
||||
with (
|
||||
psycopg2.connect(pub_connstr) as pub_conn,
|
||||
psycopg2.connect(sub_connstr) as sub_conn,
|
||||
):
|
||||
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
|
||||
lag = measure_logical_replication_lag(sub_cur, pub_cur)
|
||||
pub_conn = psycopg2.connect(pub_connstr)
|
||||
sub_conn = psycopg2.connect(sub_connstr)
|
||||
pub_conn.autocommit = True
|
||||
sub_conn.autocommit = True
|
||||
|
||||
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
|
||||
lag = measure_logical_replication_lag(sub_cur, pub_cur)
|
||||
|
||||
pub_conn.close()
|
||||
sub_conn.close()
|
||||
|
||||
log.info(f"Replica lagged behind master by {lag} seconds")
|
||||
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
|
||||
@@ -206,6 +210,7 @@ def test_publisher_restart(
|
||||
sub_conn = psycopg2.connect(sub_connstr)
|
||||
pub_conn.autocommit = True
|
||||
sub_conn.autocommit = True
|
||||
|
||||
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
|
||||
pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'")
|
||||
pub_exists = len(pub_cur.fetchall()) != 0
|
||||
@@ -222,6 +227,7 @@ def test_publisher_restart(
|
||||
sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")
|
||||
|
||||
initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
|
||||
|
||||
pub_conn.close()
|
||||
sub_conn.close()
|
||||
|
||||
@@ -248,12 +254,17 @@ def test_publisher_restart(
|
||||
["pgbench", "-c10", pgbench_duration, "-Mprepared"],
|
||||
env=pub_env,
|
||||
)
|
||||
with (
|
||||
psycopg2.connect(pub_connstr) as pub_conn,
|
||||
psycopg2.connect(sub_connstr) as sub_conn,
|
||||
):
|
||||
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
|
||||
lag = measure_logical_replication_lag(sub_cur, pub_cur)
|
||||
|
||||
pub_conn = psycopg2.connect(pub_connstr)
|
||||
sub_conn = psycopg2.connect(sub_connstr)
|
||||
pub_conn.autocommit = True
|
||||
sub_conn.autocommit = True
|
||||
|
||||
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
|
||||
lag = measure_logical_replication_lag(sub_cur, pub_cur)
|
||||
|
||||
pub_conn.close()
|
||||
sub_conn.close()
|
||||
|
||||
log.info(f"Replica lagged behind master by {lag} seconds")
|
||||
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
|
||||
@@ -288,58 +299,56 @@ def test_snap_files(
|
||||
env = benchmark_project_pub.pgbench_env
|
||||
connstr = benchmark_project_pub.connstr
|
||||
|
||||
with psycopg2.connect(connstr) as conn:
|
||||
conn.autocommit = True
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
|
||||
is_super = cast("bool", cur.fetchall()[0][0])
|
||||
assert is_super, "This benchmark won't work if we don't have superuser"
|
||||
conn = psycopg2.connect(connstr)
|
||||
conn.autocommit = True
|
||||
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
|
||||
is_super = cast("bool", cur.fetchall()[0][0])
|
||||
assert is_super, "This benchmark won't work if we don't have superuser"
|
||||
|
||||
conn.close()
|
||||
|
||||
pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=env)
|
||||
|
||||
conn = psycopg2.connect(connstr)
|
||||
conn.autocommit = True
|
||||
cur = conn.cursor()
|
||||
cur.execute("ALTER SYSTEM SET neon.logical_replication_max_snap_files = -1")
|
||||
|
||||
with psycopg2.connect(connstr) as conn:
|
||||
conn.autocommit = True
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SELECT pg_reload_conf()")
|
||||
|
||||
with psycopg2.connect(connstr) as conn:
|
||||
conn.autocommit = True
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (
|
||||
SELECT 1
|
||||
FROM pg_replication_slots
|
||||
WHERE slot_name = 'slotter'
|
||||
) THEN
|
||||
PERFORM pg_drop_replication_slot('slotter');
|
||||
END IF;
|
||||
END $$;
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
)
|
||||
cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (
|
||||
SELECT 1
|
||||
FROM pg_replication_slots
|
||||
WHERE slot_name = 'slotter'
|
||||
) THEN
|
||||
PERFORM pg_drop_replication_slot('slotter');
|
||||
END IF;
|
||||
END $$;
|
||||
"""
|
||||
)
|
||||
cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")
|
||||
|
||||
conn.close()
|
||||
|
||||
workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env)
|
||||
try:
|
||||
start = time.time()
|
||||
prev_measurement = time.time()
|
||||
while time.time() - start < test_duration_min * 60:
|
||||
with psycopg2.connect(connstr) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
|
||||
)
|
||||
check_pgbench_still_running(workload)
|
||||
cur.execute(
|
||||
"SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())"
|
||||
)
|
||||
conn = psycopg2.connect(connstr)
|
||||
conn.autocommit = True
|
||||
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
|
||||
)
|
||||
check_pgbench_still_running(workload)
|
||||
cur.execute("SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())")
|
||||
|
||||
conn.close()
|
||||
|
||||
# Measure storage
|
||||
if time.time() - prev_measurement > test_interval_min * 60:
|
||||
|
||||
@@ -102,15 +102,21 @@ def test_ro_replica_lag(
|
||||
check_pgbench_still_running(master_workload)
|
||||
check_pgbench_still_running(replica_workload)
|
||||
time.sleep(sync_interval_min * 60)
|
||||
|
||||
conn_master = psycopg2.connect(master_connstr)
|
||||
conn_replica = psycopg2.connect(replica_connstr)
|
||||
conn_master.autocommit = True
|
||||
conn_replica.autocommit = True
|
||||
|
||||
with (
|
||||
psycopg2.connect(master_connstr) as conn_master,
|
||||
psycopg2.connect(replica_connstr) as conn_replica,
|
||||
conn_master.cursor() as cur_master,
|
||||
conn_replica.cursor() as cur_replica,
|
||||
):
|
||||
with (
|
||||
conn_master.cursor() as cur_master,
|
||||
conn_replica.cursor() as cur_replica,
|
||||
):
|
||||
lag = measure_replication_lag(cur_master, cur_replica)
|
||||
lag = measure_replication_lag(cur_master, cur_replica)
|
||||
|
||||
conn_master.close()
|
||||
conn_replica.close()
|
||||
|
||||
log.info(f"Replica lagged behind master by {lag} seconds")
|
||||
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
|
||||
finally:
|
||||
@@ -219,11 +225,15 @@ def test_replication_start_stop(
|
||||
pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10"], env=master_env)
|
||||
|
||||
# Sync replicas
|
||||
with psycopg2.connect(master_connstr) as conn_master:
|
||||
with conn_master.cursor() as cur_master:
|
||||
for i in range(num_replicas):
|
||||
conn_replica = psycopg2.connect(replica_connstr[i])
|
||||
measure_replication_lag(cur_master, conn_replica.cursor())
|
||||
conn_master = psycopg2.connect(master_connstr)
|
||||
conn_master.autocommit = True
|
||||
|
||||
with conn_master.cursor() as cur_master:
|
||||
for i in range(num_replicas):
|
||||
conn_replica = psycopg2.connect(replica_connstr[i])
|
||||
measure_replication_lag(cur_master, conn_replica.cursor())
|
||||
|
||||
conn_master.close()
|
||||
|
||||
master_pgbench = pg_bin.run_nonblocking(
|
||||
[
|
||||
@@ -277,17 +287,22 @@ def test_replication_start_stop(
|
||||
|
||||
time.sleep(configuration_test_time_sec)
|
||||
|
||||
with psycopg2.connect(master_connstr) as conn_master:
|
||||
with conn_master.cursor() as cur_master:
|
||||
for ireplica in range(num_replicas):
|
||||
replica_conn = psycopg2.connect(replica_connstr[ireplica])
|
||||
lag = measure_replication_lag(cur_master, replica_conn.cursor())
|
||||
zenbenchmark.record(
|
||||
f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
|
||||
)
|
||||
log.info(
|
||||
f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}"
|
||||
)
|
||||
conn_master = psycopg2.connect(master_connstr)
|
||||
conn_master.autocommit = True
|
||||
|
||||
with conn_master.cursor() as cur_master:
|
||||
for ireplica in range(num_replicas):
|
||||
replica_conn = psycopg2.connect(replica_connstr[ireplica])
|
||||
lag = measure_replication_lag(cur_master, replica_conn.cursor())
|
||||
zenbenchmark.record(
|
||||
f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
|
||||
)
|
||||
log.info(
|
||||
f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}"
|
||||
)
|
||||
|
||||
conn_master.close()
|
||||
|
||||
master_pgbench.terminate()
|
||||
except Exception as e:
|
||||
error_occurred = True
|
||||
|
||||
@@ -1,6 +1,14 @@
|
||||
from logging import info
|
||||
from __future__ import annotations
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
import time
|
||||
from logging import info
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import parse_metrics
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
|
||||
|
||||
def test_installed_extensions(neon_simple_env: NeonEnv):
|
||||
@@ -85,3 +93,52 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
|
||||
assert ext["n_databases"] == 2
|
||||
ext["versions"].sort()
|
||||
assert ext["versions"] == ["1.2", "1.3"]
|
||||
|
||||
# check that /metrics endpoint is available
|
||||
# ensure that we see the metric before and after restart
|
||||
res = client.metrics()
|
||||
info("Metrics: %s", res)
|
||||
m = parse_metrics(res)
|
||||
neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"})
|
||||
assert len(neon_m) == 1
|
||||
for sample in neon_m:
|
||||
assert sample.value == 2
|
||||
neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"})
|
||||
assert len(neon_m) == 1
|
||||
for sample in neon_m:
|
||||
assert sample.value == 1
|
||||
|
||||
endpoint.stop()
|
||||
endpoint.start()
|
||||
|
||||
timeout = 10
|
||||
while timeout > 0:
|
||||
try:
|
||||
res = client.metrics()
|
||||
timeout = -1
|
||||
if len(parse_metrics(res).query_all("installed_extensions")) < 4:
|
||||
# Assume that not all metrics that are collected yet
|
||||
time.sleep(1)
|
||||
timeout -= 1
|
||||
continue
|
||||
except Exception:
|
||||
log.exception("failed to get metrics, assume they are not collected yet")
|
||||
time.sleep(1)
|
||||
timeout -= 1
|
||||
continue
|
||||
|
||||
assert (
|
||||
len(parse_metrics(res).query_all("installed_extensions")) >= 4
|
||||
), "Not all metrics are collected"
|
||||
|
||||
info("After restart metrics: %s", res)
|
||||
m = parse_metrics(res)
|
||||
neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"})
|
||||
assert len(neon_m) == 1
|
||||
for sample in neon_m:
|
||||
assert sample.value == 1
|
||||
|
||||
neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"})
|
||||
assert len(neon_m) == 1
|
||||
for sample in neon_m:
|
||||
assert sample.value == 1
|
||||
|
||||
@@ -5,7 +5,8 @@ import time
|
||||
from fixtures.neon_fixtures import NeonEnv, logical_replication_sync
|
||||
|
||||
|
||||
def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
"""Test read replica of a primary which has a logical replication publication"""
|
||||
env = neon_simple_env
|
||||
|
||||
n_records = 100000
|
||||
@@ -13,7 +14,6 @@ def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
primary = env.endpoints.create_start(
|
||||
branch_name="main",
|
||||
endpoint_id="primary",
|
||||
config_lines=["min_wal_size=32MB", "max_wal_size=64MB"],
|
||||
)
|
||||
p_con = primary.connect()
|
||||
p_cur = p_con.cursor()
|
||||
@@ -30,7 +30,6 @@ def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
secondary = env.endpoints.new_replica_start(
|
||||
origin=primary,
|
||||
endpoint_id="secondary",
|
||||
config_lines=["min_wal_size=32MB", "max_wal_size=64MB"],
|
||||
)
|
||||
|
||||
s_con = secondary.connect()
|
||||
@@ -48,3 +47,51 @@ def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
# Check that LR slot is not copied to replica
|
||||
s_cur.execute("select count(*) from pg_replication_slots")
|
||||
assert s_cur.fetchall()[0][0] == 0
|
||||
|
||||
|
||||
def test_aux_not_logged_at_replica(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
"""Test that AUX files are not saved at replica"""
|
||||
env = neon_simple_env
|
||||
|
||||
n_records = 20000
|
||||
|
||||
primary = env.endpoints.create_start(
|
||||
branch_name="main",
|
||||
endpoint_id="primary",
|
||||
)
|
||||
p_con = primary.connect()
|
||||
p_cur = p_con.cursor()
|
||||
p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))")
|
||||
p_cur.execute("create publication pub1 for table t")
|
||||
|
||||
# start subscriber
|
||||
vanilla_pg.start()
|
||||
vanilla_pg.safe_psql("CREATE TABLE t(pk bigint primary key, payload text)")
|
||||
connstr = primary.connstr().replace("'", "''")
|
||||
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
|
||||
|
||||
for pk in range(n_records):
|
||||
p_cur.execute("insert into t (pk) values (%s)", (pk,))
|
||||
|
||||
# LR snapshot is stored each 15 seconds
|
||||
time.sleep(16)
|
||||
|
||||
# start replica
|
||||
secondary = env.endpoints.new_replica_start(
|
||||
origin=primary,
|
||||
endpoint_id="secondary",
|
||||
)
|
||||
|
||||
s_con = secondary.connect()
|
||||
s_cur = s_con.cursor()
|
||||
|
||||
logical_replication_sync(vanilla_pg, primary)
|
||||
|
||||
assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records
|
||||
s_cur.execute("select count(*) from t")
|
||||
assert s_cur.fetchall()[0][0] == n_records
|
||||
|
||||
vanilla_pg.stop()
|
||||
secondary.stop()
|
||||
primary.stop()
|
||||
assert not secondary.log_contains("cannot make new WAL entries during recovery")
|
||||
|
||||
@@ -427,7 +427,7 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
|
||||
env.pageserver.start()
|
||||
|
||||
for f in futs:
|
||||
f.result(timeout=10)
|
||||
f.result(timeout=30)
|
||||
|
||||
# The tenant should end up active
|
||||
wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1)
|
||||
|
||||
2
vendor/postgres-v14
vendored
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: 2199b83fb7...de0a000daf
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: 22e580fe9f...fd631a9590
2
vendor/postgres-v16
vendored
2
vendor/postgres-v16
vendored
Submodule vendor/postgres-v16 updated: e131a9c027...03b43900ed
2
vendor/postgres-v17
vendored
2
vendor/postgres-v17
vendored
Submodule vendor/postgres-v17 updated: 9ad2f3c5c3...ae4cc30dba
8
vendor/revisions.json
vendored
8
vendor/revisions.json
vendored
@@ -1,18 +1,18 @@
|
||||
{
|
||||
"v17": [
|
||||
"17.0",
|
||||
"9ad2f3c5c37c08069a01c1e3f6b7cf275437e0cb"
|
||||
"ae4cc30dba24f3910533e5a48e8103c3f2fff300"
|
||||
],
|
||||
"v16": [
|
||||
"16.4",
|
||||
"e131a9c027b202ce92bd7b9cf2569d48a6f9948e"
|
||||
"03b43900edc5d8d6eecec460bfc89aec7174bd84"
|
||||
],
|
||||
"v15": [
|
||||
"15.8",
|
||||
"22e580fe9ffcea7e02592110b1c9bf426d83cada"
|
||||
"fd631a959049dfe2b82f67409c8b8b0d3e0016d1"
|
||||
],
|
||||
"v14": [
|
||||
"14.13",
|
||||
"2199b83fb72680001ce0f43bf6187a21dfb8f45d"
|
||||
"de0a000dafc2e66ce2e39282d3aa1c704fe0390e"
|
||||
]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user