Compare commits

..

1 Commits

Author SHA1 Message Date
Erik Grinaker
6ad0739080 postgres_ffi: bump WAL_SEGMENT_SIZE to 128 MB 2024-11-12 18:11:03 +01:00
35 changed files with 214 additions and 1177 deletions

3
Cargo.lock generated
View File

@@ -1229,15 +1229,12 @@ dependencies = [
"flate2",
"futures",
"hyper 0.14.30",
"metrics",
"nix 0.27.1",
"notify",
"num_cpus",
"once_cell",
"opentelemetry",
"opentelemetry_sdk",
"postgres",
"prometheus",
"regex",
"remote_storage",
"reqwest 0.12.4",

View File

@@ -55,7 +55,6 @@ RUN set -e \
--bin proxy \
--bin neon_local \
--bin storage_scrubber \
--bin stream_events \
--locked --release
# Build final image
@@ -83,7 +82,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_control
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/stream_events /usr/local/bin
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/

View File

@@ -1 +1 @@
SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;
SELECT neon.backpressure_throttling_time()::float8 / 1000 AS throttled;

View File

@@ -1,45 +1,3 @@
commit 00aa659afc9c7336ab81036edec3017168aabf40
Author: Heikki Linnakangas <heikki@neon.tech>
Date: Tue Nov 12 16:59:19 2024 +0200
Temporarily disable test that depends on timezone
diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out
index 23ef5fa..9e60deb 100644
--- a/ext-src/pg_anon-src/tests/expected/generalization.out
+++ b/ext-src/pg_anon-src/tests/expected/generalization.out
@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century');
["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST")
(1 row)
-SELECT anon.generalize_tstzrange('19041107','millennium');
- generalize_tstzrange
------------------------------------------------------------------
- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST")
-(1 row)
-
+-- temporarily disabled, see:
+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
+--SELECT anon.generalize_tstzrange('19041107','millennium');
-- generalize_daterange
SELECT anon.generalize_daterange('19041107');
generalize_daterange
diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql
index b868344..b4fc977 100644
--- a/ext-src/pg_anon-src/tests/sql/generalization.sql
+++ b/ext-src/pg_anon-src/tests/sql/generalization.sql
@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month');
SELECT anon.generalize_tstzrange('19041107','year');
SELECT anon.generalize_tstzrange('19041107','decade');
SELECT anon.generalize_tstzrange('19041107','century');
-SELECT anon.generalize_tstzrange('19041107','millennium');
+-- temporarily disabled, see:
+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
+--SELECT anon.generalize_tstzrange('19041107','millennium');
-- generalize_daterange
SELECT anon.generalize_daterange('19041107');
commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
Author: Alexey Masterov <alexeymasterov@neon.tech>
Date: Fri May 31 06:34:26 2024 +0000

View File

@@ -18,11 +18,9 @@ clap.workspace = true
flate2.workspace = true
futures.workspace = true
hyper0 = { workspace = true, features = ["full"] }
metrics.workspace = true
nix.workspace = true
notify.workspace = true
num_cpus.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
opentelemetry_sdk.workspace = true
postgres.workspace = true
@@ -41,7 +39,6 @@ tracing-subscriber.workspace = true
tracing-utils.workspace = true
thiserror.workspace = true
url.workspace = true
prometheus.workspace = true
compute_api.workspace = true
utils.workspace = true

View File

@@ -9,7 +9,6 @@ use crate::catalog::SchemaDumpError;
use crate::catalog::{get_database_schema, get_dbs_and_roles};
use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use crate::installed_extensions;
use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
use compute_api::responses::{
ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
@@ -20,8 +19,6 @@ use anyhow::Result;
use hyper::header::CONTENT_TYPE;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use metrics::Encoder;
use metrics::TextEncoder;
use tokio::task;
use tracing::{debug, error, info, warn};
use tracing_utils::http::OtelName;
@@ -68,28 +65,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
}
// Prometheus metrics
(&Method::GET, "/metrics") => {
debug!("serving /metrics GET request");
let mut buffer = vec![];
let metrics = installed_extensions::collect();
let encoder = TextEncoder::new();
encoder.encode(&metrics, &mut buffer).unwrap();
match Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, encoder.format_type())
.body(Body::from(buffer))
{
Ok(response) => response,
Err(err) => {
let msg = format!("error handling /metrics request: {err}");
error!(msg);
render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
// Collect Postgres current usage insights
(&Method::GET, "/insights") => {
info!("serving /insights GET request");

View File

@@ -37,21 +37,6 @@ paths:
schema:
$ref: "#/components/schemas/ComputeMetrics"
/metrics
get:
tags:
- Info
summary: Get compute node metrics in text format.
description: ""
operationId: getComputeMetrics
responses:
200:
description: ComputeMetrics
content:
text/plain:
schema:
type: string
description: Metrics in text format.
/insights:
get:
tags:

View File

@@ -1,5 +1,4 @@
use compute_api::responses::{InstalledExtension, InstalledExtensions};
use metrics::proto::MetricFamily;
use std::collections::HashMap;
use std::collections::HashSet;
use tracing::info;
@@ -9,10 +8,6 @@ use anyhow::Result;
use postgres::{Client, NoTls};
use tokio::task;
use metrics::core::Collector;
use metrics::{register_uint_gauge_vec, UIntGaugeVec};
use once_cell::sync::Lazy;
/// We don't reuse get_existing_dbs() just for code clarity
/// and to make database listing query here more explicit.
///
@@ -64,12 +59,6 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension
for (extname, v) in extensions.iter() {
let version = v.to_string();
// increment the number of databases where the version of extension is installed
INSTALLED_EXTENSIONS
.with_label_values(&[extname, &version])
.inc();
extensions_map
.entry(extname.to_string())
.and_modify(|e| {
@@ -85,11 +74,9 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension
}
}
let res = InstalledExtensions {
Ok(InstalledExtensions {
extensions: extensions_map.values().cloned().collect(),
};
Ok(res)
})
})
.await?
}
@@ -110,18 +97,6 @@ pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
"[NEON_EXT_STAT] {}",
serde_json::to_string(&result).expect("failed to serialize extensions list")
);
Ok(())
}
static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"installed_extensions",
"Number of databases where the version of extension is installed",
&["extension_name", "version"]
)
.expect("failed to define a metric")
});
pub fn collect() -> Vec<MetricFamily> {
INSTALLED_EXTENSIONS.collect()
}

View File

@@ -80,18 +80,18 @@ impl NeonWalRecord {
}
#[cfg(feature = "testing")]
pub fn wal_clear(s: impl AsRef<str>) -> Self {
pub fn wal_clear() -> Self {
Self::Test {
append: s.as_ref().to_string(),
append: "".to_string(),
clear: true,
will_init: false,
}
}
#[cfg(feature = "testing")]
pub fn wal_init(s: impl AsRef<str>) -> Self {
pub fn wal_init() -> Self {
Self::Test {
append: s.as_ref().to_string(),
append: "".to_string(),
clear: true,
will_init: true,
}

View File

@@ -241,7 +241,7 @@ pub use v14::bindings::{CheckPoint, ControlFileData};
pub const BLCKSZ: u16 = 8192;
pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32);
pub const XLOG_BLCKSZ: usize = 8192;
pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024;
pub const WAL_SEGMENT_SIZE: usize = 128 * 1024 * 1024;
pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;

View File

@@ -151,7 +151,7 @@ fn check_end_of_wal(
.unwrap();
}
const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
const_assert!(WAL_SEGMENT_SIZE == 128 * 1024 * 1024);
#[test]
pub fn test_find_end_of_wal_simple() {

View File

@@ -15,9 +15,6 @@ pub enum DownloadError {
///
/// Concurrency control is not timed within timeout.
Timeout,
/// Some integrity/consistency check failed during download. This is used during
/// timeline loads to cancel the load of a tenant if some timeline detects fatal corruption.
Fatal(String),
/// The file was found in the remote storage, but the download failed.
Other(anyhow::Error),
}
@@ -32,7 +29,6 @@ impl std::fmt::Display for DownloadError {
DownloadError::Unmodified => write!(f, "File was not modified"),
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
DownloadError::Timeout => write!(f, "timeout"),
DownloadError::Fatal(why) => write!(f, "Fatal read error: {why}"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
}
}
@@ -45,7 +41,7 @@ impl DownloadError {
pub fn is_permanent(&self) -> bool {
use DownloadError::*;
match self {
BadInput(_) | NotFound | Unmodified | Fatal(_) | Cancelled => true,
BadInput(_) | NotFound | Unmodified | Cancelled => true,
Timeout | Other(_) => false,
}
}

View File

@@ -123,27 +123,15 @@ pub async fn fsync_async_opt(
Ok(())
}
/// Like postgres' durable_rename, renames a file and issues fsyncs to make it durable. After
/// returning, both the file and rename are guaranteed to be persisted. Both paths must be on the
/// same file system.
/// Like postgres' durable_rename, renames file issuing fsyncs do make it
/// durable. After return, file and rename are guaranteed to be persisted.
///
/// Unlike postgres, it only fsyncs 1) the file to make contents durable, and 2) the directory to
/// make the rename durable. This sequence ensures the target file will never be incomplete.
///
/// Postgres also:
///
/// * Fsyncs the target file, if it exists, before the rename, to ensure either the new or existing
/// file survives a crash. Current callers don't need this as it should already be fsynced if
/// durability is needed.
///
/// * Fsyncs the file after the rename. This can be required with certain OSes or file systems (e.g.
/// NFS), but not on Linux with most common file systems like ext4 (which we currently use).
///
/// An audit of 8 other databases found that none fsynced the file after a rename:
/// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837180535>
///
/// eBPF probes confirmed that this is sufficient with ext4, XFS, and ZFS, but possibly not Btrfs:
/// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837926218>
/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make
/// contents durable; 2) its directory entry to make rename durable 3) again to
/// already renamed file, which is not required by standards but postgres does
/// it, let's stick to that. Postgres additionally fsyncs newpath *before*
/// rename if it exists to ensure that at least one of the files survives, but
/// current callers don't need that.
///
/// virtual_file.rs has similar code, but it doesn't use vfs.
///
@@ -161,6 +149,9 @@ pub async fn durable_rename(
// Time to do the real deal.
tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;
// Postgres'ish fsync of renamed file.
fsync_async_opt(new_path.as_ref(), do_fsync).await?;
// Now fsync the parent
let parent = match new_path.as_ref().parent() {
Some(p) => p,

View File

@@ -1433,12 +1433,6 @@ impl Tenant {
info!(%timeline_id, "index_part not found on remote");
continue;
}
Err(DownloadError::Fatal(why)) => {
// If, while loading one remote timeline, we saw an indication that our generation
// number is likely invalid, then we should not load the whole tenant.
error!(%timeline_id, "Fatal error loading timeline: {why}");
anyhow::bail!(why.to_string());
}
Err(e) => {
// Some (possibly ephemeral) error happened during index_part download.
// Pretend the timeline exists to not delete the timeline directory,
@@ -7763,13 +7757,13 @@ mod tests {
(
get_key(3),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_clear("c")),
Value::WalRecord(NeonWalRecord::wal_clear()),
),
(get_key(4), Lsn(0x10), Value::Image("0x10".into())),
(
get_key(4),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_init("i")),
Value::WalRecord(NeonWalRecord::wal_init()),
),
];
let image1 = vec![(get_key(1), "0x10".into())];
@@ -7918,30 +7912,8 @@ mod tests {
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_bottom_most_compaction_deltas_1() -> anyhow::Result<()> {
test_simple_bottom_most_compaction_deltas_helper(
"test_simple_bottom_most_compaction_deltas_1",
false,
)
.await
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_bottom_most_compaction_deltas_2() -> anyhow::Result<()> {
test_simple_bottom_most_compaction_deltas_helper(
"test_simple_bottom_most_compaction_deltas_2",
true,
)
.await
}
#[cfg(feature = "testing")]
async fn test_simple_bottom_most_compaction_deltas_helper(
test_name: &'static str,
use_delta_bottom_layer: bool,
) -> anyhow::Result<()> {
let harness = TenantHarness::create(test_name).await?;
async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
@@ -7972,16 +7944,6 @@ mod tests {
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
// or, delta layer at 0x10 if `use_delta_bottom_layer` is true
let delta4 = (0..10)
.map(|id| {
(
get_key(id),
Lsn(0x08),
Value::WalRecord(NeonWalRecord::wal_init(format!("value {id}@0x10"))),
)
})
.collect_vec();
let delta1 = vec![
(
@@ -8035,61 +7997,21 @@ mod tests {
),
];
let tline = if use_delta_bottom_layer {
tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x08),
DEFAULT_PG_VERSION,
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x08)..Lsn(0x10),
delta4,
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x48),
delta1,
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x48),
delta2,
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x48)..Lsn(0x50),
delta3,
),
], // delta layers
vec![], // image layers
Lsn(0x50),
)
.await?
} else {
tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x48),
delta1,
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x48),
delta2,
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x48)..Lsn(0x50),
delta3,
),
], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?
};
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?;
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
@@ -8199,7 +8121,7 @@ mod tests {
(
key,
Lsn(0x10),
Value::WalRecord(NeonWalRecord::wal_init("0x10")),
Value::Image(Bytes::copy_from_slice(b"0x10")),
),
(
key,
@@ -8261,7 +8183,7 @@ mod tests {
Lsn(0x20),
KeyLogAtLsn(vec![(
Lsn(0x20),
Value::Image(Bytes::from_static(b"0x10;0x20")),
Value::Image(Bytes::copy_from_slice(b"0x10;0x20")),
)]),
),
(
@@ -9243,7 +9165,7 @@ mod tests {
let will_init = will_init_keys.contains(&i);
if will_init {
delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));
delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
expected_key_values.insert(key, "".to_string());
} else {

View File

@@ -574,18 +574,12 @@ impl RemoteTimelineClient {
if latest_index_generation > index_generation {
// Unexpected! Why are we loading such an old index if a more recent one exists?
// We will refuse to proceed, as there is no reasonable scenario where this should happen, but
// there _is_ a clear bug/corruption scenario where it would happen (controller sets the generation
// backwards).
tracing::error!(
tracing::warn!(
?index_generation,
?latest_index_generation,
?latest_index_mtime,
"Found a newer index while loading an old one"
);
return Err(DownloadError::Fatal(
"Index age exceeds threshold and a newer index exists".into(),
));
}
}

View File

@@ -562,7 +562,7 @@ mod tests {
(
get_key(0),
Lsn(0x10),
Value::WalRecord(NeonWalRecord::wal_init("")),
Value::WalRecord(NeonWalRecord::wal_init()),
),
(
get_key(0),
@@ -572,7 +572,7 @@ mod tests {
(
get_key(5),
Lsn(0x10),
Value::WalRecord(NeonWalRecord::wal_init("")),
Value::WalRecord(NeonWalRecord::wal_init()),
),
(
get_key(5),

View File

@@ -253,10 +253,6 @@ pub(crate) fn apply_in_neon(
use bytes::BufMut;
if *will_init {
assert!(*clear, "init record must be clear to ensure correctness");
assert!(
page.is_empty(),
"init record must be the first entry to ensure correctness"
);
}
if *clear {
page.clear();

View File

@@ -1,8 +1,7 @@
#include <dirent.h>
#include <limits.h>
#include <string.h>
#include <dirent.h>
#include <signal.h>
#include <sys/stat.h>
#include "postgres.h"
@@ -22,35 +21,17 @@
static int logical_replication_max_snap_files = 300;
/*
* According to Chi (shyzh), the pageserver _should_ be good with 10 MB worth of
* snapshot files. Let's use 8 MB since 8 is a power of 2.
*/
static int logical_replication_max_logicalsnapdir_size = 8000;
/*
* A primitive description of a logical snapshot file including the LSN of the
* file and its size.
*/
typedef struct SnapDesc {
XLogRecPtr lsn;
off_t sz;
} SnapDesc;
PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
/*
* Sorts an array of snapshot descriptors by their LSN.
*/
static int
SnapDescComparator(const void *a, const void *b)
LsnDescComparator(const void *a, const void *b)
{
const SnapDesc *desc1 = a;
const SnapDesc *desc2 = b;
XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
XLogRecPtr lsn2 = *((const XLogRecPtr *) b);
if (desc1->lsn < desc2->lsn)
if (lsn1 < lsn2)
return 1;
else if (desc1->lsn == desc2->lsn)
else if (lsn1 == lsn2)
return 0;
else
return -1;
@@ -62,39 +43,28 @@ SnapDescComparator(const void *a, const void *b)
* slots having lower restart_lsn should be dropped.
*/
static XLogRecPtr
get_snapshots_cutoff_lsn(void)
get_num_snap_files_lsn_threshold(void)
{
/* PG 18 has a constant defined for this, PG_LOGICAL_SNAPSHOTS_DIR */
#define SNAPDIR "pg_logical/snapshots"
DIR *dirdesc;
int dirdesc_fd;
struct dirent *de;
size_t snapshot_index = 0;
SnapDesc *snapshot_descriptors;
size_t descriptors_allocated = 1024;
XLogRecPtr cutoff = 0;
off_t logicalsnapdir_size = 0;
const int logical_replication_max_logicalsnapdir_size_bytes = logical_replication_max_logicalsnapdir_size * 1000;
char *snap_path = "pg_logical/snapshots/";
int lsns_allocated = 1024;
int lsns_num = 0;
XLogRecPtr *lsns;
XLogRecPtr cutoff;
if (logical_replication_max_snap_files < 0 && logical_replication_max_logicalsnapdir_size < 0)
if (logical_replication_max_snap_files < 0)
return 0;
snapshot_descriptors = palloc(sizeof(*snapshot_descriptors) * descriptors_allocated);
dirdesc = AllocateDir(SNAPDIR);
dirdesc_fd = dirfd(dirdesc);
if (dirdesc_fd == -1)
ereport(ERROR, errmsg("failed to get a file descriptor for " SNAPDIR ": %m"));
lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
/* find all .snap files and get their lsns */
while ((de = ReadDir(dirdesc, SNAPDIR)) != NULL)
dirdesc = AllocateDir(snap_path);
while ((de = ReadDir(dirdesc, snap_path)) != NULL)
{
XLogRecPtr lsn;
uint32 hi;
uint32 lo;
struct stat st;
XLogRecPtr lsn;
SnapDesc *desc;
if (strcmp(de->d_name, ".") == 0 ||
strcmp(de->d_name, "..") == 0)
@@ -109,69 +79,28 @@ get_snapshots_cutoff_lsn(void)
lsn = ((uint64) hi) << 32 | lo;
elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
if (fstatat(dirdesc_fd, de->d_name, &st, 0) == -1)
ereport(ERROR, errmsg("failed to get the size of " SNAPDIR "/%s: %m", de->d_name));
if (descriptors_allocated == snapshot_index)
if (lsns_allocated == lsns_num)
{
descriptors_allocated *= 2;
snapshot_descriptors = repalloc(snapshot_descriptors, sizeof(*snapshot_descriptors) * descriptors_allocated);
lsns_allocated *= 2;
lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
}
desc = &snapshot_descriptors[snapshot_index++];
desc->lsn = lsn;
desc->sz = st.st_size;
lsns[lsns_num++] = lsn;
}
qsort(snapshot_descriptors, snapshot_index, sizeof(*snapshot_descriptors), SnapDescComparator);
/* Are there more snapshot files than specified? */
if (logical_replication_max_snap_files <= snapshot_index)
/* sort by lsn desc */
qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
/* and take cutoff at logical_replication_max_snap_files */
if (logical_replication_max_snap_files > lsns_num)
cutoff = 0;
/* have less files than cutoff */
else
{
cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn;
elog(LOG,
"ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d",
LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files);
cutoff = lsns[logical_replication_max_snap_files - 1];
elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
}
/* Is the size of the logical snapshots directory larger than specified?
*
* It's possible we could hit both thresholds, so remove any extra files
* first, and then truncate based on size of the remaining files.
*/
if (logicalsnapdir_size > logical_replication_max_logicalsnapdir_size_bytes)
{
/* Unfortunately, iterating the directory does not guarantee any order
* so we can't cache an index in the preceding loop.
*/
off_t sz;
const XLogRecPtr original = cutoff;
sz = snapshot_descriptors[0].sz;
for (size_t i = 1; i < logical_replication_max_snap_files; ++i)
{
if (sz > logical_replication_max_logicalsnapdir_size_bytes)
{
cutoff = snapshot_descriptors[i - 1].lsn;
break;
}
sz += snapshot_descriptors[i].sz;
}
if (cutoff != original)
elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB",
LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size);
}
pfree(snapshot_descriptors);
pfree(lsns);
FreeDir(dirdesc);
return cutoff;
#undef SNAPDIR
}
void
@@ -189,16 +118,6 @@ InitLogicalReplicationMonitor(void)
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"neon.logical_replication_max_logicalsnapdir_size",
"Maximum allowed size of the pg_logical/snapshots directory (KB). When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
NULL,
&logical_replication_max_logicalsnapdir_size,
8000, -1, INT_MAX,
PGC_SIGHUP,
GUC_UNIT_KB,
NULL, NULL, NULL);
memset(&bgw, 0, sizeof(bgw));
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
@@ -243,7 +162,7 @@ LogicalSlotsMonitorMain(Datum main_arg)
* If there are too many .snap files, just drop all logical slots to
* prevent aux files bloat.
*/
cutoff_lsn = get_snapshots_cutoff_lsn();
cutoff_lsn = get_num_snap_files_lsn_threshold();
if (cutoff_lsn > 0)
{
for (int i = 0; i < max_replication_slots; i++)

View File

@@ -1,450 +0,0 @@
use std::sync::Arc;
use anyhow::bail;
use aws_config::environment::EnvironmentVariableCredentialsProvider;
use aws_config::imds::credentials::ImdsCredentialsProvider;
use aws_config::meta::credentials::CredentialsProviderChain;
use aws_config::meta::region::RegionProviderChain;
use aws_config::profile::ProfileFileCredentialsProvider;
use aws_config::provider_config::ProviderConfig;
use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
use aws_config::Region;
use clap::{Parser, ValueEnum};
use proxy::config::{self, remote_storage_from_toml, ProxyProtocolV2};
use proxy::context::parquet::ParquetUploadArgs;
use proxy::rate_limiter::RateBucketInfo;
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use proxy::redis::elasticache;
use redis::streams::{StreamReadOptions, StreamReadReply};
use redis::{AsyncCommands, FromRedisValue, Value};
use remote_storage::RemoteStorageConfig;
use serde::{Deserialize, Serialize};
use tracing::warn;
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
#[derive(Clone, Debug, ValueEnum)]
enum AuthBackendType {
#[value(name("console"), alias("cplane"))]
ControlPlane,
#[value(name("link"), alias("control-redirect"))]
ConsoleRedirect,
#[cfg(feature = "testing")]
Postgres,
}
/// Neon proxy/router
#[derive(Parser)]
struct ProxyCliArgs {
/// Name of the region this proxy is deployed in
#[clap(long, default_value_t = String::new())]
region: String,
/// listen for incoming client connections on ip:port
#[clap(short, long, default_value = "127.0.0.1:4432")]
proxy: String,
#[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)]
auth_backend: AuthBackendType,
/// listen for management callback connection on ip:port
#[clap(short, long, default_value = "127.0.0.1:7000")]
mgmt: String,
/// listen for incoming http connections (metrics, etc) on ip:port
#[clap(long, default_value = "127.0.0.1:7001")]
http: String,
/// listen for incoming wss connections on ip:port
#[clap(long)]
wss: Option<String>,
/// redirect unauthenticated users to the given uri in case of console redirect auth
#[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
uri: String,
/// cloud API endpoint for authenticating users
#[clap(
short,
long,
default_value = "http://localhost:3000/authenticate_proxy_request/"
)]
auth_endpoint: String,
/// JWT used to connect to control plane.
#[clap(
long,
value_name = "JWT",
default_value = "",
env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN"
)]
control_plane_token: Arc<str>,
/// if this is not local proxy, this toggles whether we accept jwt or passwords for http
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
is_auth_broker: bool,
/// path to TLS key for client postgres connections
///
/// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
#[clap(short = 'k', long, alias = "ssl-key")]
tls_key: Option<String>,
/// path to TLS cert for client postgres connections
///
/// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
#[clap(short = 'c', long, alias = "ssl-cert")]
tls_cert: Option<String>,
/// path to directory with TLS certificates for client postgres connections
#[clap(long)]
certs_dir: Option<String>,
/// timeout for the TLS handshake
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
handshake_timeout: tokio::time::Duration,
/// http endpoint to receive periodic metric updates
#[clap(long)]
metric_collection_endpoint: Option<String>,
/// how often metrics should be sent to a collection endpoint
#[clap(long)]
metric_collection_interval: Option<String>,
/// cache for `wake_compute` api method (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
wake_compute_cache: String,
/// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
wake_compute_lock: String,
/// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
connect_compute_lock: String,
/// Allow self-signed certificates for compute nodes (for testing)
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
allow_self_signed_compute: bool,
#[clap(flatten)]
sql_over_http: SqlOverHttpArgs,
/// timeout for scram authentication protocol
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
scram_protocol_timeout: tokio::time::Duration,
/// size of the threadpool for password hashing
#[clap(long, default_value_t = 4)]
scram_thread_pool_size: u8,
/// Endpoint rate limiter max number of requests per second.
///
/// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
/// Can be given multiple times for different bucket sizes.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
endpoint_rps_limit: Vec<RateBucketInfo>,
/// Wake compute rate limiter max number of requests per second.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
wake_compute_limit: Vec<RateBucketInfo>,
/// Whether the auth rate limiter actually takes effect (for testing)
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
auth_rate_limit_enabled: bool,
/// Authentication rate limiter max number of hashes per second.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
auth_rate_limit: Vec<RateBucketInfo>,
/// The IP subnet to use when considering whether two IP addresses are considered the same.
#[clap(long, default_value_t = 64)]
auth_rate_limit_ip_subnet: u8,
/// Redis rate limiter max number of requests per second.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
redis_rps_limit: Vec<RateBucketInfo>,
/// cache for `allowed_ips` (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
allowed_ips_cache: String,
/// cache for `role_secret` (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
role_secret_cache: String,
/// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
#[clap(long)]
redis_notifications: Option<String>,
/// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain".
#[clap(long, default_value = "irsa")]
redis_auth_type: String,
/// redis host for streaming connections (might be different from the notifications host)
#[clap(long)]
redis_host: Option<String>,
/// redis port for streaming connections (might be different from the notifications host)
#[clap(long)]
redis_port: Option<u16>,
/// redis cluster name, used in aws elasticache
#[clap(long)]
redis_cluster_name: Option<String>,
/// redis user_id, used in aws elasticache
#[clap(long)]
redis_user_id: Option<String>,
/// aws region to retrieve credentials
#[clap(long, default_value_t = String::new())]
aws_region: String,
/// cache for `project_info` (use `size=0` to disable)
#[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
project_info_cache: String,
/// cache for all valid endpoints
#[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)]
endpoint_cache_config: String,
#[clap(flatten)]
parquet_upload: ParquetUploadArgs,
/// interval for backup metric collection
#[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
metric_backup_collection_interval: std::time::Duration,
/// remote storage configuration for backup metric collection
/// Encoded as toml (same format as pageservers), eg
/// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
#[clap(long, value_parser = remote_storage_from_toml)]
metric_backup_collection_remote_storage: Option<RemoteStorageConfig>,
/// chunk size for backup metric collection
/// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
#[clap(long, default_value = "4194304")]
metric_backup_collection_chunk_size: usize,
/// Whether to retry the connection to the compute node
#[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
connect_to_compute_retry: String,
/// Whether to retry the wake_compute request
#[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)]
wake_compute_retry: String,
/// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
is_private_access_proxy: bool,
/// Configure whether all incoming requests have a Proxy Protocol V2 packet.
// TODO(conradludgate): switch default to rejected or required once we've updated all deployments
#[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)]
proxy_protocol_v2: ProxyProtocolV2,
/// Time the proxy waits for the webauth session to be confirmed by the control plane.
// TODO: rename to `console_redirect_confirmation_timeout`.
#[clap(long, default_value = "2m", value_parser = humantime::parse_duration)]
webauth_confirmation_timeout: std::time::Duration,
}
#[derive(clap::Args, Clone, Copy, Debug)]
struct SqlOverHttpArgs {
/// timeout for http connection requests
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
sql_over_http_timeout: tokio::time::Duration,
/// Whether the SQL over http pool is opt-in
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
sql_over_http_pool_opt_in: bool,
/// How many connections to pool for each endpoint. Excess connections are discarded
#[clap(long, default_value_t = 20)]
sql_over_http_pool_max_conns_per_endpoint: usize,
/// How many connections to pool for each endpoint. Excess connections are discarded
#[clap(long, default_value_t = 20000)]
sql_over_http_pool_max_total_conns: usize,
/// How long pooled connections should remain idle for before closing
#[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
sql_over_http_idle_timeout: tokio::time::Duration,
/// Duration each shard will wait on average before a GC sweep.
/// A longer time will causes sweeps to take longer but will interfere less frequently.
#[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
sql_over_http_pool_gc_epoch: tokio::time::Duration,
/// How many shards should the global pool have. Must be a power of two.
/// More shards will introduce less contention for pool operations, but can
/// increase memory used by the pool
#[clap(long, default_value_t = 128)]
sql_over_http_pool_shards: usize,
#[clap(long, default_value_t = 10000)]
sql_over_http_client_conn_threshold: u64,
#[clap(long, default_value_t = 64)]
sql_over_http_cancel_set_shards: usize,
#[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
sql_over_http_max_request_size_bytes: u64,
#[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
sql_over_http_max_response_size_bytes: usize,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let _logging_guard = proxy::logging::init().await?;
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
let args = ProxyCliArgs::parse();
let region_provider =
RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone()));
let provider_conf =
ProviderConfig::without_region().with_region(region_provider.region().await);
let aws_credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new())
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
.or_else(
"profile-sso",
ProfileFileCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
// needed to access remote extensions bucket
.or_else(
"token",
WebIdentityTokenCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses imds v2
.or_else("imds", ImdsCredentialsProvider::builder().build())
};
let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new(
elasticache::AWSIRSAConfig::new(
args.aws_region.clone(),
args.redis_cluster_name,
args.redis_user_id,
),
aws_credentials_provider,
));
let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) {
("plain", redis_url) => match redis_url {
None => {
bail!("plain auth requires redis_notifications to be set");
}
Some(url) => Some(
ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()),
),
},
("irsa", _) => match (&args.redis_host, args.redis_port) {
(Some(host), Some(port)) => Some(
ConnectionWithCredentialsProvider::new_with_credentials_provider(
host.to_string(),
port,
elasticache_credentials_provider.clone(),
),
),
(None, None) => {
warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client");
None
}
_ => {
bail!("redis-host and redis-port must be specified together");
}
},
_ => {
bail!("unknown auth type given");
}
};
let endpoint_cache_config: config::EndpointCacheConfig = args.endpoint_cache_config.parse()?;
let Some(mut regional_redis_client) = regional_redis_client else {
bail!("no regional_redis_client");
};
if let Err(e) = regional_redis_client.connect().await {
bail!("error connecting to redis: {:?}", e);
}
let mut last_id = "0-0".to_string();
batch_read(
&mut regional_redis_client,
endpoint_cache_config.stream_name,
StreamReadOptions::default().count(endpoint_cache_config.default_batch_size),
&mut last_id,
true,
|event| {
let json = serde_json::to_string(&event)?;
println!("{}", json);
Ok(())
},
)
.await?;
Ok(())
}
// TODO: this could be an enum, but events in Redis need to be fixed first.
// ProjectCreated was sent with type:branch_created. So we ignore type.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
struct CPlaneEvent {
id: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
endpoint_created: Option<EndpointCreated>,
#[serde(skip_serializing_if = "Option::is_none")]
branch_created: Option<BranchCreated>,
#[serde(skip_serializing_if = "Option::is_none")]
project_created: Option<ProjectCreated>,
#[serde(rename = "type")]
_type: Option<String>,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
struct EndpointCreated {
endpoint_id: String,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
struct BranchCreated {
branch_id: String,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
struct ProjectCreated {
project_id: String,
}
impl TryFrom<&Value> for CPlaneEvent {
type Error = anyhow::Error;
fn try_from(value: &Value) -> Result<Self, Self::Error> {
let json = String::from_redis_value(value)?;
Ok(serde_json::from_str(&json)?)
}
}
async fn batch_read(
conn: &mut ConnectionWithCredentialsProvider,
stream_name: String,
opts: StreamReadOptions,
last_id: &mut String,
return_when_finish: bool,
mut insert_event: impl FnMut(CPlaneEvent) -> anyhow::Result<()>,
) -> anyhow::Result<()> {
let mut total: usize = 0;
loop {
let mut res: StreamReadReply = conn
.xread_options(&[&stream_name], &[last_id.as_str()], &opts)
.await?;
if res.keys.is_empty() {
if return_when_finish {
if total != 0 {
break;
}
anyhow::bail!(
"Redis stream {} is empty, cannot be used to filter endpoints",
stream_name
);
}
// If we are not returning when finish, we should wait for more data.
continue;
}
if res.keys.len() != 1 {
anyhow::bail!("Cannot read from redis stream {}", stream_name);
}
let key = res.keys.pop().expect("Checked length above");
for stream_id in key.ids {
total += 1;
for value in stream_id.map.values() {
match value.try_into() {
Ok::<CPlaneEvent, _>(mut event) => {
event.id = Some(stream_id.id.clone());
insert_event(event)?;
}
Err(err) => {
tracing::error!("error parsing value {value:?}: {err:?}");
}
};
}
if total.is_power_of_two() {
tracing::debug!("endpoints read {}", total);
}
*last_id = stream_id.id;
}
}
tracing::info!("read {} endpoints/branches/projects from redis", total);
Ok(())
}

View File

@@ -80,7 +80,7 @@ impl ConnectionWithCredentialsProvider {
redis::cmd("PING").query_async(con).await
}
pub async fn connect(&mut self) -> anyhow::Result<()> {
pub(crate) async fn connect(&mut self) -> anyhow::Result<()> {
let _guard = self.mutex.lock().await;
if let Some(con) = self.con.as_mut() {
match Self::ping(con).await {

View File

@@ -127,29 +127,23 @@ pub struct PhysicalStorage {
/// - doesn't point to the end of the segment
file: Option<File>,
/// When true, WAL truncation potentially has been interrupted and we need
/// to finish it before allowing WAL writes; see truncate_wal for details.
/// In this case [`write_lsn`] can be less than actually written WAL on
/// disk. In particular, there can be a case with unexpected .partial file.
/// When false, we have just initialized storage using the LSN from find_end_of_wal().
/// In this case, [`write_lsn`] can be less than actually written WAL on disk. In particular,
/// there can be a case with unexpected .partial file.
///
/// Imagine the following:
/// - 000000010000000000000001
/// - it was fully written, but the last record is split between 2
/// segments
/// - after restart, `find_end_of_wal()` returned 0/1FFFFF0, which is in
/// the end of this segment
/// - `write_lsn`, `write_record_lsn` and `flush_record_lsn` were
/// initialized to 0/1FFFFF0
/// - it was fully written, but the last record is split between 2 segments
/// - after restart, `find_end_of_wal()` returned 0/1FFFFF0, which is in the end of this segment
/// - `write_lsn`, `write_record_lsn` and `flush_record_lsn` were initialized to 0/1FFFFF0
/// - 000000010000000000000002.partial
/// - it has only 1 byte written, which is not enough to make a full WAL
/// record
/// - it has only 1 byte written, which is not enough to make a full WAL record
///
/// Partial segment 002 has no WAL records, and it will be removed by the
/// next truncate_wal(). This flag will be set to true after the first
/// truncate_wal() call.
/// Partial segment 002 has no WAL records, and it will be removed by the next truncate_wal().
/// This flag will be set to true after the first truncate_wal() call.
///
/// [`write_lsn`]: Self::write_lsn
pending_wal_truncation: bool,
is_truncated_after_restart: bool,
}
impl PhysicalStorage {
@@ -214,7 +208,7 @@ impl PhysicalStorage {
flush_record_lsn: flush_lsn,
decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000),
file: None,
pending_wal_truncation: true,
is_truncated_after_restart: false,
})
}
@@ -411,13 +405,6 @@ impl Storage for PhysicalStorage {
startpos
);
}
if self.pending_wal_truncation {
bail!(
"write_wal called with pending WAL truncation, write_lsn={}, startpos={}",
self.write_lsn,
startpos
);
}
let write_seconds = time_io_closure(self.write_exact(startpos, buf)).await?;
// WAL is written, updating write metrics
@@ -492,34 +479,15 @@ impl Storage for PhysicalStorage {
);
}
// Quick exit if nothing to do and we know that the state is clean to
// avoid writing up to 16 MiB of zeros on disk (this happens on each
// connect).
if !self.pending_wal_truncation
// Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on
// disk (this happens on each connect).
if self.is_truncated_after_restart
&& end_pos == self.write_lsn
&& end_pos == self.flush_record_lsn
{
return Ok(());
}
// Atomicity: we start with LSNs reset because once on disk deletion is
// started it can't be reversed. However, we might crash/error in the
// middle, leaving garbage above the truncation point. In theory,
// concatenated with previous records it might form bogus WAL (though
// very unlikely in practice because CRC would guard from that). To
// protect, set pending_wal_truncation flag before beginning: it means
// truncation must be retried and WAL writes are prohibited until it
// succeeds. Flag is also set on boot because we don't know if the last
// state was clean.
//
// Protocol (HandleElected before first AppendRequest) ensures we'll
// always try to ensure clean truncation before any writes.
self.pending_wal_truncation = true;
self.write_lsn = end_pos;
self.write_record_lsn = end_pos;
self.flush_record_lsn = end_pos;
// Close previously opened file, if any
if let Some(unflushed_file) = self.file.take() {
self.fdatasync_file(&unflushed_file).await?;
@@ -545,7 +513,11 @@ impl Storage for PhysicalStorage {
fs::rename(wal_file_path, wal_file_partial_path).await?;
}
self.pending_wal_truncation = false;
// Update LSNs
self.write_lsn = end_pos;
self.write_record_lsn = end_pos;
self.flush_record_lsn = end_pos;
self.is_truncated_after_restart = true;
Ok(())
}

View File

@@ -46,8 +46,3 @@ class EndpointHttpClient(requests.Session):
)
res.raise_for_status()
return res.json()
def metrics(self) -> str:
res = self.get(f"http://localhost:{self.port}/metrics")
res.raise_for_status()
return res.text

View File

@@ -5,8 +5,6 @@ from typing import TYPE_CHECKING, cast, final
import requests
from fixtures.log_helper import log
if TYPE_CHECKING:
from typing import Any, Literal, Optional
@@ -32,11 +30,7 @@ class NeonAPI:
kwargs["headers"] = {}
kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}"
resp = requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs)
log.debug("%s %s returned a %d: %s", method, endpoint, resp.status_code, resp.text)
resp.raise_for_status()
return resp
return requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs)
def create_project(
self,
@@ -72,6 +66,8 @@ class NeonAPI:
json=data,
)
assert resp.status_code == 201
return cast("dict[str, Any]", resp.json())
def get_project_details(self, project_id: str) -> dict[str, Any]:
@@ -83,7 +79,7 @@ class NeonAPI:
"Content-Type": "application/json",
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def delete_project(
@@ -99,6 +95,8 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def start_endpoint(
@@ -114,6 +112,8 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def suspend_endpoint(
@@ -129,6 +129,8 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def restart_endpoint(
@@ -144,6 +146,8 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def create_endpoint(
@@ -174,6 +178,8 @@ class NeonAPI:
json=data,
)
assert resp.status_code == 201
return cast("dict[str, Any]", resp.json())
def get_connection_uri(
@@ -200,6 +206,8 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def get_branches(self, project_id: str) -> dict[str, Any]:
@@ -211,6 +219,8 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def get_endpoints(self, project_id: str) -> dict[str, Any]:
@@ -222,6 +232,8 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def get_operations(self, project_id: str) -> dict[str, Any]:
@@ -234,6 +246,8 @@ class NeonAPI:
},
)
assert resp.status_code == 200
return cast("dict[str, Any]", resp.json())
def wait_for_operation_to_finish(self, project_id: str):

View File

@@ -1065,9 +1065,6 @@ class NeonEnv:
"http_auth_type": http_auth_type,
# Default which can be overriden with `NeonEnvBuilder.pageserver_config_override`
"availability_zone": "us-east-2a",
# Disable pageserver disk syncs in tests: when running tests concurrently, this avoids
# the pageserver taking a long time to start up due to syncfs flushing other tests' data
"no_sync": True,
}
if self.pageserver_virtual_file_io_engine is not None:
ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine

View File

@@ -149,16 +149,12 @@ def test_subscriber_lag(
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
with (
psycopg2.connect(pub_connstr) as pub_conn,
psycopg2.connect(sub_connstr) as sub_conn,
):
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
@@ -210,7 +206,6 @@ def test_publisher_restart(
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'")
pub_exists = len(pub_cur.fetchall()) != 0
@@ -227,7 +222,6 @@ def test_publisher_restart(
sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")
initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
@@ -254,17 +248,12 @@ def test_publisher_restart(
["pgbench", "-c10", pgbench_duration, "-Mprepared"],
env=pub_env,
)
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
with (
psycopg2.connect(pub_connstr) as pub_conn,
psycopg2.connect(sub_connstr) as sub_conn,
):
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
@@ -299,56 +288,58 @@ def test_snap_files(
env = benchmark_project_pub.pgbench_env
connstr = benchmark_project_pub.connstr
conn = psycopg2.connect(connstr)
conn.autocommit = True
with conn.cursor() as cur:
cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
is_super = cast("bool", cur.fetchall()[0][0])
assert is_super, "This benchmark won't work if we don't have superuser"
conn.close()
with psycopg2.connect(connstr) as conn:
conn.autocommit = True
with conn.cursor() as cur:
cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
is_super = cast("bool", cur.fetchall()[0][0])
assert is_super, "This benchmark won't work if we don't have superuser"
pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=env)
conn = psycopg2.connect(connstr)
conn.autocommit = True
cur = conn.cursor()
cur.execute("ALTER SYSTEM SET neon.logical_replication_max_snap_files = -1")
with conn.cursor() as cur:
cur.execute(
with psycopg2.connect(connstr) as conn:
conn.autocommit = True
with conn.cursor() as cur:
cur.execute("SELECT pg_reload_conf()")
with psycopg2.connect(connstr) as conn:
conn.autocommit = True
with conn.cursor() as cur:
cur.execute(
"""
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_replication_slots
WHERE slot_name = 'slotter'
) THEN
PERFORM pg_drop_replication_slot('slotter');
END IF;
END $$;
"""
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_replication_slots
WHERE slot_name = 'slotter'
) THEN
PERFORM pg_drop_replication_slot('slotter');
END IF;
END $$;
"""
)
cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")
conn.close()
)
cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")
workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env)
try:
start = time.time()
prev_measurement = time.time()
while time.time() - start < test_duration_min * 60:
conn = psycopg2.connect(connstr)
conn.autocommit = True
with conn.cursor() as cur:
cur.execute(
"SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
)
check_pgbench_still_running(workload)
cur.execute("SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())")
conn.close()
with psycopg2.connect(connstr) as conn:
with conn.cursor() as cur:
cur.execute(
"SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
)
check_pgbench_still_running(workload)
cur.execute(
"SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())"
)
# Measure storage
if time.time() - prev_measurement > test_interval_min * 60:

View File

@@ -102,21 +102,15 @@ def test_ro_replica_lag(
check_pgbench_still_running(master_workload)
check_pgbench_still_running(replica_workload)
time.sleep(sync_interval_min * 60)
conn_master = psycopg2.connect(master_connstr)
conn_replica = psycopg2.connect(replica_connstr)
conn_master.autocommit = True
conn_replica.autocommit = True
with (
conn_master.cursor() as cur_master,
conn_replica.cursor() as cur_replica,
psycopg2.connect(master_connstr) as conn_master,
psycopg2.connect(replica_connstr) as conn_replica,
):
lag = measure_replication_lag(cur_master, cur_replica)
conn_master.close()
conn_replica.close()
with (
conn_master.cursor() as cur_master,
conn_replica.cursor() as cur_replica,
):
lag = measure_replication_lag(cur_master, cur_replica)
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
finally:
@@ -225,15 +219,11 @@ def test_replication_start_stop(
pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10"], env=master_env)
# Sync replicas
conn_master = psycopg2.connect(master_connstr)
conn_master.autocommit = True
with conn_master.cursor() as cur_master:
for i in range(num_replicas):
conn_replica = psycopg2.connect(replica_connstr[i])
measure_replication_lag(cur_master, conn_replica.cursor())
conn_master.close()
with psycopg2.connect(master_connstr) as conn_master:
with conn_master.cursor() as cur_master:
for i in range(num_replicas):
conn_replica = psycopg2.connect(replica_connstr[i])
measure_replication_lag(cur_master, conn_replica.cursor())
master_pgbench = pg_bin.run_nonblocking(
[
@@ -287,22 +277,17 @@ def test_replication_start_stop(
time.sleep(configuration_test_time_sec)
conn_master = psycopg2.connect(master_connstr)
conn_master.autocommit = True
with conn_master.cursor() as cur_master:
for ireplica in range(num_replicas):
replica_conn = psycopg2.connect(replica_connstr[ireplica])
lag = measure_replication_lag(cur_master, replica_conn.cursor())
zenbenchmark.record(
f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
)
log.info(
f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}"
)
conn_master.close()
with psycopg2.connect(master_connstr) as conn_master:
with conn_master.cursor() as cur_master:
for ireplica in range(num_replicas):
replica_conn = psycopg2.connect(replica_connstr[ireplica])
lag = measure_replication_lag(cur_master, replica_conn.cursor())
zenbenchmark.record(
f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
)
log.info(
f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}"
)
master_pgbench.terminate()
except Exception as e:
error_occurred = True

View File

@@ -1,14 +1,6 @@
from __future__ import annotations
import time
from logging import info
from typing import TYPE_CHECKING
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv
from fixtures.neon_fixtures import NeonEnv
def test_installed_extensions(neon_simple_env: NeonEnv):
@@ -93,52 +85,3 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
assert ext["n_databases"] == 2
ext["versions"].sort()
assert ext["versions"] == ["1.2", "1.3"]
# check that /metrics endpoint is available
# ensure that we see the metric before and after restart
res = client.metrics()
info("Metrics: %s", res)
m = parse_metrics(res)
neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"})
assert len(neon_m) == 1
for sample in neon_m:
assert sample.value == 2
neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"})
assert len(neon_m) == 1
for sample in neon_m:
assert sample.value == 1
endpoint.stop()
endpoint.start()
timeout = 10
while timeout > 0:
try:
res = client.metrics()
timeout = -1
if len(parse_metrics(res).query_all("installed_extensions")) < 4:
# Assume that not all metrics that are collected yet
time.sleep(1)
timeout -= 1
continue
except Exception:
log.exception("failed to get metrics, assume they are not collected yet")
time.sleep(1)
timeout -= 1
continue
assert (
len(parse_metrics(res).query_all("installed_extensions")) >= 4
), "Not all metrics are collected"
info("After restart metrics: %s", res)
m = parse_metrics(res)
neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"})
assert len(neon_m) == 1
for sample in neon_m:
assert sample.value == 1
neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"})
assert len(neon_m) == 1
for sample in neon_m:
assert sample.value == 1

View File

@@ -35,10 +35,9 @@ from fixtures.pageserver.utils import (
wait_for_upload,
)
from fixtures.remote_storage import (
LocalFsStorage,
RemoteStorageKind,
)
from fixtures.utils import run_only_on_default_postgres, wait_until
from fixtures.utils import wait_until
from fixtures.workload import Workload
if TYPE_CHECKING:
@@ -729,68 +728,3 @@ def test_upgrade_generationless_local_file_paths(
)
# We should download into the same local path we started with
assert os.path.exists(victim_path)
@run_only_on_default_postgres("Only tests index logic")
def test_old_index_time_threshold(
neon_env_builder: NeonEnvBuilder,
):
"""
Exercise pageserver's detection of trying to load an ancient non-latest index.
(see https://github.com/neondatabase/neon/issues/6951)
"""
# Run with local_fs because we will interfere with mtimes by local filesystem access
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(32)
# Remember generation 1's index path
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
index_path = env.pageserver_remote_storage.index_path(tenant_id, timeline_id)
# Increment generation by detaching+attaching, and write+flush some data to get a new remote index
env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})
env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}})
env.storage_controller.reconcile_until_idle()
workload.churn_rows(32)
# A new index should have been written
assert env.pageserver_remote_storage.index_path(tenant_id, timeline_id) != index_path
# Hack the mtime on the generation 1 index
log.info(f"Setting old mtime on {index_path}")
os.utime(index_path, times=(time.time(), time.time() - 30 * 24 * 3600))
env.pageserver.allowed_errors.extend(
[
".*Found a newer index while loading an old one.*",
".*Index age exceeds threshold and a newer index exists.*",
]
)
# Detach from storage controller + attach in an old generation directly on the pageserver.
workload.stop()
env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})
env.storage_controller.reconcile_until_idle()
env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"})
env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy")
# The controller would not do this (attach in an old generation): we are doing it to simulate
# a hypothetical profound bug in the controller.
env.pageserver.http_client().tenant_location_conf(
tenant_id, {"generation": 1, "mode": "AttachedSingle", "tenant_conf": {}}
)
# The pageserver should react to this situation by refusing to attach the tenant and putting
# it into Broken state
env.pageserver.allowed_errors.append(".*tenant is broken.*")
with pytest.raises(
PageserverApiException,
match="tenant is broken: Index age exceeds threshold and a newer index exists",
):
env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)

View File

@@ -5,8 +5,7 @@ import time
from fixtures.neon_fixtures import NeonEnv, logical_replication_sync
def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonEnv, vanilla_pg):
"""Test read replica of a primary which has a logical replication publication"""
def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env
n_records = 100000
@@ -14,6 +13,7 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE
primary = env.endpoints.create_start(
branch_name="main",
endpoint_id="primary",
config_lines=["min_wal_size=32MB", "max_wal_size=64MB"],
)
p_con = primary.connect()
p_cur = p_con.cursor()
@@ -30,6 +30,7 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE
secondary = env.endpoints.new_replica_start(
origin=primary,
endpoint_id="secondary",
config_lines=["min_wal_size=32MB", "max_wal_size=64MB"],
)
s_con = secondary.connect()
@@ -47,51 +48,3 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE
# Check that LR slot is not copied to replica
s_cur.execute("select count(*) from pg_replication_slots")
assert s_cur.fetchall()[0][0] == 0
def test_aux_not_logged_at_replica(neon_simple_env: NeonEnv, vanilla_pg):
"""Test that AUX files are not saved at replica"""
env = neon_simple_env
n_records = 20000
primary = env.endpoints.create_start(
branch_name="main",
endpoint_id="primary",
)
p_con = primary.connect()
p_cur = p_con.cursor()
p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))")
p_cur.execute("create publication pub1 for table t")
# start subscriber
vanilla_pg.start()
vanilla_pg.safe_psql("CREATE TABLE t(pk bigint primary key, payload text)")
connstr = primary.connstr().replace("'", "''")
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
for pk in range(n_records):
p_cur.execute("insert into t (pk) values (%s)", (pk,))
# LR snapshot is stored each 15 seconds
time.sleep(16)
# start replica
secondary = env.endpoints.new_replica_start(
origin=primary,
endpoint_id="secondary",
)
s_con = secondary.connect()
s_cur = s_con.cursor()
logical_replication_sync(vanilla_pg, primary)
assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records
s_cur.execute("select count(*) from t")
assert s_cur.fetchall()[0][0] == n_records
vanilla_pg.stop()
secondary.stop()
primary.stop()
assert not secondary.log_contains("cannot make new WAL entries during recovery")

View File

@@ -427,7 +427,7 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
env.pageserver.start()
for f in futs:
f.result(timeout=30)
f.result(timeout=10)
# The tenant should end up active
wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1)

16
vendor/revisions.json vendored
View File

@@ -1,18 +1,18 @@
{
"v17": [
"17.1",
"aa2e29f2b6952140dfe51876bbd11054acae776f"
"17.0",
"9ad2f3c5c37c08069a01c1e3f6b7cf275437e0cb"
],
"v16": [
"16.5",
"b0b693ea298454e95e6b154780d1fd586a244dfd"
"16.4",
"e131a9c027b202ce92bd7b9cf2569d48a6f9948e"
],
"v15": [
"15.9",
"1feff6b60f07cb71b665d0f5ead71a4320a71743"
"15.8",
"22e580fe9ffcea7e02592110b1c9bf426d83cada"
],
"v14": [
"14.14",
"c5e0d642efb02e4bfedc283b0a7707fe6c79cc89"
"14.13",
"2199b83fb72680001ce0f43bf6187a21dfb8f45d"
]
}