mirror of
https://github.com/neondatabase/neon.git
synced 2026-04-28 03:40:37 +00:00
Compare commits
29 Commits
arpad/remo
...
arpad/run_
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d38bc02bdf | ||
|
|
24bc6ddec4 | ||
|
|
f49fe734d1 | ||
|
|
872e645f7d | ||
|
|
648fe7c92d | ||
|
|
21045477a3 | ||
|
|
125f24ca49 | ||
|
|
443d4ce868 | ||
|
|
3290fb09bf | ||
|
|
efdb2bf948 | ||
|
|
5559b16953 | ||
|
|
1aea65eb9d | ||
|
|
34ddec67d9 | ||
|
|
9ace36d93c | ||
|
|
e4898a6e60 | ||
|
|
c77981289c | ||
|
|
f003dd6ad5 | ||
|
|
7e7e9f5191 | ||
|
|
760a48207d | ||
|
|
88df057531 | ||
|
|
c65ac37a6d | ||
|
|
a092127b17 | ||
|
|
e8f773387d | ||
|
|
00936d19e1 | ||
|
|
57155ada77 | ||
|
|
02b916d3c9 | ||
|
|
e6e013b3b7 | ||
|
|
bd19290d9f | ||
|
|
a584e300d1 |
@@ -1,2 +1,2 @@
|
||||
[profile.default]
|
||||
slow-timeout = "1m"
|
||||
slow-timeout = { period = "20s", terminate-after = 3 }
|
||||
|
||||
5
Cargo.lock
generated
5
Cargo.lock
generated
@@ -3991,6 +3991,7 @@ dependencies = [
|
||||
"url",
|
||||
"utils",
|
||||
"uuid",
|
||||
"walkdir",
|
||||
"webpki-roots 0.25.2",
|
||||
"workspace_hack",
|
||||
"x509-parser",
|
||||
@@ -5030,9 +5031,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "1.1.0"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
|
||||
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook"
|
||||
|
||||
@@ -143,6 +143,8 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
|
||||
#########################################################################################
|
||||
FROM build-deps AS plv8-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
RUN apt update && \
|
||||
apt install -y ninja-build python3-dev libncurses5 binutils clang
|
||||
|
||||
@@ -617,6 +619,7 @@ RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O
|
||||
FROM build-deps AS pg-embedding-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
@@ -779,6 +782,8 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS neon-pg-ext-build
|
||||
ARG PG_VERSION
|
||||
|
||||
# Public extensions
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=postgis-build /sfcgal/* /
|
||||
@@ -883,8 +888,10 @@ FROM debian:bullseye-slim
|
||||
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
echo "postgres:test_console_pass" | chpasswd && \
|
||||
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
mkdir /var/db/postgres/pgbouncer && \
|
||||
chown -R postgres:postgres /var/db/postgres && \
|
||||
chmod 0750 /var/db/postgres/compute && \
|
||||
chmod 0750 /var/db/postgres/pgbouncer && \
|
||||
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
|
||||
# create folder for file cache
|
||||
mkdir -p -m 777 /neon/cache
|
||||
|
||||
@@ -32,8 +32,6 @@
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r http://pg-ext-s3-gateway \
|
||||
//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable'
|
||||
//! --pgbouncer-ini-path /etc/pgbouncer.ini \
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
@@ -112,9 +110,6 @@ fn main() -> Result<()> {
|
||||
let spec_json = matches.get_one::<String>("spec");
|
||||
let spec_path = matches.get_one::<String>("spec-path");
|
||||
|
||||
let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
|
||||
let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
|
||||
|
||||
// Extract OpenTelemetry context for the startup actions from the
|
||||
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
|
||||
// tracing context.
|
||||
@@ -225,8 +220,6 @@ fn main() -> Result<()> {
|
||||
ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
build_tag,
|
||||
pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
|
||||
pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
@@ -523,23 +516,6 @@ fn cli() -> clap::Command {
|
||||
)
|
||||
.value_name("FILECACHE_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbouncer-connstr")
|
||||
.long("pgbouncer-connstr")
|
||||
.default_value(
|
||||
"host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
|
||||
)
|
||||
.value_name("PGBOUNCER_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbouncer-ini-path")
|
||||
.long("pgbouncer-ini-path")
|
||||
// Note: this doesn't match current path for pgbouncer.ini.
|
||||
// Until we fix it, we need to pass the path explicitly
|
||||
// or this will be effectively no-op.
|
||||
.default_value("/etc/pgbouncer.ini")
|
||||
.value_name("PGBOUNCER_INI_PATH"),
|
||||
)
|
||||
}
|
||||
|
||||
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
|
||||
|
||||
@@ -71,10 +71,6 @@ pub struct ComputeNode {
|
||||
// key: ext_archive_name, value: started download time, download_completed?
|
||||
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
|
||||
pub build_tag: String,
|
||||
// connection string to pgbouncer to change settings
|
||||
pub pgbouncer_connstr: Option<String>,
|
||||
// path to pgbouncer.ini to change settings
|
||||
pub pgbouncer_ini_path: Option<String>,
|
||||
}
|
||||
|
||||
// store some metrics about download size that might impact startup time
|
||||
@@ -769,8 +765,8 @@ impl ComputeNode {
|
||||
pub fn reconfigure(&self) -> Result<()> {
|
||||
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
|
||||
|
||||
if let Some(connstr) = &self.pgbouncer_connstr {
|
||||
info!("tuning pgbouncer with connstr: {:?}", connstr);
|
||||
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
@@ -779,15 +775,9 @@ impl ComputeNode {
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = spec.pgbouncer_settings.clone();
|
||||
let connstr_clone = connstr.clone();
|
||||
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(
|
||||
pgbouncer_settings,
|
||||
&connstr_clone,
|
||||
pgbouncer_ini_path,
|
||||
));
|
||||
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
@@ -852,8 +842,8 @@ impl ComputeNode {
|
||||
);
|
||||
|
||||
// tune pgbouncer
|
||||
if let Some(connstr) = &self.pgbouncer_connstr {
|
||||
info!("tuning pgbouncer with connstr: {:?}", connstr);
|
||||
if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
@@ -862,15 +852,9 @@ impl ComputeNode {
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
|
||||
let connstr_clone = connstr.clone();
|
||||
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(
|
||||
pgbouncer_settings,
|
||||
&connstr_clone,
|
||||
pgbouncer_ini_path,
|
||||
));
|
||||
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
|
||||
@@ -366,7 +366,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
|
||||
}
|
||||
|
||||
/// Update pgbouncer.ini with provided options
|
||||
pub fn update_pgbouncer_ini(
|
||||
fn update_pgbouncer_ini(
|
||||
pgbouncer_config: HashMap<String, String>,
|
||||
pgbouncer_ini_path: &str,
|
||||
) -> Result<()> {
|
||||
@@ -375,6 +375,10 @@ pub fn update_pgbouncer_ini(
|
||||
|
||||
for (option_name, value) in pgbouncer_config.iter() {
|
||||
section.insert(option_name, value);
|
||||
debug!(
|
||||
"Updating pgbouncer.ini with new values {}={}",
|
||||
option_name, value
|
||||
);
|
||||
}
|
||||
|
||||
conf.write_to_file(pgbouncer_ini_path)?;
|
||||
@@ -384,49 +388,80 @@ pub fn update_pgbouncer_ini(
|
||||
/// Tune pgbouncer.
|
||||
/// 1. Apply new config using pgbouncer admin console
|
||||
/// 2. Add new values to pgbouncer.ini to preserve them after restart
|
||||
pub async fn tune_pgbouncer(
|
||||
pgbouncer_settings: Option<HashMap<String, String>>,
|
||||
pgbouncer_connstr: &str,
|
||||
pgbouncer_ini_path: Option<String>,
|
||||
) -> Result<()> {
|
||||
if let Some(pgbouncer_config) = pgbouncer_settings {
|
||||
// Apply new config
|
||||
let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await;
|
||||
let (client, connection) = connect_result.unwrap();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result<()> {
|
||||
let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
|
||||
// for VMs use pgbouncer specific way to connect to
|
||||
// pgbouncer admin console without password
|
||||
// when pgbouncer is running under the same user.
|
||||
"host=/tmp port=6432 dbname=pgbouncer user=pgbouncer".to_string()
|
||||
} else {
|
||||
// for k8s use normal connection string with password
|
||||
// to connect to pgbouncer admin console
|
||||
let mut pgbouncer_connstr =
|
||||
"host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string();
|
||||
if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") {
|
||||
pgbouncer_connstr.push_str(format!(" password={}", pass).as_str());
|
||||
}
|
||||
pgbouncer_connstr
|
||||
};
|
||||
|
||||
info!(
|
||||
"Connecting to pgbouncer with connection string: {}",
|
||||
pgbouncer_connstr
|
||||
);
|
||||
|
||||
// connect to pgbouncer, retrying several times
|
||||
// because pgbouncer may not be ready yet
|
||||
let mut retries = 3;
|
||||
let client = loop {
|
||||
match tokio_postgres::connect(&pgbouncer_connstr, NoTls).await {
|
||||
Ok((client, connection)) => {
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
break client;
|
||||
}
|
||||
});
|
||||
Err(e) => {
|
||||
if retries == 0 {
|
||||
return Err(e.into());
|
||||
}
|
||||
error!("Failed to connect to pgbouncer: pgbouncer_connstr {}", e);
|
||||
retries -= 1;
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (option_name, value) in pgbouncer_config.iter() {
|
||||
info!(
|
||||
"Applying pgbouncer setting change: {} = {}",
|
||||
option_name, value
|
||||
// Apply new config
|
||||
for (option_name, value) in pgbouncer_config.iter() {
|
||||
let query = format!("SET {}={}", option_name, value);
|
||||
// keep this log line for debugging purposes
|
||||
info!("Applying pgbouncer setting change: {}", query);
|
||||
|
||||
if let Err(err) = client.simple_query(&query).await {
|
||||
// Don't fail on error, just print it into log
|
||||
error!(
|
||||
"Failed to apply pgbouncer setting change: {}, {}",
|
||||
query, err
|
||||
);
|
||||
let query = format!("SET {} = {}", option_name, value);
|
||||
|
||||
let result = client.simple_query(&query).await;
|
||||
|
||||
info!("Applying pgbouncer setting change: {}", query);
|
||||
info!("pgbouncer setting change result: {:?}", result);
|
||||
|
||||
if let Err(err) = result {
|
||||
// Don't fail on error, just print it into log
|
||||
error!(
|
||||
"Failed to apply pgbouncer setting change: {}, {}",
|
||||
query, err
|
||||
);
|
||||
};
|
||||
}
|
||||
|
||||
// save values to pgbouncer.ini
|
||||
// so that they are preserved after pgbouncer restart
|
||||
if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
|
||||
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// save values to pgbouncer.ini
|
||||
// so that they are preserved after pgbouncer restart
|
||||
let pgbouncer_ini_path = if std::env::var_os("AUTOSCALING").is_some() {
|
||||
// in VMs we use /etc/pgbouncer.ini
|
||||
"/etc/pgbouncer.ini".to_string()
|
||||
} else {
|
||||
// in pods we use /var/db/postgres/pgbouncer/pgbouncer.ini
|
||||
// this is a shared volume between pgbouncer and postgres containers
|
||||
// FIXME: fix permissions for this file
|
||||
"/var/db/postgres/pgbouncer/pgbouncer.ini".to_string()
|
||||
};
|
||||
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -329,8 +329,8 @@ impl CheckPoint {
|
||||
///
|
||||
/// Returns 'true' if the XID was updated.
|
||||
pub fn update_next_xid(&mut self, xid: u32) -> bool {
|
||||
// nextXid should nw greater than any XID in WAL, so increment provided XID and check for wraparround.
|
||||
let mut new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
|
||||
// nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround.
|
||||
let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID);
|
||||
// To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
|
||||
// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
|
||||
new_xid =
|
||||
|
||||
@@ -38,40 +38,6 @@ pub async fn list_dir(path: impl AsRef<Path>) -> anyhow::Result<Vec<String>> {
|
||||
Ok(content)
|
||||
}
|
||||
|
||||
/// Version of [`std::fs::remove_dir_all`] that is idempotent and tolerates parallel removals of the same path or sub-paths
|
||||
///
|
||||
/// The idempotency implies that we return `Ok(())`` even if the file is already gone or has never existed,
|
||||
/// unlike `remove_dir_all` from std/tokio.
|
||||
pub fn remove_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
|
||||
fn strip_not_found<T>(v: io::Result<T>) -> Option<io::Result<T>> {
|
||||
match v {
|
||||
Err(e) if e.kind() == io::ErrorKind::NotFound => None,
|
||||
other => Some(other),
|
||||
}
|
||||
}
|
||||
let Some(list) = strip_not_found(std::fs::read_dir(&path)) else {
|
||||
return Ok(());
|
||||
};
|
||||
for entry in list? {
|
||||
let Some(entry) = strip_not_found(entry) else {
|
||||
continue;
|
||||
};
|
||||
let entry = entry?;
|
||||
let Some(file_type) = strip_not_found(entry.file_type()) else {
|
||||
continue;
|
||||
};
|
||||
if file_type?.is_dir() {
|
||||
remove_dir_all(entry.path())?;
|
||||
} else {
|
||||
strip_not_found(std::fs::remove_file(entry.path())).unwrap_or(Ok(()))?;
|
||||
}
|
||||
}
|
||||
if let Some(res) = strip_not_found(std::fs::remove_dir(path)) {
|
||||
res?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Ok(())
|
||||
|
||||
@@ -108,9 +108,32 @@ pub struct RelTagBlockNo {
|
||||
}
|
||||
|
||||
impl PagestreamClient {
|
||||
pub async fn shutdown(mut self) {
|
||||
let _ = self.cancel_on_client_drop.take();
|
||||
self.conn_task.await.unwrap();
|
||||
pub async fn shutdown(self) {
|
||||
let Self {
|
||||
copy_both,
|
||||
cancel_on_client_drop: cancel_conn_task,
|
||||
conn_task,
|
||||
} = self;
|
||||
// The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
|
||||
// When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
|
||||
// (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
|
||||
//
|
||||
// If we drop(copy_both) first, but then immediately drop the `cancel_on_client_drop`,
|
||||
// the CopyFail mesage only makes it to the socket sometimes (i.e., it's a race).
|
||||
//
|
||||
// Further, the pageserver makes a lot of noise when it receives CopyFail.
|
||||
// Computes don't send it in practice, they just hard-close the connection.
|
||||
//
|
||||
// So, let's behave like the computes and suppress the CopyFail as follows:
|
||||
// kill the socket first, then drop copy_both.
|
||||
//
|
||||
// See also: https://www.postgresql.org/docs/current/protocol-flow.html#PROTOCOL-COPY
|
||||
//
|
||||
// NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
|
||||
// => https://github.com/neondatabase/neon/issues/6390
|
||||
let _ = cancel_conn_task.unwrap();
|
||||
conn_task.await.unwrap();
|
||||
drop(copy_both);
|
||||
}
|
||||
|
||||
pub async fn getpage(
|
||||
|
||||
@@ -404,23 +404,27 @@ async fn client(
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
while let Some(req) =
|
||||
tokio::select! { work = work.recv() => { work } , _ = cancel.cancelled() => { return; } }
|
||||
{
|
||||
let start = Instant::now();
|
||||
|
||||
let res = tokio::select! {
|
||||
res = client.getpage(req) => { res },
|
||||
_ = cancel.cancelled() => { return; }
|
||||
};
|
||||
res.with_context(|| format!("getpage for {timeline}"))
|
||||
.unwrap();
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
let do_requests = async {
|
||||
start_work_barrier.wait().await;
|
||||
while let Some(req) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
client
|
||||
.getpage(req)
|
||||
.await
|
||||
.with_context(|| format!("getpage for {timeline}"))
|
||||
.unwrap();
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
};
|
||||
tokio::select! {
|
||||
res = do_requests => { res },
|
||||
_ = cancel.cancelled() => {
|
||||
client.shutdown().await;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,6 +35,7 @@ fn main() {
|
||||
logging::Output::Stderr,
|
||||
)
|
||||
.unwrap();
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
|
||||
let args = Args::parse();
|
||||
match args {
|
||||
|
||||
@@ -386,39 +386,56 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
// If we get far enough in the list that we start to evict layers that are below
|
||||
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
|
||||
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
|
||||
let mut warned = None;
|
||||
let mut usage_planned = usage_pre;
|
||||
let mut evicted_amount = 0;
|
||||
|
||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||
if !usage_planned.has_pressure() {
|
||||
debug!(
|
||||
no_candidates_evicted = i,
|
||||
"took enough candidates for pressure to be relieved"
|
||||
);
|
||||
break;
|
||||
let selection = select_victims(&candidates, usage_pre);
|
||||
|
||||
let mut candidates = candidates;
|
||||
|
||||
let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
|
||||
// we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
|
||||
// for comparison here. this is a temporary measure to develop alternatives.
|
||||
use std::fmt::Write;
|
||||
|
||||
let mut summary_buf = String::with_capacity(256);
|
||||
|
||||
{
|
||||
let absolute_summary = candidates
|
||||
.iter()
|
||||
.take(selection.amount)
|
||||
.map(|(_, candidate)| candidate)
|
||||
.collect::<summary::EvictionSummary>();
|
||||
|
||||
write!(summary_buf, "{absolute_summary}").expect("string grows");
|
||||
|
||||
info!("absolute accessed selection summary: {summary_buf}");
|
||||
}
|
||||
|
||||
if partition == &MinResidentSizePartition::Below && warned.is_none() {
|
||||
warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
|
||||
warned = Some(usage_planned);
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.relative_last_activity)
|
||||
});
|
||||
|
||||
let selection = select_victims(&candidates, usage_pre);
|
||||
|
||||
{
|
||||
summary_buf.clear();
|
||||
|
||||
let relative_summary = candidates
|
||||
.iter()
|
||||
.take(selection.amount)
|
||||
.map(|(_, candidate)| candidate)
|
||||
.collect::<summary::EvictionSummary>();
|
||||
|
||||
write!(summary_buf, "{relative_summary}").expect("string grows");
|
||||
|
||||
info!("relative accessed selection summary: {summary_buf}");
|
||||
}
|
||||
|
||||
usage_planned.add_available_bytes(candidate.layer.get_file_size());
|
||||
evicted_amount += 1;
|
||||
}
|
||||
|
||||
let usage_planned = match warned {
|
||||
Some(respecting_tenant_min_resident_size) => PlannedUsage {
|
||||
respecting_tenant_min_resident_size,
|
||||
fallback_to_global_lru: Some(usage_planned),
|
||||
},
|
||||
None => PlannedUsage {
|
||||
respecting_tenant_min_resident_size: usage_planned,
|
||||
fallback_to_global_lru: None,
|
||||
},
|
||||
selection
|
||||
} else {
|
||||
selection
|
||||
};
|
||||
debug!(?usage_planned, "usage planned");
|
||||
|
||||
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
|
||||
|
||||
// phase2: evict layers
|
||||
|
||||
@@ -796,14 +813,16 @@ async fn collect_eviction_candidates(
|
||||
// A default override can be put in the default tenant conf in the pageserver.toml.
|
||||
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
|
||||
debug!(
|
||||
tenant_id=%tenant.tenant_id(),
|
||||
tenant_id=%tenant.tenant_shard_id().tenant_id,
|
||||
shard_id=%tenant.tenant_shard_id().shard_slug(),
|
||||
overridden_size=s,
|
||||
"using overridden min resident size for tenant"
|
||||
);
|
||||
s
|
||||
} else {
|
||||
debug!(
|
||||
tenant_id=%tenant.tenant_id(),
|
||||
tenant_id=%tenant.tenant_shard_id().tenant_id,
|
||||
shard_id=%tenant.tenant_shard_id().shard_slug(),
|
||||
max_layer_size,
|
||||
"using max layer size as min_resident_size for tenant",
|
||||
);
|
||||
@@ -908,22 +927,80 @@ async fn collect_eviction_candidates(
|
||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||
|
||||
match eviction_order {
|
||||
EvictionOrder::AbsoluteAccessed => {
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.last_activity_ts)
|
||||
});
|
||||
}
|
||||
EvictionOrder::RelativeAccessed { .. } => {
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.relative_last_activity)
|
||||
});
|
||||
}
|
||||
}
|
||||
// always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
|
||||
// will sort later by candidate.relative_last_activity to get compare evictions.
|
||||
candidates
|
||||
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
|
||||
|
||||
Ok(EvictionCandidates::Finished(candidates))
|
||||
}
|
||||
|
||||
/// Given a pre-sorted vec of all layers in the system, select the first N which are enough to
|
||||
/// relieve pressure.
|
||||
///
|
||||
/// Returns the amount of candidates selected, with the planned usage.
|
||||
fn select_victims<U: Usage>(
|
||||
candidates: &[(MinResidentSizePartition, EvictionCandidate)],
|
||||
usage_pre: U,
|
||||
) -> VictimSelection<U> {
|
||||
let mut usage_when_switched = None;
|
||||
let mut usage_planned = usage_pre;
|
||||
let mut evicted_amount = 0;
|
||||
|
||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||
if !usage_planned.has_pressure() {
|
||||
break;
|
||||
}
|
||||
|
||||
if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
|
||||
usage_when_switched = Some((usage_planned, i));
|
||||
}
|
||||
|
||||
usage_planned.add_available_bytes(candidate.layer.get_file_size());
|
||||
evicted_amount += 1;
|
||||
}
|
||||
|
||||
VictimSelection {
|
||||
amount: evicted_amount,
|
||||
usage_pre,
|
||||
usage_when_switched,
|
||||
usage_planned,
|
||||
}
|
||||
}
|
||||
|
||||
struct VictimSelection<U> {
|
||||
amount: usize,
|
||||
usage_pre: U,
|
||||
usage_when_switched: Option<(U, usize)>,
|
||||
usage_planned: U,
|
||||
}
|
||||
|
||||
impl<U: Usage> VictimSelection<U> {
|
||||
fn into_amount_and_planned(self) -> (usize, PlannedUsage<U>) {
|
||||
debug!(
|
||||
evicted_amount=%self.amount,
|
||||
"took enough candidates for pressure to be relieved"
|
||||
);
|
||||
|
||||
if let Some((usage_planned, candidate_no)) = self.usage_when_switched.as_ref() {
|
||||
warn!(usage_pre=?self.usage_pre, ?usage_planned, candidate_no, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
|
||||
}
|
||||
|
||||
let planned = match self.usage_when_switched {
|
||||
Some((respecting_tenant_min_resident_size, _)) => PlannedUsage {
|
||||
respecting_tenant_min_resident_size,
|
||||
fallback_to_global_lru: Some(self.usage_planned),
|
||||
},
|
||||
None => PlannedUsage {
|
||||
respecting_tenant_min_resident_size: self.usage_planned,
|
||||
fallback_to_global_lru: None,
|
||||
},
|
||||
};
|
||||
|
||||
(self.amount, planned)
|
||||
}
|
||||
}
|
||||
|
||||
struct TimelineKey(Arc<Timeline>);
|
||||
|
||||
impl PartialEq for TimelineKey {
|
||||
@@ -1008,6 +1085,137 @@ pub(crate) mod finite_f32 {
|
||||
}
|
||||
}
|
||||
|
||||
mod summary {
|
||||
use super::finite_f32::FiniteF32;
|
||||
use super::{EvictionCandidate, LayerCount};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::time::SystemTime;
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub(super) struct EvictionSummary {
|
||||
evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
|
||||
total: LayerCount,
|
||||
|
||||
last_absolute: Option<SystemTime>,
|
||||
last_relative: Option<FiniteF32>,
|
||||
}
|
||||
|
||||
impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
|
||||
fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
|
||||
let mut summary = EvictionSummary::default();
|
||||
for item in iter {
|
||||
let counts = summary
|
||||
.evicted_per_tenant
|
||||
.entry(*item.layer.get_tenant_shard_id())
|
||||
.or_default();
|
||||
|
||||
let sz = item.layer.get_file_size();
|
||||
|
||||
counts.file_sizes += sz;
|
||||
counts.count += 1;
|
||||
|
||||
summary.total.file_sizes += sz;
|
||||
summary.total.count += 1;
|
||||
|
||||
summary.last_absolute = Some(item.last_activity_ts);
|
||||
summary.last_relative = Some(item.relative_last_activity);
|
||||
}
|
||||
|
||||
summary
|
||||
}
|
||||
}
|
||||
|
||||
struct SiBytesAmount(u64);
|
||||
|
||||
impl std::fmt::Display for SiBytesAmount {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
if self.0 < 1024 {
|
||||
return write!(f, "{}B", self.0);
|
||||
}
|
||||
|
||||
let mut tmp = self.0;
|
||||
let mut ch = 0;
|
||||
let suffixes = b"KMGTPE";
|
||||
|
||||
while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
|
||||
tmp /= 1024;
|
||||
ch += 1;
|
||||
}
|
||||
|
||||
let ch = suffixes[ch] as char;
|
||||
|
||||
write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EvictionSummary {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// wasteful, but it's for testing
|
||||
|
||||
let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
|
||||
|
||||
for (tenant_shard_id, count) in &self.evicted_per_tenant {
|
||||
sorted
|
||||
.entry(count.count)
|
||||
.or_default()
|
||||
.push((*tenant_shard_id, count.file_sizes));
|
||||
}
|
||||
|
||||
let total_file_sizes = SiBytesAmount(self.total.file_sizes);
|
||||
|
||||
writeln!(
|
||||
f,
|
||||
"selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
|
||||
self.total.count, self.last_absolute, self.last_relative,
|
||||
)?;
|
||||
|
||||
for (count, per_tenant) in sorted.iter().rev().take(10) {
|
||||
write!(f, "- {count} layers: ")?;
|
||||
|
||||
if per_tenant.len() < 3 {
|
||||
for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
|
||||
if i > 0 {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
let bytes = SiBytesAmount(*bytes);
|
||||
write!(f, "{tenant_shard_id} ({bytes})")?;
|
||||
}
|
||||
} else {
|
||||
let num_tenants = per_tenant.len();
|
||||
let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
|
||||
let total_bytes = SiBytesAmount(total_bytes);
|
||||
let layers = num_tenants * count;
|
||||
|
||||
write!(
|
||||
f,
|
||||
"{num_tenants} tenants {total_bytes} in total {layers} layers",
|
||||
)?;
|
||||
}
|
||||
|
||||
writeln!(f)?;
|
||||
}
|
||||
|
||||
if sorted.len() > 10 {
|
||||
let (rem_count, rem_bytes) = sorted
|
||||
.iter()
|
||||
.rev()
|
||||
.map(|(count, per_tenant)| {
|
||||
(
|
||||
count,
|
||||
per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
|
||||
)
|
||||
})
|
||||
.fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
|
||||
let rem_bytes = SiBytesAmount(rem_bytes);
|
||||
writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mod filesystem_level_usage {
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
|
||||
@@ -1236,7 +1236,7 @@ async fn tenant_create_handler(
|
||||
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
TenantCreateResponse(new_tenant.tenant_id()),
|
||||
TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id),
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ use once_cell::sync::Lazy;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use strum::{EnumCount, IntoEnumIterator, VariantNames};
|
||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::id::TimelineId;
|
||||
|
||||
/// Prometheus histogram buckets (in seconds) for operations in the critical
|
||||
/// path. In other words, operations that directly affect that latency of user
|
||||
@@ -59,7 +59,7 @@ pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(||
|
||||
register_counter_vec!(
|
||||
"pageserver_storage_operations_seconds_sum",
|
||||
"Total time spent on storage operations with operation, tenant and timeline dimensions",
|
||||
&["operation", "tenant_id", "timeline_id"],
|
||||
&["operation", "tenant_id", "shard_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -68,7 +68,7 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::n
|
||||
register_int_counter_vec!(
|
||||
"pageserver_storage_operations_seconds_count",
|
||||
"Count of storage operations with operation, tenant and timeline dimensions",
|
||||
&["operation", "tenant_id", "timeline_id"],
|
||||
&["operation", "tenant_id", "shard_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -373,7 +373,7 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_last_record_lsn",
|
||||
"Last record LSN grouped by timeline",
|
||||
&["tenant_id", "timeline_id"]
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -382,7 +382,7 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_resident_physical_size",
|
||||
"The size of the layer files present in the pageserver's filesystem.",
|
||||
&["tenant_id", "timeline_id"]
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -400,7 +400,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
"pageserver_remote_physical_size",
|
||||
"The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
|
||||
// Corollary: If any files are missing from the index part, they won't be included here.
|
||||
&["tenant_id", "timeline_id"]
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -433,7 +433,7 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_current_logical_size",
|
||||
"Current logical size grouped by timeline",
|
||||
&["tenant_id", "timeline_id"]
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define current logical size metric")
|
||||
});
|
||||
@@ -582,7 +582,7 @@ pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_broken_tenants_count",
|
||||
"Set of broken tenants",
|
||||
&["tenant_id"]
|
||||
&["tenant_id", "shard_id"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
@@ -602,7 +602,7 @@ static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_created_persistent_files_total",
|
||||
"Number of files created that are meant to be uploaded to cloud storage",
|
||||
&["tenant_id", "timeline_id"]
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -611,7 +611,7 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_written_persistent_bytes_total",
|
||||
"Total bytes written that are meant to be uploaded to cloud storage",
|
||||
&["tenant_id", "timeline_id"]
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -630,7 +630,7 @@ static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_evictions",
|
||||
"Number of layers evicted from the pageserver",
|
||||
&["tenant_id", "timeline_id"]
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -927,7 +927,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_io_operations_bytes_total",
|
||||
"Total amount of bytes read/written in IO operations",
|
||||
&["operation", "tenant_id", "timeline_id"]
|
||||
&["operation", "tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -1002,7 +1002,7 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds",
|
||||
"Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
|
||||
&["smgr_query_type", "tenant_id", "timeline_id"],
|
||||
&["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
@@ -1069,8 +1069,9 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
});
|
||||
|
||||
impl SmgrQueryTimePerTimeline {
|
||||
pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let metrics = std::array::from_fn(|i| {
|
||||
let op = SmgrQueryType::from_repr(i).unwrap();
|
||||
@@ -1078,7 +1079,7 @@ impl SmgrQueryTimePerTimeline {
|
||||
.get_metric_with_label_values(&[op.into()])
|
||||
.unwrap();
|
||||
let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
|
||||
.get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
|
||||
.get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id])
|
||||
.unwrap();
|
||||
GlobalAndPerTimelineHistogram {
|
||||
global,
|
||||
@@ -1098,6 +1099,7 @@ impl SmgrQueryTimePerTimeline {
|
||||
|
||||
#[cfg(test)]
|
||||
mod smgr_query_time_tests {
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use strum::IntoEnumIterator;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
@@ -1124,7 +1126,10 @@ mod smgr_query_time_tests {
|
||||
for op in &ops {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
||||
let metrics = super::SmgrQueryTimePerTimeline::new(
|
||||
&TenantShardId::unsharded(tenant_id),
|
||||
&timeline_id,
|
||||
);
|
||||
|
||||
let get_counts = || {
|
||||
let global: u64 = ops
|
||||
@@ -1205,7 +1210,13 @@ static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::
|
||||
"Number of ongoing calls to remote timeline client. \
|
||||
Used to populate pageserver_remote_timeline_client_calls_started. \
|
||||
This metric is not useful for sampling from Prometheus, but useful in tests.",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
|
||||
&[
|
||||
"tenant_id",
|
||||
"shard_id",
|
||||
"timeline_id",
|
||||
"file_kind",
|
||||
"op_kind"
|
||||
],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -1226,22 +1237,23 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
|
||||
Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_remote_timeline_client_bytes_started",
|
||||
"Incremented by the number of bytes associated with a remote timeline client operation. \
|
||||
The increment happens when the operation is scheduled.",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
|
||||
&["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_remote_timeline_client_bytes_finished",
|
||||
"Incremented by the number of bytes associated with a remote timeline client operation. \
|
||||
The increment happens when the operation finishes (regardless of success/failure/shutdown).",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
|
||||
&["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -1687,14 +1699,19 @@ pub(crate) struct StorageTimeMetrics {
|
||||
}
|
||||
|
||||
impl StorageTimeMetrics {
|
||||
pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self {
|
||||
pub fn new(
|
||||
operation: StorageTimeOperation,
|
||||
tenant_id: &str,
|
||||
shard_id: &str,
|
||||
timeline_id: &str,
|
||||
) -> Self {
|
||||
let operation: &'static str = operation.into();
|
||||
|
||||
let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
|
||||
.get_metric_with_label_values(&[operation, tenant_id, timeline_id])
|
||||
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
|
||||
.unwrap();
|
||||
let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
|
||||
.get_metric_with_label_values(&[operation, tenant_id, timeline_id])
|
||||
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
|
||||
.unwrap();
|
||||
let global_histogram = STORAGE_TIME_GLOBAL
|
||||
.get_metric_with_label_values(&[operation])
|
||||
@@ -1746,40 +1763,66 @@ impl TimelineMetrics {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = format!("{}", tenant_shard_id.shard_slug());
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let flush_time_histo =
|
||||
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
|
||||
let compact_time_histo =
|
||||
StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id);
|
||||
let create_images_time_histo =
|
||||
StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id);
|
||||
let logical_size_histo =
|
||||
StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id);
|
||||
let flush_time_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::LayerFlush,
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let compact_time_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::Compact,
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let create_images_time_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::CreateImages,
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let logical_size_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::LogicalSize,
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let imitate_logical_size_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::ImitateLogicalSize,
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let load_layer_map_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::LoadLayerMap,
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let garbage_collect_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::Gc,
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let load_layer_map_histo =
|
||||
StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id);
|
||||
let garbage_collect_histo =
|
||||
StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id);
|
||||
let last_record_gauge = LAST_RECORD_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
// TODO: we shouldn't expose this metric
|
||||
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions = EVICTIONS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
|
||||
.build(&tenant_id, &shard_id, &timeline_id);
|
||||
@@ -1833,15 +1876,17 @@ impl Drop for TimelineMetrics {
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
{
|
||||
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ =
|
||||
RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
}
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
let _ =
|
||||
NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
|
||||
self.evictions_with_low_residence_duration
|
||||
.write()
|
||||
@@ -1854,29 +1899,42 @@ impl Drop for TimelineMetrics {
|
||||
// outlive an individual smgr connection, but not the timeline.
|
||||
|
||||
for op in StorageTimeOperation::VARIANTS {
|
||||
let _ =
|
||||
STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
let _ =
|
||||
STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[
|
||||
op,
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[
|
||||
op,
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
}
|
||||
|
||||
for op in STORAGE_IO_SIZE_OPERATIONS {
|
||||
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
|
||||
}
|
||||
|
||||
for op in SmgrQueryType::iter() {
|
||||
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
|
||||
op.into(),
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
|
||||
let tid = tenant_id.to_string();
|
||||
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
|
||||
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
|
||||
// Only shard zero deals in synthetic sizes
|
||||
if tenant_shard_id.is_zero() {
|
||||
let tid = tenant_shard_id.tenant_id.to_string();
|
||||
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
|
||||
}
|
||||
|
||||
// we leave the BROKEN_TENANTS_SET entry if any
|
||||
}
|
||||
|
||||
@@ -1926,6 +1984,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
|
||||
|
||||
pub(crate) struct RemoteTimelineClientMetrics {
|
||||
tenant_id: String,
|
||||
shard_id: String,
|
||||
timeline_id: String,
|
||||
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
|
||||
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
|
||||
@@ -1937,6 +1996,7 @@ impl RemoteTimelineClientMetrics {
|
||||
pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
|
||||
RemoteTimelineClientMetrics {
|
||||
tenant_id: tenant_shard_id.tenant_id.to_string(),
|
||||
shard_id: format!("{}", tenant_shard_id.shard_slug()),
|
||||
timeline_id: timeline_id.to_string(),
|
||||
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||
@@ -1951,8 +2011,9 @@ impl RemoteTimelineClientMetrics {
|
||||
PerTimelineRemotePhysicalSizeGauge::new(
|
||||
REMOTE_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
&self.tenant_id,
|
||||
&self.shard_id,
|
||||
&self.timeline_id,
|
||||
])
|
||||
.unwrap(),
|
||||
)
|
||||
@@ -1987,8 +2048,9 @@ impl RemoteTimelineClientMetrics {
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
&self.tenant_id,
|
||||
&self.shard_id,
|
||||
&self.timeline_id,
|
||||
key.0,
|
||||
key.1,
|
||||
])
|
||||
@@ -2018,8 +2080,9 @@ impl RemoteTimelineClientMetrics {
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
&self.tenant_id,
|
||||
&self.shard_id,
|
||||
&self.timeline_id,
|
||||
key.0,
|
||||
key.1,
|
||||
])
|
||||
@@ -2038,8 +2101,9 @@ impl RemoteTimelineClientMetrics {
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
&self.tenant_id,
|
||||
&self.shard_id,
|
||||
&self.timeline_id,
|
||||
key.0,
|
||||
key.1,
|
||||
])
|
||||
@@ -2183,6 +2247,7 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
fn drop(&mut self) {
|
||||
let RemoteTimelineClientMetrics {
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
remote_physical_size_gauge,
|
||||
calls_unfinished_gauge,
|
||||
@@ -2192,6 +2257,7 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
@@ -2200,6 +2266,7 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
@@ -2208,6 +2275,7 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
@@ -2215,7 +2283,7 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
}
|
||||
{
|
||||
let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
|
||||
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2225,8 +2293,6 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
pub(crate) trait MeasureRemoteOp: Sized {
|
||||
fn measure_remote_op(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
file_kind: RemoteOpFileKind,
|
||||
op: RemoteOpKind,
|
||||
metrics: Arc<RemoteTimelineClientMetrics>,
|
||||
@@ -2234,8 +2300,6 @@ pub(crate) trait MeasureRemoteOp: Sized {
|
||||
let start = Instant::now();
|
||||
MeasuredRemoteOp {
|
||||
inner: self,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
file_kind,
|
||||
op,
|
||||
start,
|
||||
@@ -2251,8 +2315,6 @@ pin_project! {
|
||||
{
|
||||
#[pin]
|
||||
inner: F,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
file_kind: RemoteOpFileKind,
|
||||
op: RemoteOpKind,
|
||||
start: Instant,
|
||||
|
||||
@@ -384,11 +384,17 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/// Analogous to calling cancelled() on a Timeline's cancellation token: waits for cancellation.
|
||||
/// Future that completes when we need to shut down the connection.
|
||||
///
|
||||
/// We use many Timeline objects, and hold GateGuards on all of them. We must therefore respect
|
||||
/// all of their cancellation tokens.
|
||||
async fn timeline_cancelled(&self) {
|
||||
/// Reasons for need to shut down are:
|
||||
/// - any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
|
||||
/// - task_mgr requests shutdown of the connection
|
||||
///
|
||||
/// The need to check for `task_mgr` cancellation arises mainly from `handle_pagerequests`
|
||||
/// where, at first, `shard_timelines` is empty, see <https://github.com/neondatabase/neon/pull/6388>
|
||||
///
|
||||
/// NB: keep in sync with [`Self::is_connection_cancelled`]
|
||||
async fn await_connection_cancelled(&self) {
|
||||
// A short wait before we expend the cycles to walk our timeline map. This avoids incurring
|
||||
// that cost every time we check for cancellation.
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
@@ -404,14 +410,19 @@ impl PageServerHandler {
|
||||
.map(|ht| ht.timeline.cancel.cancelled())
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
futs.next().await;
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => { }
|
||||
_ = futs.next() => {}
|
||||
}
|
||||
}
|
||||
|
||||
/// Analogous to calling is_cancelled() on a Timeline's cancellation token
|
||||
fn timeline_is_cancelled(&self) -> bool {
|
||||
self.shard_timelines
|
||||
.values()
|
||||
.any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping())
|
||||
/// Checking variant of [`Self::await_connection_cancelled`].
|
||||
fn is_connection_cancelled(&self) -> bool {
|
||||
task_mgr::is_shutdown_requested()
|
||||
|| self
|
||||
.shard_timelines
|
||||
.values()
|
||||
.any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping())
|
||||
}
|
||||
|
||||
/// This function always respects cancellation of any timeline in `[Self::shard_timelines]`. Pass in
|
||||
@@ -432,7 +443,7 @@ impl PageServerHandler {
|
||||
flush_r = pgb.flush() => {
|
||||
Ok(flush_r?)
|
||||
},
|
||||
_ = self.timeline_cancelled() => {
|
||||
_ = self.await_connection_cancelled() => {
|
||||
Err(QueryError::Shutdown)
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
@@ -545,13 +556,11 @@ impl PageServerHandler {
|
||||
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
||||
self.flush_cancellable(pgb, &tenant.cancel).await?;
|
||||
|
||||
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
||||
|
||||
loop {
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = self.timeline_cancelled() => {
|
||||
_ = self.await_connection_cancelled() => {
|
||||
// We were requested to shut down.
|
||||
info!("shutdown request received in page handler");
|
||||
return Err(QueryError::Shutdown)
|
||||
@@ -585,7 +594,6 @@ impl PageServerHandler {
|
||||
|
||||
let (response, span) = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
|
||||
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn);
|
||||
(
|
||||
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -595,7 +603,6 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
|
||||
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn);
|
||||
(
|
||||
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -605,7 +612,6 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::GetPage(req) => {
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
|
||||
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
|
||||
(
|
||||
self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -615,7 +621,6 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
|
||||
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn);
|
||||
(
|
||||
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -638,7 +643,7 @@ impl PageServerHandler {
|
||||
span.in_scope(|| info!("handler requested reconnect: {reason}"));
|
||||
return Err(QueryError::Reconnect);
|
||||
}
|
||||
Err(e) if self.timeline_is_cancelled() => {
|
||||
Err(e) if self.is_connection_cancelled() => {
|
||||
// This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean
|
||||
// shutdown error, this may be buried inside a PageReconstructError::Other for example.
|
||||
//
|
||||
@@ -865,6 +870,9 @@ impl PageServerHandler {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetRelExists);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
@@ -888,6 +896,11 @@ impl PageServerHandler {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetRelSize);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
@@ -910,6 +923,11 @@ impl PageServerHandler {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetDbSize);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
@@ -1080,6 +1098,10 @@ impl PageServerHandler {
|
||||
}
|
||||
};
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
|
||||
@@ -18,13 +18,16 @@ use enumset::EnumSet;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::FutureExt;
|
||||
use futures::StreamExt;
|
||||
use nix::unistd::Pid;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::signal::unix::Signal;
|
||||
use std::fmt;
|
||||
use std::os::unix::process::CommandExt;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::io::BufReader;
|
||||
use tokio::runtime::Handle;
|
||||
@@ -112,7 +115,7 @@ use toml_edit;
|
||||
use utils::{
|
||||
crashsafe,
|
||||
generation::Generation,
|
||||
id::{TenantId, TimelineId},
|
||||
id::TimelineId,
|
||||
lsn::{Lsn, RecordLsn},
|
||||
};
|
||||
|
||||
@@ -371,13 +374,13 @@ impl WalRedoManager {
|
||||
pub enum GetTimelineError {
|
||||
#[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
|
||||
NotActive {
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
state: TimelineState,
|
||||
},
|
||||
#[error("Timeline {tenant_id}/{timeline_id} was not found")]
|
||||
NotFound {
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
},
|
||||
}
|
||||
@@ -1517,10 +1520,6 @@ impl Tenant {
|
||||
.map_err(LoadLocalTimelineError::Load)
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_id(&self) -> TenantId {
|
||||
self.tenant_shard_id.tenant_id
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
|
||||
self.tenant_shard_id
|
||||
}
|
||||
@@ -1536,13 +1535,13 @@ impl Tenant {
|
||||
let timeline = timelines_accessor
|
||||
.get(&timeline_id)
|
||||
.ok_or(GetTimelineError::NotFound {
|
||||
tenant_id: self.tenant_shard_id.tenant_id,
|
||||
tenant_id: self.tenant_shard_id,
|
||||
timeline_id,
|
||||
})?;
|
||||
|
||||
if active_only && !timeline.is_active() {
|
||||
Err(GetTimelineError::NotActive {
|
||||
tenant_id: self.tenant_shard_id.tenant_id,
|
||||
tenant_id: self.tenant_shard_id,
|
||||
timeline_id,
|
||||
state: timeline.current_state(),
|
||||
})
|
||||
@@ -2597,7 +2596,9 @@ impl Tenant {
|
||||
let (state, mut rx) = watch::channel(state);
|
||||
|
||||
tokio::spawn(async move {
|
||||
// Strings for metric labels
|
||||
let tid = tenant_shard_id.to_string();
|
||||
let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
|
||||
|
||||
fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
|
||||
([state.into()], matches!(state, TenantState::Broken { .. }))
|
||||
@@ -2610,13 +2611,15 @@ impl Tenant {
|
||||
// the tenant might be ignored and reloaded, so first remove any previous set
|
||||
// element. it most likely has already been scraped, as these are manual operations
|
||||
// right now. most likely we will add it back very soon.
|
||||
drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid]));
|
||||
drop(
|
||||
crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]),
|
||||
);
|
||||
false
|
||||
} else {
|
||||
// add the id to the set right away, there should not be any updates on the channel
|
||||
// after
|
||||
crate::metrics::BROKEN_TENANTS_SET
|
||||
.with_label_values(&[&tid])
|
||||
.with_label_values(&[&tid, &shard_id_str])
|
||||
.set(1);
|
||||
true
|
||||
};
|
||||
@@ -2642,7 +2645,7 @@ impl Tenant {
|
||||
counted_broken = true;
|
||||
// insert the tenant_id (back) into the set
|
||||
crate::metrics::BROKEN_TENANTS_SET
|
||||
.with_label_values(&[&tid])
|
||||
.with_label_values(&[&tid, &shard_id_str])
|
||||
.inc();
|
||||
}
|
||||
}
|
||||
@@ -3290,7 +3293,7 @@ impl Tenant {
|
||||
}
|
||||
// this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
|
||||
scopeguard::defer! {
|
||||
if let Err(e) = utils::fs_ext::remove_dir_all(&pgdata_path) {
|
||||
if let Err(e) = fs::remove_dir_all(&pgdata_path) {
|
||||
// this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call
|
||||
error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}");
|
||||
}
|
||||
@@ -3629,6 +3632,9 @@ impl Tenant {
|
||||
self.cached_synthetic_tenant_size
|
||||
.store(size, Ordering::Relaxed);
|
||||
|
||||
// Only shard zero should be calculating synthetic sizes
|
||||
debug_assert!(self.shard_identity.is_zero());
|
||||
|
||||
TENANT_SYNTHETIC_SIZE_METRIC
|
||||
.get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
|
||||
.unwrap()
|
||||
@@ -3743,7 +3749,17 @@ async fn run_initdb(
|
||||
|
||||
let _permit = INIT_DB_SEMAPHORE.acquire().await;
|
||||
|
||||
let initdb_command = tokio::process::Command::new(&initdb_bin_path)
|
||||
let mut initdb_command_std = std::process::Command::new(&initdb_bin_path);
|
||||
// The process_group function is unstable as tokio's MSRV is 1.63,
|
||||
// and process_group was stabilized in 1.64. This is the officially
|
||||
// recommended workaround.
|
||||
// Setting pgroup to 0 makes the pgroupid be that of the child, as explained in
|
||||
// https://github.com/microsoft/WSL/issues/2997 (unrelated bug, but explains it)
|
||||
// We use need the pgid to be set for pkill to work during cancellation, to also
|
||||
// get the child processes of initdb.
|
||||
initdb_command_std.process_group(0);
|
||||
|
||||
let mut initdb_command = tokio::process::Command::from(initdb_command_std)
|
||||
.args(["-D", initdb_target_dir.as_ref()])
|
||||
.args(["-U", &conf.superuser])
|
||||
.args(["-E", "utf8"])
|
||||
@@ -3764,13 +3780,25 @@ async fn run_initdb(
|
||||
.spawn()?;
|
||||
|
||||
tokio::select! {
|
||||
initdb_output = initdb_command.wait_with_output() => {
|
||||
let initdb_output = initdb_output?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr));
|
||||
exit_status = initdb_command.wait() => {
|
||||
let exit_status = exit_status?;
|
||||
if !exit_status.success() {
|
||||
let mut stderr = initdb_command.stderr.take().unwrap();
|
||||
let mut stderr_vec = Vec::new();
|
||||
tokio::io::copy(&mut stderr, &mut stderr_vec).await?;
|
||||
return Err(InitdbError::Failed(exit_status, stderr_vec));
|
||||
}
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
if let Some(pid) = initdb_command.id() {
|
||||
warn!("Doing killpg...");
|
||||
nix::sys::signal::killpg(Pid::from_raw(pid as i32), Signal::SIGKILL)
|
||||
.map_err(|e| InitdbError::Other(anyhow::anyhow!(e)))?;
|
||||
initdb_command.wait().await?;
|
||||
} else {
|
||||
warn!("Couldn't obtain initdb pid, killing initdb process only.");
|
||||
initdb_command.kill().await?;
|
||||
}
|
||||
return Err(InitdbError::Cancelled);
|
||||
}
|
||||
}
|
||||
@@ -3780,7 +3808,7 @@ async fn run_initdb(
|
||||
|
||||
impl Drop for Tenant {
|
||||
fn drop(&mut self) {
|
||||
remove_tenant_metrics(&self.tenant_shard_id.tenant_id);
|
||||
remove_tenant_metrics(&self.tenant_shard_id);
|
||||
}
|
||||
}
|
||||
/// Dump contents of a layer file to stdout.
|
||||
@@ -5208,7 +5236,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
e,
|
||||
GetTimelineError::NotFound {
|
||||
tenant_id: tenant.tenant_shard_id.tenant_id,
|
||||
tenant_id: tenant.tenant_shard_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -847,15 +847,13 @@ impl TenantManager {
|
||||
TenantState::Active => Ok(Arc::clone(tenant)),
|
||||
_ => {
|
||||
if active_only {
|
||||
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
|
||||
Err(GetTenantError::NotActive(tenant_shard_id))
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
}
|
||||
},
|
||||
Some(TenantSlot::InProgress(_)) => {
|
||||
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
|
||||
}
|
||||
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
|
||||
None | Some(TenantSlot::Secondary(_)) => {
|
||||
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
|
||||
}
|
||||
@@ -1306,10 +1304,13 @@ impl TenantManager {
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum GetTenantError {
|
||||
/// NotFound is a TenantId rather than TenantShardId, because this error type is used from
|
||||
/// getters that use a TenantId and a ShardSelector, not just getters that target a specific shard.
|
||||
#[error("Tenant {0} not found")]
|
||||
NotFound(TenantId),
|
||||
|
||||
#[error("Tenant {0} is not active")]
|
||||
NotActive(TenantId),
|
||||
NotActive(TenantShardId),
|
||||
/// Broken is logically a subset of NotActive, but a distinct error is useful as
|
||||
/// NotActive is usually a retryable state for API purposes, whereas Broken
|
||||
/// is a stuck error state
|
||||
@@ -1342,15 +1343,13 @@ pub(crate) fn get_tenant(
|
||||
TenantState::Active => Ok(Arc::clone(tenant)),
|
||||
_ => {
|
||||
if active_only {
|
||||
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
|
||||
Err(GetTenantError::NotActive(tenant_shard_id))
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
}
|
||||
},
|
||||
Some(TenantSlot::InProgress(_)) => {
|
||||
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
|
||||
}
|
||||
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
|
||||
None | Some(TenantSlot::Secondary(_)) => {
|
||||
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
|
||||
}
|
||||
@@ -1426,7 +1425,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
|
||||
}
|
||||
Some(TenantSlot::Secondary(_)) => {
|
||||
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
|
||||
tenant_id,
|
||||
tenant_shard_id,
|
||||
)))
|
||||
}
|
||||
Some(TenantSlot::InProgress(barrier)) => {
|
||||
@@ -1465,7 +1464,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
|
||||
Some(TenantSlot::Attached(tenant)) => tenant.clone(),
|
||||
_ => {
|
||||
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
|
||||
tenant_id,
|
||||
tenant_shard_id,
|
||||
)))
|
||||
}
|
||||
}
|
||||
@@ -1493,7 +1492,7 @@ pub(crate) enum DeleteTimelineError {
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum TenantStateError {
|
||||
#[error("Tenant {0} is stopping")]
|
||||
IsStopping(TenantId),
|
||||
IsStopping(TenantShardId),
|
||||
#[error(transparent)]
|
||||
SlotError(#[from] TenantSlotError),
|
||||
#[error(transparent)]
|
||||
@@ -2123,7 +2122,7 @@ where
|
||||
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
|
||||
// wait for it but return an error right away because these are distinct requests.
|
||||
slot_guard.revert();
|
||||
return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
|
||||
return Err(TenantStateError::IsStopping(tenant_shard_id));
|
||||
}
|
||||
}
|
||||
Some(tenant)
|
||||
@@ -2252,7 +2251,6 @@ pub(crate) async fn immediate_gc(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::Arc;
|
||||
use tracing::{info_span, Instrument};
|
||||
@@ -2273,7 +2271,7 @@ mod tests {
|
||||
|
||||
// harness loads it to active, which is forced and nothing is running on the tenant
|
||||
|
||||
let id = TenantShardId::unsharded(t.tenant_id());
|
||||
let id = t.tenant_shard_id();
|
||||
|
||||
// tenant harness configures the logging and we cannot escape it
|
||||
let _e = info_span!("testing", tenant_id = %id).entered();
|
||||
|
||||
@@ -522,8 +522,6 @@ impl RemoteTimelineClient {
|
||||
cancel,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_shard_id.tenant_id,
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Index,
|
||||
RemoteOpKind::Download,
|
||||
Arc::clone(&self.metrics),
|
||||
@@ -566,8 +564,6 @@ impl RemoteTimelineClient {
|
||||
cancel,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_shard_id.tenant_id,
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Download,
|
||||
Arc::clone(&self.metrics),
|
||||
@@ -1351,8 +1347,6 @@ impl RemoteTimelineClient {
|
||||
&self.cancel,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_shard_id.tenant_id,
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
Arc::clone(&self.metrics),
|
||||
@@ -1378,8 +1372,6 @@ impl RemoteTimelineClient {
|
||||
&self.cancel,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_shard_id.tenant_id,
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Index,
|
||||
RemoteOpKind::Upload,
|
||||
Arc::clone(&self.metrics),
|
||||
|
||||
@@ -252,6 +252,10 @@ pub struct Timeline {
|
||||
|
||||
pub(super) metrics: TimelineMetrics,
|
||||
|
||||
// `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code
|
||||
// in `crate::page_service` writes these metrics.
|
||||
pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,
|
||||
|
||||
/// Ensures layers aren't frozen by checkpointer between
|
||||
/// [`Timeline::get_layer_for_write`] and layer reads.
|
||||
/// Locked automatically by [`TimelineWriter`] and checkpointer.
|
||||
@@ -1315,6 +1319,11 @@ impl Timeline {
|
||||
),
|
||||
),
|
||||
|
||||
query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
|
||||
&tenant_shard_id,
|
||||
&timeline_id,
|
||||
),
|
||||
|
||||
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
|
||||
|
||||
layer_flush_start_tx,
|
||||
|
||||
@@ -14,6 +14,7 @@ use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC
|
||||
use crate::tenant::TENANTS_SEGMENT_NAME;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||
use std::os::unix::fs::FileExt;
|
||||
@@ -60,6 +61,7 @@ pub struct VirtualFile {
|
||||
// It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
|
||||
// strings.
|
||||
tenant_id: String,
|
||||
shard_id: String,
|
||||
timeline_id: String,
|
||||
}
|
||||
|
||||
@@ -301,15 +303,24 @@ impl VirtualFile {
|
||||
) -> Result<VirtualFile, std::io::Error> {
|
||||
let path_str = path.to_string();
|
||||
let parts = path_str.split('/').collect::<Vec<&str>>();
|
||||
let tenant_id;
|
||||
let timeline_id;
|
||||
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
|
||||
tenant_id = parts[parts.len() - 4].to_string();
|
||||
timeline_id = parts[parts.len() - 2].to_string();
|
||||
} else {
|
||||
tenant_id = "*".to_string();
|
||||
timeline_id = "*".to_string();
|
||||
}
|
||||
let (tenant_id, shard_id, timeline_id) =
|
||||
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
|
||||
let tenant_shard_part = parts[parts.len() - 4];
|
||||
let (tenant_id, shard_id) = match tenant_shard_part.parse::<TenantShardId>() {
|
||||
Ok(tenant_shard_id) => (
|
||||
tenant_shard_id.tenant_id.to_string(),
|
||||
format!("{}", tenant_shard_id.shard_slug()),
|
||||
),
|
||||
Err(_) => {
|
||||
// Malformed path: this ID is just for observability, so tolerate it
|
||||
// and pass through
|
||||
(tenant_shard_part.to_string(), "*".to_string())
|
||||
}
|
||||
};
|
||||
(tenant_id, shard_id, parts[parts.len() - 2].to_string())
|
||||
} else {
|
||||
("*".to_string(), "*".to_string(), "*".to_string())
|
||||
};
|
||||
let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
|
||||
|
||||
// NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
|
||||
@@ -333,6 +344,7 @@ impl VirtualFile {
|
||||
path: path.to_path_buf(),
|
||||
open_options: reopen_options,
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
};
|
||||
|
||||
@@ -574,7 +586,7 @@ impl VirtualFile {
|
||||
.read_at(buf, offset));
|
||||
if let Ok(size) = result {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&["read", &self.tenant_id, &self.timeline_id])
|
||||
.with_label_values(&["read", &self.tenant_id, &self.shard_id, &self.timeline_id])
|
||||
.add(size as i64);
|
||||
}
|
||||
result
|
||||
@@ -586,7 +598,7 @@ impl VirtualFile {
|
||||
.write_at(buf, offset));
|
||||
if let Ok(size) = result {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&["write", &self.tenant_id, &self.timeline_id])
|
||||
.with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id])
|
||||
.add(size as i64);
|
||||
}
|
||||
result
|
||||
|
||||
@@ -102,7 +102,9 @@ impl WalIngest {
|
||||
buf.advance(decoded.main_data_offset);
|
||||
|
||||
assert!(!self.checkpoint_modified);
|
||||
if self.checkpoint.update_next_xid(decoded.xl_xid) {
|
||||
if decoded.xl_xid != pg_constants::INVALID_TRANSACTION_ID
|
||||
&& self.checkpoint.update_next_xid(decoded.xl_xid)
|
||||
{
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
|
||||
@@ -330,8 +332,13 @@ impl WalIngest {
|
||||
< 0
|
||||
{
|
||||
self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
|
||||
// Write a new checkpoint key-value pair on every checkpoint record, even
|
||||
// if nothing really changed. Not strictly required, but it seems nice to
|
||||
// have some trace of the checkpoint records in the layer files at the same
|
||||
// LSNs.
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
}
|
||||
pg_constants::RM_LOGICALMSG_ID => {
|
||||
@@ -2201,7 +2208,8 @@ mod tests {
|
||||
let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID);
|
||||
let remote_initdb_path =
|
||||
remote_initdb_archive_path(&tenant.tenant_shard_id().tenant_id, &TIMELINE_ID);
|
||||
let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path());
|
||||
|
||||
std::fs::create_dir_all(initdb_path.parent().unwrap())
|
||||
|
||||
@@ -308,13 +308,13 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
Assert(victim->access_count == 0);
|
||||
#ifdef FALLOC_FL_PUNCH_HOLE
|
||||
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
|
||||
elog(LOG, "Failed to punch hole in file: %m");
|
||||
neon_log(LOG, "Failed to punch hole in file: %m");
|
||||
#endif
|
||||
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
|
||||
lfc_ctl->used -= 1;
|
||||
}
|
||||
lfc_ctl->limit = new_size;
|
||||
elog(DEBUG1, "set local file cache limit to %d", new_size);
|
||||
neon_log(DEBUG1, "set local file cache limit to %d", new_size);
|
||||
|
||||
LWLockRelease(lfc_lock);
|
||||
}
|
||||
@@ -327,7 +327,7 @@ lfc_init(void)
|
||||
* shared_preload_libraries.
|
||||
*/
|
||||
if (!process_shared_preload_libraries_in_progress)
|
||||
elog(ERROR, "Neon module should be loaded via shared_preload_libraries");
|
||||
neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries");
|
||||
|
||||
|
||||
DefineCustomIntVariable("neon.max_file_cache_size",
|
||||
@@ -643,7 +643,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
Assert(victim->access_count == 0);
|
||||
entry->offset = victim->offset; /* grab victim's chunk */
|
||||
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
|
||||
elog(DEBUG2, "Swap file cache page");
|
||||
neon_log(DEBUG2, "Swap file cache page");
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -846,10 +846,10 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
||||
* wrong) function definition though.
|
||||
*/
|
||||
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
|
||||
elog(ERROR, "return type must be a row type");
|
||||
neon_log(ERROR, "return type must be a row type");
|
||||
|
||||
if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM)
|
||||
elog(ERROR, "incorrect number of output arguments");
|
||||
neon_log(ERROR, "incorrect number of output arguments");
|
||||
|
||||
/* Construct a tuple descriptor for the result rows. */
|
||||
tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
|
||||
|
||||
@@ -990,7 +990,7 @@ nm_pack_request(NeonRequest *msg)
|
||||
case T_NeonErrorResponse:
|
||||
case T_NeonDbSizeResponse:
|
||||
default:
|
||||
elog(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
|
||||
neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
|
||||
break;
|
||||
}
|
||||
return s;
|
||||
@@ -1085,7 +1085,7 @@ nm_unpack_response(StringInfo s)
|
||||
case T_NeonGetPageRequest:
|
||||
case T_NeonDbSizeRequest:
|
||||
default:
|
||||
elog(ERROR, "unexpected neon message tag 0x%02x", tag);
|
||||
neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1277,7 +1277,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
XLogFlush(recptr);
|
||||
lsn = recptr;
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||
@@ -1305,7 +1305,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
if (PageIsNew((Page) buffer))
|
||||
{
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum)));
|
||||
@@ -1313,7 +1313,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
else if (PageIsEmptyHeapPage((Page) buffer))
|
||||
{
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum)));
|
||||
@@ -1321,7 +1321,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
else
|
||||
{
|
||||
ereport(PANIC,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum)));
|
||||
@@ -1330,7 +1330,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
else
|
||||
{
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||
@@ -1430,7 +1430,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
|
||||
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
|
||||
lsn = nm_adjust_lsn(lsn);
|
||||
|
||||
elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
|
||||
neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
|
||||
(uint32) ((lsn) >> 32), (uint32) (lsn));
|
||||
}
|
||||
else
|
||||
@@ -1445,7 +1445,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
|
||||
*latest = true;
|
||||
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
|
||||
Assert(lsn != InvalidXLogRecPtr);
|
||||
elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
|
||||
neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
|
||||
(uint32) ((lsn) >> 32), (uint32) (lsn));
|
||||
|
||||
lsn = nm_adjust_lsn(lsn);
|
||||
@@ -1465,7 +1465,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
|
||||
#endif
|
||||
if (lsn > flushlsn)
|
||||
{
|
||||
elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
|
||||
neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
|
||||
(uint32) (lsn >> 32), (uint32) lsn,
|
||||
(uint32) (flushlsn >> 32), (uint32) flushlsn);
|
||||
XLogFlush(lsn);
|
||||
@@ -1509,7 +1509,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
return mdexists(reln, forkNum);
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
|
||||
@@ -1561,7 +1561,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||
@@ -1570,7 +1570,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
break;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
pfree(resp);
|
||||
return exists;
|
||||
@@ -1587,7 +1587,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
break;
|
||||
@@ -1598,10 +1598,10 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
elog(SmgrTrace, "Create relation %u/%u/%u.%u",
|
||||
neon_log(SmgrTrace, "Create relation %u/%u/%u.%u",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum);
|
||||
|
||||
@@ -1696,7 +1696,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
break;
|
||||
@@ -1707,7 +1707,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1745,7 +1745,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
|
||||
|
||||
lsn = PageGetLSN((Page) buffer);
|
||||
elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||
neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum, blkno,
|
||||
(uint32) (lsn >> 32), (uint32) lsn);
|
||||
@@ -1785,7 +1785,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
break;
|
||||
@@ -1796,7 +1796,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (max_cluster_size > 0 &&
|
||||
@@ -1808,7 +1808,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DISK_FULL),
|
||||
errmsg("could not extend file because cluster size limit (%d MB) has been exceeded",
|
||||
errmsg("could not extend file because project size limit (%d MB) has been exceeded",
|
||||
max_cluster_size),
|
||||
errhint("This limit is defined by neon.max_cluster_size GUC")));
|
||||
}
|
||||
@@ -1821,7 +1821,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
||||
errmsg("cannot extend file \"%s\" beyond %u blocks",
|
||||
errmsg(NEON_TAG "cannot extend file \"%s\" beyond %u blocks",
|
||||
relpath(reln->smgr_rlocator, forkNum),
|
||||
InvalidBlockNumber)));
|
||||
|
||||
@@ -1882,7 +1882,7 @@ neon_open(SMgrRelation reln)
|
||||
mdopen(reln);
|
||||
|
||||
/* no work */
|
||||
elog(SmgrTrace, "[NEON_SMGR] open noop");
|
||||
neon_log(SmgrTrace, "open noop");
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1919,7 +1919,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
return mdprefetch(reln, forknum, blocknum);
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
|
||||
@@ -1964,11 +1964,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
/* not implemented */
|
||||
elog(SmgrTrace, "[NEON_SMGR] writeback noop");
|
||||
neon_log(SmgrTrace, "writeback noop");
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -2098,7 +2098,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
errmsg(NEON_TAG "could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
blkno,
|
||||
RelFileInfoFmt(rinfo),
|
||||
forkNum,
|
||||
@@ -2107,7 +2107,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
|
||||
/* buffer was used, clean up for later reuse */
|
||||
@@ -2131,7 +2131,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrread() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
break;
|
||||
@@ -2142,7 +2142,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
/* Try to read from local file cache */
|
||||
@@ -2170,7 +2170,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
{
|
||||
if (!PageIsNew((Page) pageserver_masked))
|
||||
{
|
||||
elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
||||
neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
||||
blkno,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum,
|
||||
@@ -2180,7 +2180,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
}
|
||||
else if (PageIsNew((Page) buffer))
|
||||
{
|
||||
elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
||||
neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
||||
blkno,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum,
|
||||
@@ -2195,7 +2195,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
|
||||
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
|
||||
{
|
||||
elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
||||
neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
||||
blkno,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum,
|
||||
@@ -2214,7 +2214,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
|
||||
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
|
||||
{
|
||||
elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
||||
neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
||||
blkno,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum,
|
||||
@@ -2294,13 +2294,13 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
neon_wallog_page(reln, forknum, blocknum, buffer, false);
|
||||
|
||||
lsn = PageGetLSN((Page) buffer);
|
||||
elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||
neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, blocknum,
|
||||
(uint32) (lsn >> 32), (uint32) lsn);
|
||||
@@ -2327,7 +2327,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
@@ -2338,12 +2338,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
return mdnblocks(reln, forknum);
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
|
||||
{
|
||||
elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
|
||||
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, n_blocks);
|
||||
return n_blocks;
|
||||
@@ -2371,7 +2371,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||
@@ -2380,11 +2380,11 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
break;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
|
||||
|
||||
elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
|
||||
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||
@@ -2427,7 +2427,7 @@ neon_dbsize(Oid dbNode)
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg("could not read db size of db %u from page server at lsn %X/%08X",
|
||||
errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
|
||||
dbNode,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||
errdetail("page server returned error: %s",
|
||||
@@ -2435,10 +2435,10 @@ neon_dbsize(Oid dbNode)
|
||||
break;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
|
||||
elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||
dbNode,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||
db_size);
|
||||
@@ -2458,7 +2458,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
@@ -2470,7 +2470,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
|
||||
@@ -2526,7 +2526,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
@@ -2538,10 +2538,10 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
elog(SmgrTrace, "[NEON_SMGR] immedsync noop");
|
||||
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -2566,17 +2566,17 @@ neon_start_unlogged_build(SMgrRelation reln)
|
||||
* progress at a time. That's enough for the current usage.
|
||||
*/
|
||||
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
|
||||
elog(ERROR, "unlogged relation build is already in progress");
|
||||
neon_log(ERROR, "unlogged relation build is already in progress");
|
||||
Assert(unlogged_build_rel == NULL);
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("starting unlogged build of relation %u/%u/%u",
|
||||
(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
@@ -2589,11 +2589,11 @@ neon_start_unlogged_build(SMgrRelation reln)
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
|
||||
elog(ERROR, "cannot perform unlogged index build, index is not empty ");
|
||||
neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
|
||||
|
||||
unlogged_build_rel = reln;
|
||||
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
|
||||
@@ -2620,7 +2620,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
|
||||
Assert(unlogged_build_rel == reln);
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
|
||||
(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
|
||||
|
||||
if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
|
||||
@@ -2649,7 +2649,7 @@ neon_end_unlogged_build(SMgrRelation reln)
|
||||
Assert(unlogged_build_rel == reln);
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("ending unlogged build of relation %u/%u/%u",
|
||||
(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
|
||||
RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
|
||||
|
||||
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
|
||||
@@ -2664,7 +2664,7 @@ neon_end_unlogged_build(SMgrRelation reln)
|
||||
rinfob = InfoBFromSMgrRel(reln);
|
||||
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
|
||||
{
|
||||
elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
|
||||
neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
|
||||
RelFileInfoFmt(InfoFromNInfoB(rinfob)),
|
||||
forknum);
|
||||
|
||||
@@ -2707,7 +2707,7 @@ AtEOXact_neon(XactEvent event, void *arg)
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INTERNAL_ERROR),
|
||||
(errmsg("unlogged index build was not properly finished"))));
|
||||
(errmsg(NEON_TAG "unlogged index build was not properly finished"))));
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -2806,14 +2806,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
set_cached_relsize(rinfo, forknum, relsize);
|
||||
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
|
||||
|
||||
elog(SmgrTrace, "Set length to %d", relsize);
|
||||
neon_log(SmgrTrace, "Set length to %d", relsize);
|
||||
}
|
||||
}
|
||||
|
||||
#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
|
||||
|
||||
/*
|
||||
* TODO: May be it is better to make correspondent fgunctio from freespace.c public?
|
||||
* TODO: May be it is better to make correspondent function from freespace.c public?
|
||||
*/
|
||||
static BlockNumber
|
||||
get_fsm_physical_block(BlockNumber heapblk)
|
||||
@@ -2894,7 +2894,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
|
||||
#if PG_VERSION_NUM < 150000
|
||||
if (!XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno))
|
||||
elog(PANIC, "failed to locate backup block with ID %d", block_id);
|
||||
neon_log(PANIC, "failed to locate backup block with ID %d", block_id);
|
||||
#else
|
||||
XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno);
|
||||
#endif
|
||||
|
||||
@@ -959,8 +959,8 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
}
|
||||
|
||||
/*
|
||||
* If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing
|
||||
* was committed yet. Start streaming then from the basebackup LSN.
|
||||
* If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping
|
||||
* and nothing was committed yet. Start streaming then from the basebackup LSN.
|
||||
*/
|
||||
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
|
||||
{
|
||||
@@ -973,12 +973,13 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
}
|
||||
|
||||
/*
|
||||
* If propEpochStartLsn is not 0, at least one msg with WAL was sent to
|
||||
* some connected safekeeper; it must have carried truncateLsn pointing to
|
||||
* the first record.
|
||||
* Safekeepers are setting truncateLsn after timelineStartLsn is known, so it
|
||||
* should never be zero at this point, if we know timelineStartLsn.
|
||||
*
|
||||
* timelineStartLsn can be zero only on the first syncSafekeepers run.
|
||||
*/
|
||||
Assert((wp->truncateLsn != InvalidXLogRecPtr) ||
|
||||
(wp->config->syncSafekeepers && wp->truncateLsn == wp->propEpochStartLsn));
|
||||
(wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn));
|
||||
|
||||
/*
|
||||
* We will be generating WAL since propEpochStartLsn, so we should set
|
||||
|
||||
@@ -5,7 +5,7 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = ["testing"]
|
||||
default = []
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
@@ -89,3 +89,4 @@ camino-tempfile.workspace = true
|
||||
rcgen.workspace = true
|
||||
rstest.workspace = true
|
||||
tokio-postgres-rustls.workspace = true
|
||||
walkdir.workspace = true
|
||||
|
||||
@@ -10,6 +10,7 @@ use crate::auth::credentials::check_peer_addr_is_in_list;
|
||||
use crate::auth::validate_password_and_exchange;
|
||||
use crate::cache::Cached;
|
||||
use crate::console::errors::GetAuthInfoError;
|
||||
use crate::console::provider::ConsoleBackend;
|
||||
use crate::console::AuthSecret;
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::proxy::connect_compute::handle_try_wake;
|
||||
@@ -43,11 +44,8 @@ use tracing::{error, info, warn};
|
||||
/// this helps us provide the credentials only to those auth
|
||||
/// backends which require them for the authentication process.
|
||||
pub enum BackendType<'a, T> {
|
||||
/// Current Cloud API (V2).
|
||||
Console(Cow<'a, console::provider::neon::Api>, T),
|
||||
/// Local mock of Cloud API (V2).
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(Cow<'a, console::provider::mock::Api>, T),
|
||||
/// Cloud API (V2).
|
||||
Console(Cow<'a, ConsoleBackend>, T),
|
||||
/// Authentication via a web browser.
|
||||
Link(Cow<'a, url::ApiUrl>),
|
||||
#[cfg(test)]
|
||||
@@ -64,9 +62,15 @@ impl std::fmt::Display for BackendType<'_, ()> {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
|
||||
Console(api, _) => match &**api {
|
||||
ConsoleBackend::Console(endpoint) => {
|
||||
fmt.debug_tuple("Console").field(&endpoint.url()).finish()
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
ConsoleBackend::Postgres(endpoint) => {
|
||||
fmt.debug_tuple("Postgres").field(&endpoint.url()).finish()
|
||||
}
|
||||
},
|
||||
Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
|
||||
#[cfg(test)]
|
||||
Test(_) => fmt.debug_tuple("Test").finish(),
|
||||
@@ -81,8 +85,6 @@ impl<T> BackendType<'_, T> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(c, x) => Console(Cow::Borrowed(c), x),
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
|
||||
Link(c) => Link(Cow::Borrowed(c)),
|
||||
#[cfg(test)]
|
||||
Test(x) => Test(*x),
|
||||
@@ -98,8 +100,6 @@ impl<'a, T> BackendType<'a, T> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(c, x) => Console(c, f(x)),
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(c, x) => Postgres(c, f(x)),
|
||||
Link(c) => Link(c),
|
||||
#[cfg(test)]
|
||||
Test(x) => Test(x),
|
||||
@@ -114,8 +114,6 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(c, x) => x.map(|x| Console(c, x)),
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(c, x) => x.map(|x| Postgres(c, x)),
|
||||
Link(c) => Ok(Link(c)),
|
||||
#[cfg(test)]
|
||||
Test(x) => Ok(Test(x)),
|
||||
@@ -325,8 +323,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
|
||||
|
||||
match self {
|
||||
Console(_, user_info) => user_info.project.clone(),
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(_, user_info) => user_info.project.clone(),
|
||||
Link(_) => Some("link".into()),
|
||||
#[cfg(test)]
|
||||
Test(_) => Some("test".into()),
|
||||
@@ -339,8 +335,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
|
||||
|
||||
match self {
|
||||
Console(_, user_info) => &user_info.user,
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(_, user_info) => &user_info.user,
|
||||
Link(_) => "link",
|
||||
#[cfg(test)]
|
||||
Test(_) => "test",
|
||||
@@ -371,19 +365,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
|
||||
.await?;
|
||||
(cache_info, BackendType::Console(api, user_info))
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(api, user_info) => {
|
||||
info!(
|
||||
user = &*user_info.user,
|
||||
project = user_info.project(),
|
||||
"performing authentication using a local postgres instance"
|
||||
);
|
||||
|
||||
let (cache_info, user_info) =
|
||||
auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config)
|
||||
.await?;
|
||||
(cache_info, BackendType::Postgres(api, user_info))
|
||||
}
|
||||
// NOTE: this auth backend doesn't use client credentials.
|
||||
Link(url) => {
|
||||
info!("performing link authentication");
|
||||
@@ -414,8 +395,6 @@ impl BackendType<'_, ComputeUserInfo> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(api, user_info) => api.get_allowed_ips(ctx, user_info).await,
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(api, user_info) => api.get_allowed_ips(ctx, user_info).await,
|
||||
Link(_) => Ok(Cached::new_uncached(Arc::new(vec![]))),
|
||||
#[cfg(test)]
|
||||
Test(x) => Ok(Cached::new_uncached(Arc::new(x.get_allowed_ips()?))),
|
||||
@@ -432,8 +411,6 @@ impl BackendType<'_, ComputeUserInfo> {
|
||||
|
||||
match self {
|
||||
Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await,
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await,
|
||||
Link(_) => Ok(None),
|
||||
#[cfg(test)]
|
||||
Test(x) => x.wake_compute().map(Some),
|
||||
|
||||
@@ -57,24 +57,31 @@ pub(super) async fn authenticate(
|
||||
link_uri: &reqwest::Url,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<NodeInfo> {
|
||||
let psql_session_id = new_psql_session_id();
|
||||
// registering waiter can fail if we get unlucky with rng.
|
||||
// just try again.
|
||||
let (psql_session_id, waiter) = loop {
|
||||
let psql_session_id = new_psql_session_id();
|
||||
|
||||
match console::mgmt::get_waiter(&psql_session_id) {
|
||||
Ok(waiter) => break (psql_session_id, waiter),
|
||||
Err(_e) => continue,
|
||||
}
|
||||
};
|
||||
|
||||
let span = info_span!("link", psql_session_id = &psql_session_id);
|
||||
let greeting = hello_message(link_uri, &psql_session_id);
|
||||
|
||||
let db_info = console::mgmt::with_waiter(psql_session_id, |waiter| async {
|
||||
// Give user a URL to spawn a new database.
|
||||
info!(parent: &span, "sending the auth URL to the user");
|
||||
client
|
||||
.write_message_noflush(&Be::AuthenticationOk)?
|
||||
.write_message_noflush(&Be::CLIENT_ENCODING)?
|
||||
.write_message(&Be::NoticeResponse(&greeting))
|
||||
.await?;
|
||||
// Give user a URL to spawn a new database.
|
||||
info!(parent: &span, "sending the auth URL to the user");
|
||||
client
|
||||
.write_message_noflush(&Be::AuthenticationOk)?
|
||||
.write_message_noflush(&Be::CLIENT_ENCODING)?
|
||||
.write_message(&Be::NoticeResponse(&greeting))
|
||||
.await?;
|
||||
|
||||
// Wait for web console response (see `mgmt`).
|
||||
info!(parent: &span, "waiting for console's reply...");
|
||||
waiter.await?.map_err(LinkAuthError::AuthFailed)
|
||||
})
|
||||
.await?;
|
||||
// Wait for web console response (see `mgmt`).
|
||||
info!(parent: &span, "waiting for console's reply...");
|
||||
let db_info = waiter.await.map_err(LinkAuthError::from)?;
|
||||
|
||||
client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
|
||||
|
||||
|
||||
@@ -249,12 +249,19 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
if let auth::BackendType::Console(api, _) = &config.auth_backend {
|
||||
let cache = api.caches.project_info.clone();
|
||||
if let Some(url) = args.redis_notifications {
|
||||
info!("Starting redis notifications listener ({url})");
|
||||
maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone()));
|
||||
match &**api {
|
||||
proxy::console::provider::ConsoleBackend::Console(api) => {
|
||||
let cache = api.caches.project_info.clone();
|
||||
if let Some(url) = args.redis_notifications {
|
||||
info!("Starting redis notifications listener ({url})");
|
||||
maintenance_tasks
|
||||
.spawn(notifications::task_main(url.to_owned(), cache.clone()));
|
||||
}
|
||||
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
proxy::console::provider::ConsoleBackend::Postgres(_) => {}
|
||||
}
|
||||
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
|
||||
}
|
||||
|
||||
let maintenance = loop {
|
||||
@@ -351,13 +358,15 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config));
|
||||
|
||||
let api = console::provider::neon::Api::new(endpoint, caches, locks);
|
||||
let api = console::provider::ConsoleBackend::Console(api);
|
||||
auth::BackendType::Console(Cow::Owned(api), ())
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
AuthBackend::Postgres => {
|
||||
let url = args.auth_endpoint.parse()?;
|
||||
let api = console::provider::mock::Api::new(url);
|
||||
auth::BackendType::Postgres(Cow::Owned(api), ())
|
||||
let api = console::provider::ConsoleBackend::Postgres(api);
|
||||
auth::BackendType::Console(Cow::Owned(api), ())
|
||||
}
|
||||
AuthBackend::Link => {
|
||||
let url = args.uri.parse()?;
|
||||
|
||||
2
proxy/src/cache/project_info.rs
vendored
2
proxy/src/cache/project_info.rs
vendored
@@ -266,7 +266,7 @@ impl ProjectInfoCacheImpl {
|
||||
tokio::time::interval(self.config.gc_interval / (self.cache.shards().len()) as u32);
|
||||
loop {
|
||||
interval.tick().await;
|
||||
if self.cache.len() <= self.config.size {
|
||||
if self.cache.len() < self.config.size {
|
||||
// If there are not too many entries, wait until the next gc cycle.
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -13,16 +13,10 @@ use tracing::{error, info, info_span, Instrument};
|
||||
static CPLANE_WAITERS: Lazy<Waiters<ComputeReady>> = Lazy::new(Default::default);
|
||||
|
||||
/// Give caller an opportunity to wait for the cloud's reply.
|
||||
pub async fn with_waiter<R, T, E>(
|
||||
pub fn get_waiter(
|
||||
psql_session_id: impl Into<String>,
|
||||
action: impl FnOnce(Waiter<'static, ComputeReady>) -> R,
|
||||
) -> Result<T, E>
|
||||
where
|
||||
R: std::future::Future<Output = Result<T, E>>,
|
||||
E: From<waiters::RegisterError>,
|
||||
{
|
||||
let waiter = CPLANE_WAITERS.register(psql_session_id.into())?;
|
||||
action(waiter).await
|
||||
) -> Result<Waiter<'static, ComputeReady>, waiters::RegisterError> {
|
||||
CPLANE_WAITERS.register(psql_session_id.into())
|
||||
}
|
||||
|
||||
pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> {
|
||||
@@ -77,7 +71,7 @@ async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
|
||||
}
|
||||
|
||||
/// A message received by `mgmt` when a compute node is ready.
|
||||
pub type ComputeReady = Result<DatabaseInfo, String>;
|
||||
pub type ComputeReady = DatabaseInfo;
|
||||
|
||||
// TODO: replace with an http-based protocol.
|
||||
struct MgmtHandler;
|
||||
@@ -102,7 +96,7 @@ fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), Qu
|
||||
let _enter = span.enter();
|
||||
info!("got response: {:?}", resp.result);
|
||||
|
||||
match notify(resp.session_id, Ok(resp.result)) {
|
||||
match notify(resp.session_id, resp.result) {
|
||||
Ok(()) => {
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
|
||||
|
||||
@@ -248,23 +248,75 @@ pub trait Api {
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
creds: &ComputeUserInfo,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<Option<CachedRoleSecret>, errors::GetAuthInfoError>;
|
||||
|
||||
async fn get_allowed_ips(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
creds: &ComputeUserInfo,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedAllowedIps, errors::GetAuthInfoError>;
|
||||
|
||||
/// Wake up the compute node and return the corresponding connection info.
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
creds: &ComputeUserInfo,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum ConsoleBackend {
|
||||
/// Current Cloud API (V2).
|
||||
Console(neon::Api),
|
||||
/// Local mock of Cloud API (V2).
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(mock::Api),
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Api for ConsoleBackend {
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<Option<CachedRoleSecret>, errors::GetAuthInfoError> {
|
||||
use ConsoleBackend::*;
|
||||
match self {
|
||||
Console(api) => api.get_role_secret(ctx, user_info).await,
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(api) => api.get_role_secret(ctx, user_info).await,
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_allowed_ips(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedAllowedIps, errors::GetAuthInfoError> {
|
||||
use ConsoleBackend::*;
|
||||
match self {
|
||||
Console(api) => api.get_allowed_ips(ctx, user_info).await,
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(api) => api.get_allowed_ips(ctx, user_info).await,
|
||||
}
|
||||
}
|
||||
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
|
||||
use ConsoleBackend::*;
|
||||
|
||||
match self {
|
||||
Console(api) => api.wake_compute(ctx, user_info).await,
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(api) => api.wake_compute(ctx, user_info).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Various caches for [`console`](super).
|
||||
pub struct ApiCaches {
|
||||
/// Cache for the `wake_compute` API method.
|
||||
|
||||
@@ -179,17 +179,18 @@ impl super::Api for Api {
|
||||
return Ok(Some(role_secret));
|
||||
}
|
||||
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
|
||||
let project_id = auth_info.project_id.unwrap_or(ep.clone());
|
||||
if let Some(secret) = &auth_info.secret {
|
||||
self.caches
|
||||
.project_info
|
||||
.insert_role_secret(&project_id, ep, user, secret.clone())
|
||||
if let Some(project_id) = auth_info.project_id {
|
||||
if let Some(secret) = &auth_info.secret {
|
||||
self.caches
|
||||
.project_info
|
||||
.insert_role_secret(&project_id, ep, user, secret.clone())
|
||||
}
|
||||
self.caches.project_info.insert_allowed_ips(
|
||||
&project_id,
|
||||
ep,
|
||||
Arc::new(auth_info.allowed_ips),
|
||||
);
|
||||
}
|
||||
self.caches.project_info.insert_allowed_ips(
|
||||
&project_id,
|
||||
ep,
|
||||
Arc::new(auth_info.allowed_ips),
|
||||
);
|
||||
// When we just got a secret, we don't need to invalidate it.
|
||||
Ok(auth_info.secret.map(Cached::new_uncached))
|
||||
}
|
||||
@@ -212,15 +213,16 @@ impl super::Api for Api {
|
||||
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
|
||||
let allowed_ips = Arc::new(auth_info.allowed_ips);
|
||||
let user = &user_info.user;
|
||||
let project_id = auth_info.project_id.unwrap_or(ep.clone());
|
||||
if let Some(secret) = &auth_info.secret {
|
||||
if let Some(project_id) = auth_info.project_id {
|
||||
if let Some(secret) = &auth_info.secret {
|
||||
self.caches
|
||||
.project_info
|
||||
.insert_role_secret(&project_id, ep, user, secret.clone())
|
||||
}
|
||||
self.caches
|
||||
.project_info
|
||||
.insert_role_secret(&project_id, ep, user, secret.clone())
|
||||
.insert_allowed_ips(&project_id, ep, allowed_ips.clone());
|
||||
}
|
||||
self.caches
|
||||
.project_info
|
||||
.insert_allowed_ips(&project_id, ep, allowed_ips.clone());
|
||||
Ok(Cached::new_uncached(allowed_ips))
|
||||
}
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ pub struct RequestMonitoring {
|
||||
user: Option<SmolStr>,
|
||||
application: Option<SmolStr>,
|
||||
error_kind: Option<ErrorKind>,
|
||||
success: bool,
|
||||
|
||||
// extra
|
||||
// This sender is here to keep the request monitoring channel open while requests are taking place.
|
||||
@@ -59,6 +60,7 @@ impl RequestMonitoring {
|
||||
user: None,
|
||||
application: None,
|
||||
error_kind: None,
|
||||
success: false,
|
||||
|
||||
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
|
||||
latency_timer: LatencyTimer::new(protocol),
|
||||
@@ -96,6 +98,10 @@ impl RequestMonitoring {
|
||||
self.user = Some(user);
|
||||
}
|
||||
|
||||
pub fn set_success(&mut self) {
|
||||
self.success = true;
|
||||
}
|
||||
|
||||
pub fn log(&mut self) {
|
||||
if let Some(tx) = self.sender.take() {
|
||||
let _: Result<(), _> = tx.send(self.clone());
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use std::sync::Arc;
|
||||
use std::{sync::Arc, time::SystemTime};
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::BytesMut;
|
||||
use chrono::{Datelike, Timelike};
|
||||
use futures::{Stream, StreamExt};
|
||||
use parquet::{
|
||||
basic::Compression,
|
||||
@@ -86,6 +87,12 @@ struct RequestData {
|
||||
project: Option<String>,
|
||||
branch: Option<String>,
|
||||
error: Option<&'static str>,
|
||||
/// Success is counted if we form a HTTP response with sql rows inside
|
||||
/// Or if we make it to proxy_pass
|
||||
success: bool,
|
||||
/// Tracks time from session start (HTTP request/libpq TCP handshake)
|
||||
/// Through to success/failure
|
||||
duration_us: u64,
|
||||
}
|
||||
|
||||
impl From<RequestMonitoring> for RequestData {
|
||||
@@ -102,6 +109,11 @@ impl From<RequestMonitoring> for RequestData {
|
||||
protocol: value.protocol,
|
||||
region: value.region,
|
||||
error: value.error_kind.as_ref().map(|e| e.to_str()),
|
||||
success: value.success,
|
||||
duration_us: SystemTime::from(value.first_packet)
|
||||
.elapsed()
|
||||
.unwrap_or_default()
|
||||
.as_micros() as u64, // 584 millenia... good enough
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -266,7 +278,13 @@ async fn upload_parquet(
|
||||
|
||||
let compression = len as f64 / len_uncompressed as f64;
|
||||
let size = data.len();
|
||||
let id = uuid::Uuid::now_v7();
|
||||
let now = chrono::Utc::now();
|
||||
let id = uuid::Uuid::new_v7(uuid::Timestamp::from_unix(
|
||||
uuid::NoContext,
|
||||
// we won't be running this in 1970. this cast is ok
|
||||
now.timestamp() as u64,
|
||||
now.timestamp_subsec_nanos(),
|
||||
));
|
||||
|
||||
info!(
|
||||
%id,
|
||||
@@ -274,7 +292,14 @@ async fn upload_parquet(
|
||||
size, compression, "uploading request parquet file"
|
||||
);
|
||||
|
||||
let path = RemotePath::from_string(&format!("requests_{id}.parquet"))?;
|
||||
let year = now.year();
|
||||
let month = now.month();
|
||||
let day = now.day();
|
||||
let hour = now.hour();
|
||||
// segment files by time for S3 performance
|
||||
let path = RemotePath::from_string(&format!(
|
||||
"{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
|
||||
))?;
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
|
||||
@@ -332,6 +357,7 @@ mod tests {
|
||||
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
|
||||
};
|
||||
use tokio::{sync::mpsc, time};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData};
|
||||
|
||||
@@ -420,6 +446,8 @@ mod tests {
|
||||
protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
|
||||
region: "us-east-1",
|
||||
error: None,
|
||||
success: rng.gen(),
|
||||
duration_us: rng.gen_range(0..30_000_000),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -442,9 +470,11 @@ mod tests {
|
||||
|
||||
worker_inner(storage, rx, config).await.unwrap();
|
||||
|
||||
let mut files = std::fs::read_dir(tmpdir.as_std_path())
|
||||
.unwrap()
|
||||
.map(|entry| entry.unwrap().path())
|
||||
let mut files = WalkDir::new(tmpdir.as_std_path())
|
||||
.into_iter()
|
||||
.filter_map(|entry| entry.ok())
|
||||
.filter(|entry| entry.file_type().is_file())
|
||||
.map(|entry| entry.path().to_path_buf())
|
||||
.collect_vec();
|
||||
files.sort();
|
||||
|
||||
@@ -485,15 +515,15 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1029153, 3, 6000),
|
||||
(1029075, 3, 6000),
|
||||
(1029216, 3, 6000),
|
||||
(1029129, 3, 6000),
|
||||
(1029250, 3, 6000),
|
||||
(1029017, 3, 6000),
|
||||
(1029175, 3, 6000),
|
||||
(1029247, 3, 6000),
|
||||
(343124, 1, 2000)
|
||||
(1087635, 3, 6000),
|
||||
(1087288, 3, 6000),
|
||||
(1087444, 3, 6000),
|
||||
(1087572, 3, 6000),
|
||||
(1087468, 3, 6000),
|
||||
(1087500, 3, 6000),
|
||||
(1087533, 3, 6000),
|
||||
(1087566, 3, 6000),
|
||||
(362671, 1, 2000)
|
||||
],
|
||||
);
|
||||
|
||||
@@ -523,11 +553,11 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1166201, 6, 12000),
|
||||
(1163577, 6, 12000),
|
||||
(1164641, 6, 12000),
|
||||
(1168772, 6, 12000),
|
||||
(196761, 1, 2000)
|
||||
(1028637, 5, 10000),
|
||||
(1031969, 5, 10000),
|
||||
(1019900, 5, 10000),
|
||||
(1020365, 5, 10000),
|
||||
(1025010, 5, 10000)
|
||||
],
|
||||
);
|
||||
|
||||
@@ -559,11 +589,11 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1144934, 6, 12000),
|
||||
(1144941, 6, 12000),
|
||||
(1144735, 6, 12000),
|
||||
(1144936, 6, 12000),
|
||||
(191035, 1, 2000)
|
||||
(1210770, 6, 12000),
|
||||
(1211036, 6, 12000),
|
||||
(1210990, 6, 12000),
|
||||
(1210861, 6, 12000),
|
||||
(202073, 1, 2000)
|
||||
],
|
||||
);
|
||||
|
||||
@@ -588,15 +618,15 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1029153, 3, 6000),
|
||||
(1029075, 3, 6000),
|
||||
(1029216, 3, 6000),
|
||||
(1029129, 3, 6000),
|
||||
(1029250, 3, 6000),
|
||||
(1029017, 3, 6000),
|
||||
(1029175, 3, 6000),
|
||||
(1029247, 3, 6000),
|
||||
(343124, 1, 2000)
|
||||
(1087635, 3, 6000),
|
||||
(1087288, 3, 6000),
|
||||
(1087444, 3, 6000),
|
||||
(1087572, 3, 6000),
|
||||
(1087468, 3, 6000),
|
||||
(1087500, 3, 6000),
|
||||
(1087533, 3, 6000),
|
||||
(1087566, 3, 6000),
|
||||
(362671, 1, 2000)
|
||||
],
|
||||
);
|
||||
|
||||
@@ -633,7 +663,7 @@ mod tests {
|
||||
// files are smaller than the size threshold, but they took too long to fill so were flushed early
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[(515807, 2, 3001), (515585, 2, 3000), (515425, 2, 2999)],
|
||||
[(545264, 2, 3001), (545025, 2, 3000), (544857, 2, 2999)],
|
||||
);
|
||||
|
||||
tmpdir.close().unwrap();
|
||||
|
||||
@@ -356,6 +356,7 @@ pub async fn proxy_pass(
|
||||
compute: impl AsyncRead + AsyncWrite + Unpin,
|
||||
aux: MetricsAuxInfo,
|
||||
) -> anyhow::Result<()> {
|
||||
ctx.set_success();
|
||||
ctx.log();
|
||||
|
||||
let usage = USAGE_METRICS.register(Ids {
|
||||
|
||||
@@ -160,8 +160,6 @@ where
|
||||
let node_info = loop {
|
||||
let wake_res = match user_info {
|
||||
auth::BackendType::Console(api, user_info) => api.wake_compute(ctx, user_info).await,
|
||||
#[cfg(feature = "testing")]
|
||||
auth::BackendType::Postgres(api, user_info) => api.wake_compute(ctx, user_info).await,
|
||||
// nothing to do?
|
||||
auth::BackendType::Link(_) => return Err(err.into()),
|
||||
// test backend
|
||||
|
||||
@@ -46,14 +46,11 @@ enum Notification {
|
||||
}
|
||||
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
|
||||
struct AllowedIpsUpdate {
|
||||
#[serde(rename = "project")]
|
||||
project_id: SmolStr,
|
||||
}
|
||||
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
|
||||
struct PasswordUpdate {
|
||||
#[serde(rename = "project")]
|
||||
project_id: SmolStr,
|
||||
#[serde(rename = "role")]
|
||||
role_name: SmolStr,
|
||||
}
|
||||
fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
|
||||
@@ -151,7 +148,7 @@ mod tests {
|
||||
#[test]
|
||||
fn parse_allowed_ips() -> anyhow::Result<()> {
|
||||
let project_id = "new_project".to_string();
|
||||
let data = format!("{{\"project\": \"{project_id}\"}}");
|
||||
let data = format!("{{\"project_id\": \"{project_id}\"}}");
|
||||
let text = json!({
|
||||
"type": "message",
|
||||
"topic": "/allowed_ips_updated",
|
||||
@@ -177,7 +174,7 @@ mod tests {
|
||||
fn parse_password_updated() -> anyhow::Result<()> {
|
||||
let project_id = "new_project".to_string();
|
||||
let role_name = "new_role".to_string();
|
||||
let data = format!("{{\"project\": \"{project_id}\", \"role\": \"{role_name}\"}}");
|
||||
let data = format!("{{\"project_id\": \"{project_id}\", \"role_name\": \"{role_name}\"}}");
|
||||
let text = json!({
|
||||
"type": "message",
|
||||
"topic": "/password_updated",
|
||||
|
||||
@@ -26,7 +26,7 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
|
||||
|
||||
use crate::{
|
||||
auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
|
||||
console,
|
||||
console::{self, messages::MetricsAuxInfo},
|
||||
context::RequestMonitoring,
|
||||
metrics::NUM_DB_CONNECTIONS_GAUGE,
|
||||
proxy::connect_compute::ConnectMechanism,
|
||||
@@ -362,6 +362,7 @@ impl GlobalConnPool {
|
||||
|
||||
// ok return cached connection if found and establish a new one otherwise
|
||||
let new_client = if let Some(client) = client {
|
||||
ctx.set_project(client.aux.clone());
|
||||
if client.inner.is_closed() {
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
|
||||
@@ -593,10 +594,6 @@ async fn connect_to_compute_once(
|
||||
span.in_scope(|| {
|
||||
info!(%conn_info, %session, "new connection");
|
||||
});
|
||||
let ids = Ids {
|
||||
endpoint_id: node_info.aux.endpoint_id.clone(),
|
||||
branch_id: node_info.aux.branch_id.clone(),
|
||||
};
|
||||
|
||||
let db_user = conn_info.db_and_user();
|
||||
tokio::spawn(
|
||||
@@ -664,7 +661,7 @@ async fn connect_to_compute_once(
|
||||
Ok(ClientInner {
|
||||
inner: client,
|
||||
session: tx,
|
||||
ids,
|
||||
aux: node_info.aux.clone(),
|
||||
conn_id,
|
||||
})
|
||||
}
|
||||
@@ -672,13 +669,17 @@ async fn connect_to_compute_once(
|
||||
struct ClientInner {
|
||||
inner: tokio_postgres::Client,
|
||||
session: tokio::sync::watch::Sender<uuid::Uuid>,
|
||||
ids: Ids,
|
||||
aux: MetricsAuxInfo,
|
||||
conn_id: uuid::Uuid,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub fn metrics(&self) -> Arc<MetricCounter> {
|
||||
USAGE_METRICS.register(self.inner.as_ref().unwrap().ids.clone())
|
||||
let aux = &self.inner.as_ref().unwrap().aux;
|
||||
USAGE_METRICS.register(Ids {
|
||||
endpoint_id: aux.endpoint_id.clone(),
|
||||
branch_id: aux.branch_id.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -497,6 +497,7 @@ async fn handle_inner(
|
||||
}
|
||||
};
|
||||
|
||||
ctx.set_success();
|
||||
ctx.log();
|
||||
let metrics = client.metrics();
|
||||
|
||||
|
||||
@@ -288,34 +288,32 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
|
||||
}
|
||||
|
||||
/// Deactivates the timeline and removes its data directory.
|
||||
async fn timeline_delete_force_handler(
|
||||
mut request: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
ensure_no_body(&mut request).await?;
|
||||
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
|
||||
// error handling here when we're able to.
|
||||
let resp = GlobalTimelines::delete_force(&ttid)
|
||||
let resp = GlobalTimelines::delete(&ttid, only_local)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, resp)
|
||||
}
|
||||
|
||||
/// Deactivates all timelines for the tenant and removes its data directory.
|
||||
/// See `timeline_delete_force_handler`.
|
||||
async fn tenant_delete_force_handler(
|
||||
mut request: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
/// See `timeline_delete_handler`.
|
||||
async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id = parse_request_param(&request, "tenant_id")?;
|
||||
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
ensure_no_body(&mut request).await?;
|
||||
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
|
||||
// Using an `InternalServerError` should be fixed when the types support it
|
||||
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id)
|
||||
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(
|
||||
@@ -512,10 +510,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
request_span(r, timeline_status_handler)
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
request_span(r, timeline_delete_force_handler)
|
||||
request_span(r, timeline_delete_handler)
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
request_span(r, tenant_delete_force_handler)
|
||||
request_span(r, tenant_delete_handler)
|
||||
})
|
||||
.post("/v1/pull_timeline", |r| {
|
||||
request_span(r, timeline_pull_handler)
|
||||
|
||||
@@ -88,6 +88,10 @@ impl SafeKeeperConf {
|
||||
self.tenant_dir(&ttid.tenant_id)
|
||||
.join(ttid.timeline_id.to_string())
|
||||
}
|
||||
|
||||
pub fn is_wal_backup_enabled(&self) -> bool {
|
||||
self.remote_storage.is_some() && self.wal_backup_enabled
|
||||
}
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
|
||||
@@ -742,6 +742,11 @@ where
|
||||
state.timeline_start_lsn
|
||||
);
|
||||
}
|
||||
if state.peer_horizon_lsn == Lsn(0) {
|
||||
// Update peer_horizon_lsn as soon as we know where timeline starts.
|
||||
// It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn.
|
||||
state.peer_horizon_lsn = msg.timeline_start_lsn;
|
||||
}
|
||||
if state.local_start_lsn == Lsn(0) {
|
||||
state.local_start_lsn = msg.start_streaming_at;
|
||||
info!("setting local_start_lsn to {:?}", state.local_start_lsn);
|
||||
|
||||
@@ -407,7 +407,7 @@ impl SafekeeperPostgresHandler {
|
||||
self.conf.timeline_dir(&tli.ttid),
|
||||
&persisted_state,
|
||||
start_pos,
|
||||
self.conf.wal_backup_enabled,
|
||||
self.conf.is_wal_backup_enabled(),
|
||||
)?;
|
||||
|
||||
// Split to concurrently receive and send data; replies are generally
|
||||
|
||||
@@ -33,12 +33,13 @@ use crate::safekeeper::{
|
||||
};
|
||||
use crate::send_wal::WalSenders;
|
||||
use crate::state::{TimelineMemState, TimelinePersistentState};
|
||||
use crate::wal_backup::{self};
|
||||
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
|
||||
|
||||
use crate::metrics::FullTimelineInfo;
|
||||
use crate::wal_storage::Storage as wal_storage_iface;
|
||||
use crate::SafeKeeperConf;
|
||||
use crate::{debug_dump, wal_storage};
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
/// Things safekeeper should know about timeline state on peers.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -471,14 +472,29 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
/// Delete timeline from disk completely, by removing timeline directory. Background
|
||||
/// timeline activities will stop eventually.
|
||||
pub async fn delete_from_disk(
|
||||
/// Delete timeline from disk completely, by removing timeline directory.
|
||||
/// Background timeline activities will stop eventually.
|
||||
///
|
||||
/// Also deletes WAL in s3. Might fail if e.g. s3 is unavailable, but
|
||||
/// deletion API endpoint is retriable.
|
||||
pub async fn delete(
|
||||
&self,
|
||||
shared_state: &mut MutexGuard<'_, SharedState>,
|
||||
only_local: bool,
|
||||
) -> Result<(bool, bool)> {
|
||||
let was_active = shared_state.active;
|
||||
self.cancel(shared_state);
|
||||
|
||||
// TODO: It's better to wait for s3 offloader termination before
|
||||
// removing data from s3. Though since s3 doesn't have transactions it
|
||||
// still wouldn't guarantee absense of data after removal.
|
||||
let conf = GlobalTimelines::get_global_config();
|
||||
if !only_local && conf.is_wal_backup_enabled() {
|
||||
// Note: we concurrently delete remote storage data from multiple
|
||||
// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we
|
||||
// do some retries anyway.
|
||||
wal_backup::delete_timeline(&self.ttid).await?;
|
||||
}
|
||||
let dir_existed = delete_dir(&self.timeline_dir).await?;
|
||||
Ok((dir_existed, was_active))
|
||||
}
|
||||
|
||||
@@ -327,16 +327,20 @@ impl GlobalTimelines {
|
||||
}
|
||||
|
||||
/// Cancels timeline, then deletes the corresponding data directory.
|
||||
pub async fn delete_force(ttid: &TenantTimelineId) -> Result<TimelineDeleteForceResult> {
|
||||
/// If only_local, doesn't remove WAL segments in remote storage.
|
||||
pub async fn delete(
|
||||
ttid: &TenantTimelineId,
|
||||
only_local: bool,
|
||||
) -> Result<TimelineDeleteForceResult> {
|
||||
let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
|
||||
match tli_res {
|
||||
Ok(timeline) => {
|
||||
// Take a lock and finish the deletion holding this mutex.
|
||||
let mut shared_state = timeline.write_shared_state().await;
|
||||
|
||||
info!("deleting timeline {}", ttid);
|
||||
info!("deleting timeline {}, only_local={}", ttid, only_local);
|
||||
let (dir_existed, was_active) =
|
||||
timeline.delete_from_disk(&mut shared_state).await?;
|
||||
timeline.delete(&mut shared_state, only_local).await?;
|
||||
|
||||
// Remove timeline from the map.
|
||||
// FIXME: re-enable it once we fix the issue with recreation of deleted timelines
|
||||
@@ -369,8 +373,11 @@ impl GlobalTimelines {
|
||||
/// the tenant had, `true` if a timeline was active. There may be a race if new timelines are
|
||||
/// created simultaneously. In that case the function will return error and the caller should
|
||||
/// retry tenant deletion again later.
|
||||
///
|
||||
/// If only_local, doesn't remove WAL segments in remote storage.
|
||||
pub async fn delete_force_all_for_tenant(
|
||||
tenant_id: &TenantId,
|
||||
only_local: bool,
|
||||
) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
|
||||
info!("deleting all timelines for tenant {}", tenant_id);
|
||||
let to_delete = Self::get_all_for_tenant(*tenant_id);
|
||||
@@ -379,7 +386,7 @@ impl GlobalTimelines {
|
||||
|
||||
let mut deleted = HashMap::new();
|
||||
for tli in &to_delete {
|
||||
match Self::delete_force(&tli.ttid).await {
|
||||
match Self::delete(&tli.ttid, only_local).await {
|
||||
Ok(result) => {
|
||||
deleted.insert(tli.ttid, result);
|
||||
}
|
||||
|
||||
@@ -4,6 +4,8 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::stream::FuturesOrdered;
|
||||
use futures::StreamExt;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::backoff;
|
||||
use utils::id::NodeId;
|
||||
|
||||
use std::cmp::min;
|
||||
@@ -166,6 +168,17 @@ async fn update_task(
|
||||
}
|
||||
}
|
||||
|
||||
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
|
||||
|
||||
// Storage must be configured and initialized when this is called.
|
||||
fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
|
||||
REMOTE_STORAGE
|
||||
.get()
|
||||
.expect("failed to get remote storage")
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
|
||||
|
||||
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
|
||||
@@ -199,7 +212,7 @@ pub async fn wal_backup_launcher_task_main(
|
||||
ttid = wal_backup_launcher_rx.recv() => {
|
||||
// channel is never expected to get closed
|
||||
let ttid = ttid.unwrap();
|
||||
if conf.remote_storage.is_none() || !conf.wal_backup_enabled {
|
||||
if !conf.is_wal_backup_enabled() {
|
||||
continue; /* just drain the channel and do nothing */
|
||||
}
|
||||
async {
|
||||
@@ -484,18 +497,12 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec<Segment> {
|
||||
res
|
||||
}
|
||||
|
||||
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
|
||||
|
||||
async fn backup_object(
|
||||
source_file: &Utf8Path,
|
||||
target_file: &RemotePath,
|
||||
size: usize,
|
||||
) -> Result<()> {
|
||||
let storage = REMOTE_STORAGE
|
||||
.get()
|
||||
.expect("failed to get remote storage")
|
||||
.as_ref()
|
||||
.unwrap();
|
||||
let storage = get_configured_remote_storage();
|
||||
|
||||
let file = File::open(&source_file)
|
||||
.await
|
||||
@@ -532,6 +539,39 @@ pub async fn read_object(
|
||||
Ok(Box::pin(reader))
|
||||
}
|
||||
|
||||
/// Delete WAL files for the given timeline. Remote storage must be configured
|
||||
/// when called.
|
||||
pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
|
||||
let storage = get_configured_remote_storage();
|
||||
let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string());
|
||||
let remote_path = RemotePath::new(&ttid_path)?;
|
||||
|
||||
// A backoff::retry is used here for two reasons:
|
||||
// - To provide a backoff rather than busy-polling the API on errors
|
||||
// - To absorb transient 429/503 conditions without hitting our error
|
||||
// logging path for issues deleting objects.
|
||||
//
|
||||
// Note: listing segments might take a long time if there are many of them.
|
||||
// We don't currently have http requests timeout cancellation, but if/once
|
||||
// we have listing should get streaming interface to make progress.
|
||||
let token = CancellationToken::new(); // not really used
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let files = storage.list_files(Some(&remote_path)).await?;
|
||||
storage.delete_objects(&files).await?;
|
||||
Ok(())
|
||||
},
|
||||
|_| false,
|
||||
3,
|
||||
10,
|
||||
"executing WAL segments deletion batch",
|
||||
backoff::Cancel::new(token, || anyhow::anyhow!("canceled")),
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Copy segments from one timeline to another. Used in copy_timeline.
|
||||
pub async fn copy_s3_segments(
|
||||
wal_seg_size: usize,
|
||||
|
||||
@@ -12,9 +12,11 @@ from pathlib import Path
|
||||
# Type-related stuff
|
||||
from typing import Callable, ClassVar, Dict, Iterator, Optional
|
||||
|
||||
import allure
|
||||
import pytest
|
||||
from _pytest.config import Config
|
||||
from _pytest.config.argparsing import Parser
|
||||
from _pytest.fixtures import FixtureRequest
|
||||
from _pytest.terminal import TerminalReporter
|
||||
|
||||
from fixtures.log_helper import log
|
||||
@@ -411,7 +413,10 @@ class NeonBenchmarker:
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def zenbenchmark(record_property: Callable[[str, object], None]) -> Iterator[NeonBenchmarker]:
|
||||
def zenbenchmark(
|
||||
request: FixtureRequest,
|
||||
record_property: Callable[[str, object], None],
|
||||
) -> Iterator[NeonBenchmarker]:
|
||||
"""
|
||||
This is a python decorator for benchmark fixtures. It contains functions for
|
||||
recording measurements, and prints them out at the end.
|
||||
@@ -419,6 +424,21 @@ def zenbenchmark(record_property: Callable[[str, object], None]) -> Iterator[Neo
|
||||
benchmarker = NeonBenchmarker(record_property)
|
||||
yield benchmarker
|
||||
|
||||
results = {}
|
||||
for _, recorded_property in request.node.user_properties:
|
||||
name = recorded_property["name"]
|
||||
value = str(recorded_property["value"])
|
||||
if (unit := recorded_property["unit"].strip()) != "":
|
||||
value += f" {unit}"
|
||||
results[name] = value
|
||||
|
||||
content = json.dumps(results, indent=2)
|
||||
allure.attach(
|
||||
content,
|
||||
"benchmarks.json",
|
||||
allure.attachment_type.JSON,
|
||||
)
|
||||
|
||||
|
||||
def pytest_addoption(parser: Parser):
|
||||
parser.addoption(
|
||||
|
||||
@@ -3352,9 +3352,15 @@ class SafekeeperHttpClient(requests.Session):
|
||||
)
|
||||
res.raise_for_status()
|
||||
|
||||
def timeline_delete_force(self, tenant_id: TenantId, timeline_id: TimelineId) -> Dict[Any, Any]:
|
||||
# only_local doesn't remove segments in the remote storage.
|
||||
def timeline_delete(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
|
||||
) -> Dict[Any, Any]:
|
||||
res = self.delete(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
|
||||
params={
|
||||
"only_local": str(only_local).lower(),
|
||||
},
|
||||
)
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
import time
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
|
||||
from fixtures.remote_storage import RemoteStorageKind, S3Storage
|
||||
from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage
|
||||
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
@@ -233,23 +233,18 @@ def timeline_delete_wait_completed(
|
||||
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# TODO avoid by combining remote storage related stuff in single type
|
||||
# and just passing in this type instead of whole builder
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
|
||||
# remote_storage must not be None, but that's easier for callers to make mypy happy
|
||||
def assert_prefix_empty(
|
||||
neon_env_builder: "NeonEnvBuilder",
|
||||
remote_storage: Optional[RemoteStorage],
|
||||
prefix: Optional[str] = None,
|
||||
allowed_postfix: Optional[str] = None,
|
||||
):
|
||||
response = list_prefix(neon_env_builder, prefix)
|
||||
assert remote_storage is not None
|
||||
response = list_prefix(remote_storage, prefix)
|
||||
keys = response["KeyCount"]
|
||||
objects: List[ObjectTypeDef] = response.get("Contents", [])
|
||||
common_prefixes = response.get("CommonPrefixes", [])
|
||||
|
||||
remote_storage = neon_env_builder.pageserver_remote_storage
|
||||
is_mock_s3 = isinstance(remote_storage, S3Storage) and not remote_storage.cleanup
|
||||
|
||||
if is_mock_s3:
|
||||
@@ -283,19 +278,20 @@ def assert_prefix_empty(
|
||||
), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
|
||||
|
||||
|
||||
def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
|
||||
response = list_prefix(neon_env_builder, prefix)
|
||||
# remote_storage must not be None, but that's easier for callers to make mypy happy
|
||||
def assert_prefix_not_empty(remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None):
|
||||
assert remote_storage is not None
|
||||
response = list_prefix(remote_storage, prefix)
|
||||
assert response["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {response}"
|
||||
|
||||
|
||||
def list_prefix(
|
||||
neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/"
|
||||
remote: RemoteStorage, prefix: Optional[str] = None, delimiter: str = "/"
|
||||
) -> ListObjectsV2OutputTypeDef:
|
||||
"""
|
||||
Note that this function takes into account prefix_in_bucket.
|
||||
"""
|
||||
# For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
|
||||
remote = neon_env_builder.pageserver_remote_storage
|
||||
assert isinstance(remote, S3Storage), "localfs is currently not supported"
|
||||
assert remote.client is not None
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import enum
|
||||
import time
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
@@ -119,6 +120,19 @@ class EvictionEnv:
|
||||
for tid, tlid in self.timelines
|
||||
}
|
||||
|
||||
def count_layers_per_tenant(self, pageserver: NeonPageserver) -> Dict[TenantId, int]:
|
||||
ret: Counter[TenantId] = Counter()
|
||||
|
||||
for tenant_id, timeline_id in self.timelines:
|
||||
timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
|
||||
assert timeline_dir.exists()
|
||||
for file in timeline_dir.iterdir():
|
||||
if "__" not in file.name:
|
||||
continue
|
||||
ret[tenant_id] += 1
|
||||
|
||||
return dict(ret)
|
||||
|
||||
def warm_up_tenant(self, tenant_id: TenantId):
|
||||
"""
|
||||
Start a read-only compute at the LSN after pgbench -i, and run pgbench -S against it.
|
||||
@@ -503,6 +517,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
|
||||
|
||||
(total_on_disk, _, _) = env.timelines_du(env.pageserver)
|
||||
du_by_timeline = env.du_by_timeline(env.pageserver)
|
||||
tenant_layers = env.count_layers_per_tenant(env.pageserver)
|
||||
|
||||
# pick smaller or greater (iteration order is insertion order of scale=4 and scale=6)
|
||||
[warm, cold] = list(du_by_timeline.keys())
|
||||
@@ -556,8 +571,31 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
|
||||
cold_size < cold_upper
|
||||
), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
|
||||
else:
|
||||
# just go with the space was freed, find proper limits later
|
||||
pass
|
||||
# with relative order what matters is the amount of layers, with a
|
||||
# fudge factor of whether the eviction bothers tenants with highest
|
||||
# layer count the most. last accessed times between tenants does not
|
||||
# matter.
|
||||
layers_now = env.count_layers_per_tenant(env.pageserver)
|
||||
|
||||
expected_ratio = later_total_on_disk / total_on_disk
|
||||
log.info(
|
||||
f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio"
|
||||
)
|
||||
|
||||
for tenant_id, original_count in tenant_layers.items():
|
||||
count_now = layers_now[tenant_id]
|
||||
ratio = count_now / original_count
|
||||
abs_diff = abs(ratio - expected_ratio)
|
||||
assert original_count > count_now
|
||||
log.info(
|
||||
f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < 0.1"
|
||||
)
|
||||
|
||||
# in this test case both relative_spare and relative_equal produce
|
||||
# the same outcomes; this must be a quantization effect of similar
|
||||
# sizes (-s4 and -s6) and small (5MB) layer size.
|
||||
# for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02
|
||||
assert abs_diff < 0.05
|
||||
|
||||
|
||||
def poor_mans_du(
|
||||
|
||||
@@ -1,10 +1,18 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn
|
||||
from fixtures.pageserver.utils import (
|
||||
wait_for_last_record_lsn,
|
||||
)
|
||||
from fixtures.remote_storage import RemoteStorageKind
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import query_scalar
|
||||
|
||||
|
||||
# Test restarting page server, while safekeeper and compute node keep
|
||||
# running.
|
||||
def test_next_xid(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -52,3 +60,161 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder):
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT count(*) FROM t")
|
||||
assert cur.fetchone() == (iterations,)
|
||||
|
||||
|
||||
# Test for a bug we had, where nextXid was incorrectly updated when the
|
||||
# XID counter reached 2 billion. The nextXid tracking logic incorrectly
|
||||
# treated 0 (InvalidTransactionId) as a regular XID, and after reaching
|
||||
# 2 billion, it started to look like a very new XID, which caused nextXid
|
||||
# to be immediately advanced to the next epoch.
|
||||
#
|
||||
def test_import_at_2bil(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
test_output_dir: Path,
|
||||
pg_distrib_dir: Path,
|
||||
pg_bin,
|
||||
vanilla_pg,
|
||||
):
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
env = neon_env_builder.init_start()
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
# Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
|
||||
# PgBin sets it automatically, but here we need to pipe psql output to the tar command.
|
||||
psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
|
||||
|
||||
# Reset the vanilla Postgres instance to somewhat before 2 billion transactions.
|
||||
pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
|
||||
cmd = [pg_resetwal_path, "--next-transaction-id=2129920000", "-D", str(vanilla_pg.pgdatadir)]
|
||||
pg_bin.run_capture(cmd, env=psql_env)
|
||||
|
||||
vanilla_pg.start()
|
||||
vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
|
||||
vanilla_pg.safe_psql(
|
||||
"""create table tt as select 'long string to consume some space' || g
|
||||
from generate_series(1,300000) g"""
|
||||
)
|
||||
assert vanilla_pg.safe_psql("select count(*) from tt") == [(300000,)]
|
||||
vanilla_pg.safe_psql("CREATE TABLE t (t text);")
|
||||
vanilla_pg.safe_psql("INSERT INTO t VALUES ('inserted in vanilla')")
|
||||
|
||||
endpoint_id = "ep-import_from_vanilla"
|
||||
tenant = TenantId.generate()
|
||||
timeline = TimelineId.generate()
|
||||
|
||||
env.pageserver.tenant_create(tenant)
|
||||
|
||||
# Take basebackup
|
||||
basebackup_dir = os.path.join(test_output_dir, "basebackup")
|
||||
base_tar = os.path.join(basebackup_dir, "base.tar")
|
||||
wal_tar = os.path.join(basebackup_dir, "pg_wal.tar")
|
||||
os.mkdir(basebackup_dir)
|
||||
vanilla_pg.safe_psql("CHECKPOINT")
|
||||
pg_bin.run(
|
||||
[
|
||||
"pg_basebackup",
|
||||
"-F",
|
||||
"tar",
|
||||
"-d",
|
||||
vanilla_pg.connstr(),
|
||||
"-D",
|
||||
basebackup_dir,
|
||||
]
|
||||
)
|
||||
|
||||
# Get start_lsn and end_lsn
|
||||
with open(os.path.join(basebackup_dir, "backup_manifest")) as f:
|
||||
manifest = json.load(f)
|
||||
start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"]
|
||||
end_lsn = manifest["WAL-Ranges"][0]["End-LSN"]
|
||||
|
||||
def import_tar(base, wal):
|
||||
env.neon_cli.raw_cli(
|
||||
[
|
||||
"timeline",
|
||||
"import",
|
||||
"--tenant-id",
|
||||
str(tenant),
|
||||
"--timeline-id",
|
||||
str(timeline),
|
||||
"--node-name",
|
||||
endpoint_id,
|
||||
"--base-lsn",
|
||||
start_lsn,
|
||||
"--base-tarfile",
|
||||
base,
|
||||
"--end-lsn",
|
||||
end_lsn,
|
||||
"--wal-tarfile",
|
||||
wal,
|
||||
"--pg-version",
|
||||
env.pg_version,
|
||||
]
|
||||
)
|
||||
|
||||
# Importing correct backup works
|
||||
import_tar(base_tar, wal_tar)
|
||||
wait_for_last_record_lsn(ps_http, tenant, timeline, Lsn(end_lsn))
|
||||
|
||||
endpoint = env.endpoints.create_start(
|
||||
endpoint_id,
|
||||
tenant_id=tenant,
|
||||
config_lines=[
|
||||
"log_autovacuum_min_duration = 0",
|
||||
"autovacuum_naptime='5 s'",
|
||||
],
|
||||
)
|
||||
assert endpoint.safe_psql("select count(*) from t") == [(1,)]
|
||||
|
||||
# Ok, consume
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
|
||||
# Install extension containing function needed for test
|
||||
cur.execute("CREATE EXTENSION neon_test_utils")
|
||||
|
||||
# Advance nextXid close to 2 billion XIDs
|
||||
while True:
|
||||
xid = int(query_scalar(cur, "SELECT txid_current()"))
|
||||
log.info(f"xid now {xid}")
|
||||
# Consume 10k transactons at a time until we get to 2^31 - 200k
|
||||
if xid < 2 * 1024 * 1024 * 1024 - 100000:
|
||||
cur.execute("select test_consume_xids(50000);")
|
||||
elif xid < 2 * 1024 * 1024 * 1024 - 10000:
|
||||
cur.execute("select test_consume_xids(5000);")
|
||||
else:
|
||||
break
|
||||
|
||||
# Run a bunch of real INSERTs to cross over the 2 billion mark
|
||||
# Use a begin-exception block to have a separate sub-XID for each insert.
|
||||
cur.execute(
|
||||
"""
|
||||
do $$
|
||||
begin
|
||||
for i in 1..10000 loop
|
||||
-- Use a begin-exception block to generate a new subtransaction on each iteration
|
||||
begin
|
||||
insert into t values (i);
|
||||
exception when others then
|
||||
raise 'not expected %', sqlerrm;
|
||||
end;
|
||||
end loop;
|
||||
end;
|
||||
$$;
|
||||
"""
|
||||
)
|
||||
# A checkpoint writes a WAL record with xl_xid=0. Many other WAL
|
||||
# records would have the same effect.
|
||||
cur.execute("checkpoint")
|
||||
|
||||
# wait until pageserver receives that data
|
||||
wait_for_wal_insert_lsn(env, endpoint, tenant, timeline)
|
||||
|
||||
# Restart endpoint
|
||||
endpoint.stop()
|
||||
endpoint.start()
|
||||
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT count(*) from t")
|
||||
assert cur.fetchone() == (10000 + 1,)
|
||||
|
||||
@@ -216,8 +216,14 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
|
||||
log.info(f"group: {m.group(1)}")
|
||||
return int(m.group(1), 16)
|
||||
|
||||
assert neon_env_builder.pageserver_remote_storage is not None
|
||||
pre_upgrade_keys = list(
|
||||
[o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
|
||||
[
|
||||
o["Key"]
|
||||
for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[
|
||||
"Contents"
|
||||
]
|
||||
]
|
||||
)
|
||||
for key in pre_upgrade_keys:
|
||||
assert parse_generation_suffix(key) is None
|
||||
@@ -232,7 +238,12 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
|
||||
legacy_objects: list[str] = []
|
||||
suffixed_objects = []
|
||||
post_upgrade_keys = list(
|
||||
[o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
|
||||
[
|
||||
o["Key"]
|
||||
for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[
|
||||
"Contents"
|
||||
]
|
||||
]
|
||||
)
|
||||
for key in post_upgrade_keys:
|
||||
log.info(f"post-upgrade key: {key}")
|
||||
|
||||
@@ -504,7 +504,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)
|
||||
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
|
||||
@@ -75,7 +75,7 @@ def test_tenant_delete_smoke(
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
|
||||
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -96,7 +96,7 @@ def test_tenant_delete_smoke(
|
||||
assert not tenant_path.exists()
|
||||
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -207,7 +207,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -268,7 +268,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
|
||||
# Check remote is empty
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -304,7 +304,7 @@ def test_tenant_delete_is_resumed_on_attach(
|
||||
|
||||
# sanity check, data should be there
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -343,7 +343,7 @@ def test_tenant_delete_is_resumed_on_attach(
|
||||
)
|
||||
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -378,7 +378,7 @@ def test_tenant_delete_is_resumed_on_attach(
|
||||
|
||||
ps_http.deletion_queue_flush(execute=True)
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -543,7 +543,7 @@ def test_tenant_delete_concurrent(
|
||||
|
||||
# Physical deletion should have happened
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -788,6 +788,9 @@ def test_tenant_delete_races_timeline_creation(
|
||||
".*POST.*Cancelled request finished with an error: InternalServerError\\(.*ancelled"
|
||||
)
|
||||
|
||||
# This can occur sometimes.
|
||||
CONFLICT_MESSAGE = ".*Precondition failed: Invalid state Stopping. Expected Active or Broken.*"
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# lucky race with stopping from flushing a layer we fail to schedule any uploads
|
||||
@@ -796,6 +799,9 @@ def test_tenant_delete_races_timeline_creation(
|
||||
".*POST.*/timeline.* request was dropped before completing",
|
||||
# Timeline creation runs into this error
|
||||
CANCELLED_ERROR,
|
||||
# Timeline deletion can run into this error during deletion
|
||||
CONFLICT_MESSAGE,
|
||||
".*tenant_delete_handler.*still waiting, taking longer than expected.*",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -853,9 +859,11 @@ def test_tenant_delete_races_timeline_creation(
|
||||
except PageserverApiException:
|
||||
pass
|
||||
|
||||
os.wait(4)
|
||||
|
||||
# Physical deletion should have happened
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
|
||||
@@ -191,7 +191,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
|
||||
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -275,7 +275,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
# Check remote is empty
|
||||
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -449,7 +449,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
|
||||
assert all([tl["state"] == "Active" for tl in timelines])
|
||||
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -466,7 +466,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
|
||||
)
|
||||
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -482,7 +482,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
|
||||
wait_until(
|
||||
2,
|
||||
0.5,
|
||||
lambda: assert_prefix_empty(neon_env_builder),
|
||||
lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage),
|
||||
)
|
||||
|
||||
|
||||
@@ -673,7 +673,7 @@ def test_timeline_delete_works_for_remote_smoke(
|
||||
|
||||
for timeline_id in timeline_ids:
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -690,7 +690,7 @@ def test_timeline_delete_works_for_remote_smoke(
|
||||
timeline_delete_wait_completed(ps_http, tenant_id=tenant_id, timeline_id=timeline_id)
|
||||
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -703,7 +703,7 @@ def test_timeline_delete_works_for_remote_smoke(
|
||||
|
||||
# for some reason the check above doesnt immediately take effect for the below.
|
||||
# Assume it is mock server inconsistency and check twice.
|
||||
wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder))
|
||||
wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage))
|
||||
|
||||
|
||||
def test_delete_orphaned_objects(
|
||||
@@ -791,7 +791,7 @@ def test_timeline_delete_resumed_on_attach(
|
||||
last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
|
||||
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -839,7 +839,7 @@ def test_timeline_delete_resumed_on_attach(
|
||||
assert reason.endswith(f"failpoint: {failpoint}"), reason
|
||||
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
@@ -870,7 +870,7 @@ def test_timeline_delete_resumed_on_attach(
|
||||
assert not tenant_path.exists()
|
||||
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
|
||||
|
||||
@@ -117,6 +118,8 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
|
||||
# Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK
|
||||
# record.
|
||||
#
|
||||
# FIXME: This test is broken
|
||||
@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/6412#issuecomment-1902072541")
|
||||
def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
|
||||
|
||||
@@ -33,13 +33,19 @@ from fixtures.neon_fixtures import (
|
||||
last_flush_lsn_upload,
|
||||
)
|
||||
from fixtures.pageserver.utils import (
|
||||
assert_prefix_empty,
|
||||
assert_prefix_not_empty,
|
||||
timeline_delete_wait_completed,
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload,
|
||||
)
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.port_distributor import PortDistributor
|
||||
from fixtures.remote_storage import RemoteStorageKind, default_remote_storage
|
||||
from fixtures.remote_storage import (
|
||||
RemoteStorageKind,
|
||||
default_remote_storage,
|
||||
s3_storage,
|
||||
)
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import get_dir_size, query_scalar, start_in_background
|
||||
|
||||
@@ -118,7 +124,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
|
||||
with env.pageserver.http_client() as pageserver_http:
|
||||
timeline_details = [
|
||||
pageserver_http.timeline_detail(
|
||||
tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name]
|
||||
tenant_id=tenant_id,
|
||||
timeline_id=branch_names_to_timeline_ids[branch_name],
|
||||
)
|
||||
for branch_name in branch_names
|
||||
]
|
||||
@@ -457,10 +464,19 @@ def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId,
|
||||
|
||||
def test_wal_backup(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
|
||||
remote_storage_kind = s3_storage()
|
||||
neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# These are expected after timeline deletion on safekeepers.
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*Timeline .* was not found in global map.*",
|
||||
".*Timeline .* was cancelled and cannot be used anymore.*",
|
||||
]
|
||||
)
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_backup")
|
||||
endpoint = env.endpoints.create_start("test_safekeepers_wal_backup")
|
||||
@@ -488,7 +504,8 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder):
|
||||
# put one of safekeepers down again
|
||||
env.safekeepers[0].stop()
|
||||
# restart postgres
|
||||
endpoint.stop_and_destroy().create_start("test_safekeepers_wal_backup")
|
||||
endpoint.stop()
|
||||
endpoint = env.endpoints.create_start("test_safekeepers_wal_backup")
|
||||
# and ensure offloading still works
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -498,6 +515,17 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder):
|
||||
partial(is_segment_offloaded, env.safekeepers[1], tenant_id, timeline_id, seg_end),
|
||||
f"segment ending at {seg_end} get offloaded",
|
||||
)
|
||||
env.safekeepers[0].start()
|
||||
endpoint.stop()
|
||||
|
||||
# Test that after timeline deletion remote objects are gone.
|
||||
prefix = "/".join([str(tenant_id), str(timeline_id)])
|
||||
assert_prefix_not_empty(neon_env_builder.safekeepers_remote_storage, prefix)
|
||||
|
||||
for sk in env.safekeepers:
|
||||
sk_http = sk.http_client()
|
||||
sk_http.timeline_delete(tenant_id, timeline_id)
|
||||
assert_prefix_empty(neon_env_builder.safekeepers_remote_storage, prefix)
|
||||
|
||||
|
||||
def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
|
||||
@@ -586,7 +614,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
|
||||
# advancing peer_horizon_lsn.
|
||||
for sk in env.safekeepers:
|
||||
cli = sk.http_client()
|
||||
cli.timeline_delete_force(tenant_id, timeline_id)
|
||||
cli.timeline_delete(tenant_id, timeline_id, only_local=True)
|
||||
# restart safekeeper to clear its in-memory state
|
||||
sk.stop()
|
||||
# wait all potenital in flight pushes to broker arrive before starting
|
||||
@@ -1623,7 +1651,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
endpoint_3.stop_and_destroy()
|
||||
|
||||
# Remove initial tenant's br1 (active)
|
||||
assert sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"]
|
||||
assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"]
|
||||
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
|
||||
assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
|
||||
assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
|
||||
@@ -1631,7 +1659,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
|
||||
|
||||
# Ensure repeated deletion succeeds
|
||||
assert not sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"]
|
||||
assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"]
|
||||
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
|
||||
assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
|
||||
assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
|
||||
@@ -1642,13 +1670,13 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
# Ensure we cannot delete the other tenant
|
||||
for sk_h in [sk_http, sk_http_noauth]:
|
||||
with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"):
|
||||
assert sk_h.timeline_delete_force(tenant_id_other, timeline_id_other)
|
||||
assert sk_h.timeline_delete(tenant_id_other, timeline_id_other)
|
||||
with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"):
|
||||
assert sk_h.tenant_delete_force(tenant_id_other)
|
||||
assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
|
||||
|
||||
# Remove initial tenant's br2 (inactive)
|
||||
assert sk_http.timeline_delete_force(tenant_id, timeline_id_2)["dir_existed"]
|
||||
assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"]
|
||||
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
|
||||
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists()
|
||||
assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
|
||||
@@ -1656,7 +1684,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
|
||||
|
||||
# Remove non-existing branch, should succeed
|
||||
assert not sk_http.timeline_delete_force(tenant_id, TimelineId("00" * 16))["dir_existed"]
|
||||
assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"]
|
||||
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
|
||||
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists()
|
||||
assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists()
|
||||
|
||||
@@ -6,7 +6,7 @@ commands:
|
||||
sysvInitAction: sysinit
|
||||
shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
|
||||
- name: pgbouncer
|
||||
user: nobody
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
|
||||
- name: postgres-exporter
|
||||
@@ -36,7 +36,9 @@ files:
|
||||
max_client_conn=10000
|
||||
default_pool_size=64
|
||||
max_prepared_statements=0
|
||||
admin_users=cloud_admin
|
||||
admin_users=postgres
|
||||
unix_socket_dir=/tmp/
|
||||
unix_socket_mode=0777
|
||||
- filename: cgconfig.conf
|
||||
content: |
|
||||
# Configuration for cgroups in VM compute nodes
|
||||
@@ -198,7 +200,7 @@ merge: |
|
||||
|
||||
RUN set -e \
|
||||
&& chown postgres:postgres /etc/pgbouncer.ini \
|
||||
&& chmod 0644 /etc/pgbouncer.ini \
|
||||
&& chmod 0666 /etc/pgbouncer.ini \
|
||||
&& chmod 0644 /etc/cgconfig.conf \
|
||||
&& chmod 0644 /etc/sql_exporter.yml \
|
||||
&& chmod 0644 /etc/neon_collector.yml
|
||||
|
||||
Reference in New Issue
Block a user