Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-21 20:32:56 +00:00)

Compare commits: release-41 ... jcsp/disk- (9 commits)
| Author | SHA1 | Date |
|---|---|---|
| | ed3e3b6f61 | |
| | 098ef0956b | |
| | 127837abb0 | |
| | b2c96047d0 | |
| | 44202eeb3b | |
| | 4bef977c56 | |
| | a0b862a8bd | |
| | 767ef29390 | |
| | a8a800af51 | |
.github/workflows/build_and_test.yml (vendored, 12 changed lines)
@@ -338,6 +338,16 @@ jobs:
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
           ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export REMOTE_STORAGE_AZURE_CONTAINER=neon-github-sandbox
+          export REMOTE_STORAGE_AZURE_REGION=eastus2
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+
       - name: Install rust binaries
         run: |
           # Install target binaries
@@ -837,7 +847,7 @@ jobs:
     run:
       shell: sh -eu {0}
     env:
-      VM_BUILDER_VERSION: v0.18.1
+      VM_BUILDER_VERSION: v0.18.2

     steps:
       - name: Checkout
@@ -22,9 +22,9 @@ use postgres_ffi::Oid;
 /// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
 ///
 // FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
-// Then we could replace the custom Ord and PartialOrd implementations below with
-// deriving them.
-#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
+// Then we could replace the custom Ord and PartialOrd implementations below with
+// deriving them. This will require changes in walredoproc.c.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
 pub struct RelTag {
     pub forknum: u8,
     pub spcnode: Oid,
@@ -40,21 +40,9 @@ impl PartialOrd for RelTag {

 impl Ord for RelTag {
     fn cmp(&self, other: &Self) -> Ordering {
-        let mut cmp = self.spcnode.cmp(&other.spcnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.dbnode.cmp(&other.dbnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.relnode.cmp(&other.relnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.forknum.cmp(&other.forknum);
-
-        cmp
+        // Custom ordering where we put forknum to the end of the list
+        let other_tup = (other.spcnode, other.dbnode, other.relnode, other.forknum);
+        (self.spcnode, self.dbnode, self.relnode, self.forknum).cmp(&other_tup)
     }
 }

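The new implementation collapses the hand-rolled field-by-field comparison into a single tuple comparison, which is equivalent because tuples compare lexicographically. A minimal standalone sketch of the same technique (the struct and field names here only mirror the diff, this is an illustration, not the pageserver's code):

```rust
use std::cmp::Ordering;

// Illustrative stand-in for RelTag: forknum is declared first but compared last.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
struct Tag {
    forknum: u8,
    spcnode: u32,
    dbnode: u32,
    relnode: u32,
}

impl Ord for Tag {
    fn cmp(&self, other: &Self) -> Ordering {
        // Same ordering as the old spcnode -> dbnode -> relnode -> forknum chain.
        (self.spcnode, self.dbnode, self.relnode, self.forknum)
            .cmp(&(other.spcnode, other.dbnode, other.relnode, other.forknum))
    }
}

impl PartialOrd for Tag {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let a = Tag { forknum: 1, spcnode: 1, dbnode: 1, relnode: 1 };
    let b = Tag { forknum: 0, spcnode: 1, dbnode: 1, relnode: 2 };
    // relnode decides the ordering before forknum is ever consulted.
    assert!(a < b);
}
```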
@@ -1479,6 +1479,8 @@ threshold = "20m"
         Some(DiskUsageEvictionTaskConfig {
             max_usage_pct: Percent::new(80).unwrap(),
             min_avail_bytes: 0,
+            target_avail_bytes: None,
+            target_usage_pct: None,
             period: Duration::from_secs(10),
             #[cfg(feature = "testing")]
             mock_statvfs: None,
@@ -67,16 +67,40 @@ use crate::{
 pub struct DiskUsageEvictionTaskConfig {
     pub max_usage_pct: Percent,
     pub min_avail_bytes: u64,
+
+    // Control how far we will go when evicting: when usage exceeds max_usage_pct or min_avail_bytes,
+    // we will keep evicting layers until we reach the target. The resulting disk usage should look
+    // like a sawtooth bouncing between the upper max/min line and the lower target line.
+    #[serde(default)]
+    pub target_usage_pct: Option<Percent>,
+    #[serde(default)]
+    pub target_avail_bytes: Option<u64>,
+
     #[serde(with = "humantime_serde")]
     pub period: Duration,
     #[cfg(feature = "testing")]
     pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
 }

+#[derive(Default)]
+enum Status {
+    /// We are within disk limits, and not currently doing any eviction
+    #[default]
+    Idle,
+    /// Disk limits have been exceeded: we will evict soon
+    UnderPressure,
+    /// We are currently doing an eviction pass.
+    Evicting,
+}
+
 #[derive(Default)]
 pub struct State {
     /// Exclude http requests and background task from running at the same time.
     mutex: tokio::sync::Mutex<()>,
+
+    /// Publish the current status of eviction work, for visibility to other subsystems
+    /// that modify their behavior if disk pressure is high or if eviction is going on.
+    status: std::sync::RwLock<Status>,
 }

 pub fn launch_disk_usage_global_eviction_task(
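Both new fields are `#[serde(default)]`, so existing configurations that omit them keep deserializing and the task then behaves as before (evicting only back down to the max/min threshold). A hedged sketch of how such a struct deserializes, using a standalone stand-in rather than the pageserver's actual config loader; it assumes the serde, serde_json and humantime_serde crates and uses a plain `u8` in place of the `Percent` newtype:

```rust
use std::time::Duration;

use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct EvictionConfigSketch {
    max_usage_pct: u8, // stand-in for the Percent newtype
    min_avail_bytes: u64,
    #[serde(default)]
    target_usage_pct: Option<u8>,
    #[serde(default)]
    target_avail_bytes: Option<u64>,
    #[serde(with = "humantime_serde")]
    period: Duration,
}

fn main() {
    // Omitting the target fields is valid: they default to None.
    let cfg: EvictionConfigSketch = serde_json::from_str(
        r#"{ "max_usage_pct": 80, "min_avail_bytes": 0, "period": "10s" }"#,
    )
    .unwrap();
    assert!(cfg.target_usage_pct.is_none() && cfg.target_avail_bytes.is_none());
    println!("{cfg:?}");
}
```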
@@ -176,7 +200,9 @@ async fn disk_usage_eviction_task(
 }

 pub trait Usage: Clone + Copy + std::fmt::Debug {
-    fn has_pressure(&self) -> bool;
+    fn pressure(&self) -> f64;
+    fn over_pressure(&self) -> bool;
+    fn no_pressure(&self) -> bool;
     fn add_available_bytes(&mut self, bytes: u64);
 }

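The single `has_pressure` predicate becomes a small contract: `pressure()` returns a scalar, `over_pressure()` is the start condition (pressure at or above 1.0) and `no_pressure()` is the stop condition (pressure at or below 0.0). A toy model of that contract, showing how an eviction loop can drive a usage value from above the upper line all the way down to the target rather than stopping just under the limit (everything except the method names is invented for illustration):

```rust
// Toy model of the Usage contract: not the pageserver's filesystem-backed impl.
#[derive(Clone, Copy, Debug)]
struct ToyUsage {
    used: u64,
    max: u64,    // eviction starts when used reaches max
    target: u64, // eviction stops once used is back at target
}

impl ToyUsage {
    fn pressure(&self) -> f64 {
        if self.used <= self.target {
            0.0
        } else {
            (self.used - self.target) as f64 / (self.max - self.target) as f64
        }
    }
    fn over_pressure(&self) -> bool {
        self.pressure() >= 1.0
    }
    fn no_pressure(&self) -> bool {
        self.pressure() <= 0.0
    }
    fn add_available_bytes(&mut self, bytes: u64) {
        self.used = self.used.saturating_sub(bytes);
    }
}

fn main() {
    let mut u = ToyUsage { used: 95, max: 90, target: 80 };
    assert!(u.over_pressure()); // above the upper line: start evicting
    while !u.no_pressure() {
        u.add_available_bytes(1); // pretend each evicted layer frees one unit
    }
    assert_eq!(u.used, 80); // we kept going down to the target, not just below max
}
```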
@@ -189,13 +215,19 @@ async fn disk_usage_eviction_task_iteration(
 ) -> anyhow::Result<()> {
     let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
         .context("get filesystem-level disk usage before evictions")?;

+    if usage_pre.over_pressure() {
+        *state.status.write().unwrap() = Status::Evicting;
+    }
+
     let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
     match res {
         Ok(outcome) => {
             debug!(?outcome, "disk_usage_eviction_iteration finished");
-            match outcome {
+            let new_status = match outcome {
                 IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
                     // nothing to do, select statement below will handle things
+                    Status::Idle
                 }
                 IterationOutcome::Finished(outcome) => {
                     // Verify with statvfs whether we made any real progress
@@ -205,21 +237,30 @@ async fn disk_usage_eviction_task_iteration(

                     debug!(?after, "disk usage");

-                    if after.has_pressure() {
+                    if after.over_pressure() {
                         // Don't bother doing an out-of-order iteration here now.
                         // In practice, the task period is set to a value in the tens-of-seconds range,
                         // which will cause another iteration to happen soon enough.
                         // TODO: deltas between the three different usages would be helpful,
                         // consider MiB, GiB, TiB
                         warn!(?outcome, ?after, "disk usage still high");
+                        Status::UnderPressure
+                    } else {
+                        info!(?outcome, ?after, "disk usage pressure relieved");
+                        Status::Idle
                     }
                 }
-            }
+            };

+            *state.status.write().unwrap() = new_status;
         }
         Err(e) => {
             error!("disk_usage_eviction_iteration failed: {:#}", e);
+            *state.status.write().unwrap() = if usage_pre.over_pressure() {
+                Status::UnderPressure
+            } else {
+                Status::Idle
+            };
         }
     }

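The published `Status` gives other subsystems a cheap way to see whether eviction is idle, imminent, or running, without touching the eviction mutex; reading it is just an `RwLock` read. A small sketch with a stand-in enum, since the real `Status` is private to the task module:

```rust
use std::sync::RwLock;

#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
enum Status {
    #[default]
    Idle,
    UnderPressure,
    Evicting,
}

// Hypothetical consumer: back off on non-essential disk work while eviction is active.
fn should_defer_background_work(status: &RwLock<Status>) -> bool {
    !matches!(*status.read().unwrap(), Status::Idle)
}

fn main() {
    let status = RwLock::new(Status::default());
    assert!(!should_defer_background_work(&status));
    *status.write().unwrap() = Status::Evicting;
    assert!(should_defer_background_work(&status));
}
```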
@@ -285,8 +326,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

     debug!(?usage_pre, "disk usage");

-    if !usage_pre.has_pressure() {
+    if !usage_pre.over_pressure() {
         return Ok(IterationOutcome::NoPressure);
+    } else {
+        *state.status.write().unwrap() = Status::Evicting;
     }

     warn!(
@@ -334,7 +377,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     let mut warned = None;
     let mut usage_planned = usage_pre;
     for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
-        if !usage_planned.has_pressure() {
+        if usage_planned.no_pressure() {
             debug!(
                 no_candidates_evicted = i,
                 "took enough candidates for pressure to be relieved"
@@ -644,22 +687,57 @@ mod filesystem_level_usage {
     }

     impl super::Usage for Usage<'_> {
-        fn has_pressure(&self) -> bool {
-            let usage_pct =
-                (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
+        /// Does the pressure exceed 1.0, i.e. has the disk usage exceeded upper bounds?
+        ///
+        /// This is the condition for starting eviction.
+        fn over_pressure(&self) -> bool {
+            self.pressure() >= 1.0
+        }

-            let pressures = [
-                (
-                    "min_avail_bytes",
-                    self.avail_bytes < self.config.min_avail_bytes,
-                ),
-                (
-                    "max_usage_pct",
-                    usage_pct >= self.config.max_usage_pct.get() as u64,
-                ),
-            ];
+        /// Is the pressure <0, i.e. has disk usage gone below the target bound?
+        ///
+        /// This is the condition for dropping out of eviction.
+        fn no_pressure(&self) -> bool {
+            self.pressure() <= 0.0
+        }

-            pressures.into_iter().any(|(_, has_pressure)| has_pressure)
+        fn pressure(&self) -> f64 {
+            let max_usage = std::cmp::min(
+                self.total_bytes - self.config.min_avail_bytes,
+                (self.total_bytes as f64 * (self.config.max_usage_pct.get() as f64 / 100.0)) as u64,
+            );
+
+            let mut target_usage = max_usage;
+            if let Some(target_avail_bytes) = self.config.target_avail_bytes {
+                target_usage = std::cmp::min(target_usage, self.total_bytes - target_avail_bytes);
+            }
+            if let Some(target_usage_pct) = self.config.target_usage_pct {
+                target_usage = std::cmp::min(
+                    target_usage,
+                    (self.total_bytes as f64 * (target_usage_pct.get() as f64 / 100.0)) as u64,
+                );
+            };
+
+            let usage = self.total_bytes - self.avail_bytes;
+            eprintln!(
+                "pressure: {} {}, current {}",
+                target_usage, max_usage, usage
+            );
+            if target_usage == max_usage {
+                // We are configured with a zero sized range: treat anything at+beyond limit as pressure 1.0, else 0.0
+                if usage >= max_usage {
+                    1.0
+                } else {
+                    0.0
+                }
+            } else if usage <= target_usage {
+                // No pressure.
+                0.0
+            } else {
+                // We are above target: pressure is the ratio of how much we exceed target to the size of the gap
+                let range_size = (max_usage - target_usage) as f64;
+                (usage - target_usage) as f64 / range_size
+            }
        }

         fn add_available_bytes(&mut self, bytes: u64) {
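The new `pressure()` maps current usage onto the band between `target_usage` and `max_usage`: 0.0 at or below the target, 1.0 at or above the max, and linear in between. A worked example with round numbers (the figures below are illustrative, not defaults):

```rust
// Same arithmetic as the diff's pressure(), with concrete numbers:
// 100 GiB disk, max_usage_pct = 85, target_usage_pct = 75, no byte-based limits.
fn pressure(total: u64, avail: u64, max_usage: u64, target_usage: u64) -> f64 {
    let usage = total - avail;
    if target_usage == max_usage {
        return if usage >= max_usage { 1.0 } else { 0.0 };
    }
    if usage <= target_usage {
        0.0
    } else {
        (usage - target_usage) as f64 / (max_usage - target_usage) as f64
    }
}

fn main() {
    const GIB: u64 = 1 << 30;
    let total = 100 * GIB;
    let max_usage = 85 * GIB; // 85% of total
    let target_usage = 75 * GIB; // 75% of total

    // At 90 GiB used: (90 - 75) / (85 - 75) = 1.5 -> over_pressure, start evicting.
    assert_eq!(pressure(total, 10 * GIB, max_usage, target_usage), 1.5);
    // At 80 GiB used: (80 - 75) / (85 - 75) = 0.5 -> keep evicting, not done yet.
    assert_eq!(pressure(total, 20 * GIB, max_usage, target_usage), 0.5);
    // At 75 GiB used: pressure 0.0 -> no_pressure, the eviction pass stops here.
    assert_eq!(pressure(total, 25 * GIB, max_usage, target_usage), 0.0);
}
```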
@@ -713,6 +791,8 @@ mod filesystem_level_usage {
             config: &DiskUsageEvictionTaskConfig {
                 max_usage_pct: Percent::new(85).unwrap(),
                 min_avail_bytes: 0,
+                target_avail_bytes: None,
+                target_usage_pct: None,
                 period: Duration::MAX,
                 #[cfg(feature = "testing")]
                 mock_statvfs: None,
@@ -721,24 +801,24 @@ mod filesystem_level_usage {
             avail_bytes: 0,
         };

-        assert!(usage.has_pressure(), "expected pressure at 100%");
+        assert!(usage.over_pressure(), "expected pressure at 100%");

         usage.add_available_bytes(14_000);
-        assert!(usage.has_pressure(), "expected pressure at 86%");
+        assert!(usage.over_pressure(), "expected pressure at 86%");

         usage.add_available_bytes(999);
-        assert!(usage.has_pressure(), "expected pressure at 85.001%");
+        assert!(usage.over_pressure(), "expected pressure at 85.001%");

         usage.add_available_bytes(1);
-        assert!(usage.has_pressure(), "expected pressure at precisely 85%");
+        assert!(usage.over_pressure(), "expected pressure at precisely 85%");

         usage.add_available_bytes(1);
-        assert!(!usage.has_pressure(), "no pressure at 84.999%");
+        assert!(!usage.over_pressure(), "no pressure at 84.999%");

         usage.add_available_bytes(999);
-        assert!(!usage.has_pressure(), "no pressure at 84%");
+        assert!(!usage.over_pressure(), "no pressure at 84%");

         usage.add_available_bytes(16_000);
-        assert!(!usage.has_pressure());
+        assert!(!usage.over_pressure());
     }
 }

@@ -1452,10 +1452,22 @@ async fn disk_usage_eviction_run(
 }

 impl crate::disk_usage_eviction_task::Usage for Usage {
-    fn has_pressure(&self) -> bool {
+    fn over_pressure(&self) -> bool {
         self.config.evict_bytes > self.freed_bytes
     }

+    fn no_pressure(&self) -> bool {
+        !self.over_pressure()
+    }
+
+    fn pressure(&self) -> f64 {
+        if self.over_pressure() {
+            1.0
+        } else {
+            0.0
+        }
+    }
+
     fn add_available_bytes(&mut self, bytes: u64) {
         self.freed_bytes += bytes;
     }

@@ -675,9 +675,8 @@ impl Timeline {

         result.add_key(CONTROLFILE_KEY);
         result.add_key(CHECKPOINT_KEY);
-        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
-            result.add_key(AUX_FILES_KEY);
-        }
+        result.add_key(AUX_FILES_KEY);

         Ok(result.to_keyspace())
     }

@@ -60,6 +60,8 @@ pub(super) async fn upload_timeline_layer<'a>(
         bail!("failpoint before-upload-layer")
     });

+    pausable_failpoint!("before-upload-layer-pausable");
+
     let storage_path = remote_path(conf, source_path, generation)?;
     let source_file_res = fs::File::open(&source_path).await;
     let source_file = match source_file_res {

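`pausable_failpoint!` is a neon-internal macro, but the underlying mechanism is the `fail` crate's `pause` action: a thread that reaches the failpoint blocks until the failpoint is reconfigured, which is exactly what the test below leans on. A rough standalone sketch, assuming the `fail` crate with its `failpoints` feature enabled; the failpoint name is reused from the diff purely for illustration:

```rust
use std::{thread, time::Duration};

fn upload_layer() {
    // With the "pause" action configured, execution blocks here until the
    // failpoint is switched off.
    fail::fail_point!("before-upload-layer-pausable");
    println!("upload proceeding");
}

fn main() {
    fail::cfg("before-upload-layer-pausable", "pause").unwrap();

    let handle = thread::spawn(upload_layer);

    // The "upload" is now parked on the failpoint; do other work, then release it.
    thread::sleep(Duration::from_millis(100));
    fail::cfg("before-upload-layer-pausable", "off").unwrap();

    handle.join().unwrap();
}
```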
@@ -2793,10 +2793,13 @@ impl Timeline {
             )
         };

+        let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
+        let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
+
         // The new on-disk layers are now in the layer map. We can remove the
         // in-memory layer from the map now. The flushed layer is stored in
         // the mapping in `create_delta_layer`.
-        {
+        let metadata = {
             let mut guard = self.layers.write().await;

             if let Some(ref l) = delta_layer_to_add {
@@ -2812,8 +2815,17 @@ impl Timeline {
             }

             guard.finish_flush_l0_layer(delta_layer_to_add, &frozen_layer);
+            if disk_consistent_lsn != old_disk_consistent_lsn {
+                assert!(disk_consistent_lsn > old_disk_consistent_lsn);
+                self.disk_consistent_lsn.store(disk_consistent_lsn);
+
+                // Schedule remote uploads that will reflect our new disk_consistent_lsn
+                Some(self.schedule_uploads(disk_consistent_lsn, layer_paths_to_upload)?)
+            } else {
+                None
+            }
             // release lock on 'layers'
-        }
+        };

         // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
         // a compaction can delete the file and then it won't be available for uploads any more.
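The shape of this change is: compute the new metadata while still holding the `layers` write lock, return it out of that block, and only then do the slow `save_metadata` write with the lock released. A minimal tokio-based sketch of that pattern, with made-up types standing in for the layer map and metadata:

```rust
use std::sync::Arc;
use tokio::sync::RwLock;

#[derive(Debug, Clone)]
struct Metadata {
    disk_consistent_lsn: u64,
}

async fn persist(metadata: &Metadata) {
    // Stand-in for save_metadata(): slow I/O that must not run under the lock.
    tokio::time::sleep(std::time::Duration::from_millis(10)).await;
    println!("persisted {:?}", metadata);
}

async fn flush(layers: Arc<RwLock<Vec<String>>>, new_lsn: u64, old_lsn: u64) {
    // Compute the value under the lock, but only *return* it from the block...
    let metadata = {
        let mut guard = layers.write().await;
        guard.push(format!("layer@{new_lsn}"));
        if new_lsn != old_lsn {
            Some(Metadata { disk_consistent_lsn: new_lsn })
        } else {
            None
        }
        // write lock released here
    };

    // ...and do the expensive persistence outside of it.
    if let Some(metadata) = metadata {
        persist(&metadata).await;
    }
}

#[tokio::main]
async fn main() {
    let layers = Arc::new(RwLock::new(Vec::new()));
    flush(layers, 42, 0).await;
}
```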
@@ -2829,28 +2841,22 @@ impl Timeline {
         //
         // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
         // *all* the layers, to avoid fsyncing the file multiple times.
-        let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
-        let old_disk_consistent_lsn = self.disk_consistent_lsn.load();

-        // If we were able to advance 'disk_consistent_lsn', save it the metadata file.
-        // After crash, we will restart WAL streaming and processing from that point.
-        if disk_consistent_lsn != old_disk_consistent_lsn {
-            assert!(disk_consistent_lsn > old_disk_consistent_lsn);
-            self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload)
+        // If we updated our disk_consistent_lsn, persist the updated metadata to local disk.
+        if let Some(metadata) = metadata {
+            save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
                 .await
-                .context("update_metadata_file")?;
-            // Also update the in-memory copy
-            self.disk_consistent_lsn.store(disk_consistent_lsn);
+                .context("save_metadata")?;
         }
         Ok(())
     }

     /// Update metadata file
-    async fn update_metadata_file(
+    fn schedule_uploads(
         &self,
         disk_consistent_lsn: Lsn,
         layer_paths_to_upload: HashMap<LayerFileName, LayerFileMetadata>,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<TimelineMetadata> {
         // We can only save a valid 'prev_record_lsn' value on disk if we
         // flushed *all* in-memory changes to disk. We only track
         // 'prev_record_lsn' in memory for the latest processed record, so we
@@ -2887,10 +2893,6 @@ impl Timeline {
             x.unwrap()
         ));

-        save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
-            .await
-            .context("save_metadata")?;
-
         if let Some(remote_client) = &self.remote_client {
             for (path, layer_metadata) in layer_paths_to_upload {
                 remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
@@ -2898,6 +2900,20 @@ impl Timeline {
                 remote_client.schedule_index_upload_for_metadata_update(&metadata)?;
             }

+        Ok(metadata)
+    }
+
+    async fn update_metadata_file(
+        &self,
+        disk_consistent_lsn: Lsn,
+        layer_paths_to_upload: HashMap<LayerFileName, LayerFileMetadata>,
+    ) -> anyhow::Result<()> {
+        let metadata = self.schedule_uploads(disk_consistent_lsn, layer_paths_to_upload)?;
+
+        save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
+            .await
+            .context("save_metadata")?;
+
         Ok(())
     }

@@ -3,7 +3,9 @@ mod hacks;
 mod link;

 pub use link::LinkAuthError;
+use tokio_postgres::config::AuthKeys;

+use crate::proxy::{handle_try_wake, retry_after};
 use crate::{
     auth::{self, ClientCredentials},
     config::AuthenticationConfig,
@@ -16,8 +18,9 @@ use crate::{
 };
 use futures::TryFutureExt;
 use std::borrow::Cow;
+use std::ops::ControlFlow;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::info;
+use tracing::{error, info, warn};

 /// A product of successful authentication.
 pub struct AuthSuccess<T> {
@@ -117,22 +120,27 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
     }
 }

+pub enum ComputeCredentials {
+    Password(Vec<u8>),
+    AuthKeys(AuthKeys),
+}
+
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
-async fn auth_quirks(
+async fn auth_quirks_creds(
     api: &impl console::Api,
     extra: &ConsoleReqExtra<'_>,
     creds: &mut ClientCredentials<'_>,
     client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
     allow_cleartext: bool,
     config: &'static AuthenticationConfig,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
     // If there's no project so far, that entails that client doesn't
     // support SNI or other means of passing the endpoint (project) name.
     // We now expect to see a very specific payload in the place of password.
     if creds.project.is_none() {
         // Password will be checked by the compute node later.
-        return hacks::password_hack(api, extra, creds, client).await;
+        return hacks::password_hack(creds, client).await;
     }

     // Password hack should set the project name.
@@ -143,13 +151,55 @@ async fn auth_quirks(
     // Currently, we use it for websocket connections (latency).
     if allow_cleartext {
         // Password will be checked by the compute node later.
-        return hacks::cleartext_hack(api, extra, creds, client).await;
+        return hacks::cleartext_hack(client).await;
     }

     // Finally, proceed with the main auth flow (SCRAM-based).
     classic::authenticate(api, extra, creds, client, config).await
 }

+/// True to its name, this function encapsulates our current auth trade-offs.
+/// Here, we choose the appropriate auth flow based on circumstances.
+async fn auth_quirks(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    allow_cleartext: bool,
+    config: &'static AuthenticationConfig,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    let auth_stuff = auth_quirks_creds(api, extra, creds, client, allow_cleartext, config).await?;
+
+    let mut num_retries = 0;
+    let mut node = loop {
+        let wake_res = api.wake_compute(extra, creds).await;
+        match handle_try_wake(wake_res, num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                return Err(e.into());
+            }
+            Ok(ControlFlow::Continue(e)) => {
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+            }
+            Ok(ControlFlow::Break(n)) => break n,
+        }
+
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;
+        tokio::time::sleep(wait_duration).await;
+    };
+
+    match auth_stuff.value {
+        ComputeCredentials::Password(password) => node.config.password(password),
+        ComputeCredentials::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
+    };
+
+    Ok(AuthSuccess {
+        reported_auth_ok: auth_stuff.reported_auth_ok,
+        value: node,
+    })
+}
+
 impl BackendType<'_, ClientCredentials<'_>> {
     /// Get compute endpoint name from the credentials.
     pub fn get_endpoint(&self) -> Option<String> {
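`handle_try_wake` and `retry_after` are proxy-internal helpers whose bodies are not shown here, but the loop shape is a generic retry-with-backoff driven by `ControlFlow`. A hedged standalone sketch of the same pattern, with an invented fallible operation and an invented backoff schedule:

```rust
use std::ops::ControlFlow;
use std::time::Duration;

// Invented transient operation: succeeds on the third attempt.
fn wake(attempt: u32) -> Result<&'static str, &'static str> {
    if attempt < 2 { Err("node still starting") } else { Ok("compute node handle") }
}

// Plays the role of handle_try_wake: decide between aborting, retrying, and done.
fn classify(
    res: Result<&'static str, &'static str>,
    attempt: u32,
    max_retries: u32,
) -> Result<ControlFlow<&'static str, &'static str>, &'static str> {
    match res {
        Ok(node) => Ok(ControlFlow::Break(node)),
        Err(_) if attempt >= max_retries => Err("giving up"), // non-retriable path
        Err(e) => Ok(ControlFlow::Continue(e)),               // retriable path
    }
}

// Plays the role of retry_after: simple exponential backoff.
fn retry_after(attempt: u32) -> Duration {
    Duration::from_millis(100u64.saturating_mul(1 << attempt.min(6)))
}

fn main() -> Result<(), &'static str> {
    let mut num_retries = 0;
    let node = loop {
        match classify(wake(num_retries), num_retries, 5)? {
            ControlFlow::Break(node) => break node,
            ControlFlow::Continue(err) => eprintln!("retrying after error: {err}"),
        }
        std::thread::sleep(retry_after(num_retries));
        num_retries += 1;
    };
    println!("woken: {node}");
    Ok(())
}
```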
@@ -1,17 +1,14 @@
-use std::ops::ControlFlow;
-
-use super::AuthSuccess;
+use super::{AuthSuccess, ComputeCredentials};
 use crate::{
     auth::{self, AuthFlow, ClientCredentials},
     compute,
     config::AuthenticationConfig,
-    console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
-    proxy::{handle_try_wake, retry_after},
+    console::{self, AuthInfo, ConsoleReqExtra},
     sasl, scram,
     stream::PqStream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{error, info, warn};
+use tracing::{info, warn};

 pub(super) async fn authenticate(
     api: &impl console::Api,
@@ -19,7 +16,7 @@ pub(super) async fn authenticate(
     creds: &ClientCredentials<'_>,
     client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
     config: &'static AuthenticationConfig,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
     info!("fetching user's authentication info");
     let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
         // If we don't have an authentication secret, we mock one to
@@ -66,38 +63,17 @@ pub(super) async fn authenticate(
            }
        };

-            Some(compute::ScramKeys {
+            compute::ScramKeys {
                 client_key: client_key.as_bytes(),
                 server_key: secret.server_key.as_bytes(),
-            })
+            }
         }
     };

-    let mut num_retries = 0;
-    let mut node = loop {
-        let wake_res = api.wake_compute(extra, creds).await;
-        match handle_try_wake(wake_res, num_retries) {
-            Err(e) => {
-                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
-                return Err(e.into());
-            }
-            Ok(ControlFlow::Continue(e)) => {
-                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
-            }
-            Ok(ControlFlow::Break(n)) => break n,
-        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-        tokio::time::sleep(wait_duration).await;
-    };
-    if let Some(keys) = scram_keys {
-        use tokio_postgres::config::AuthKeys;
-        node.config.auth_keys(AuthKeys::ScramSha256(keys));
-    }

     Ok(AuthSuccess {
         reported_auth_ok: false,
-        value: node,
+        value: ComputeCredentials::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
+            scram_keys,
+        )),
     })
 }
@@ -1,10 +1,6 @@
-use super::AuthSuccess;
+use super::{AuthSuccess, ComputeCredentials};
 use crate::{
     auth::{self, AuthFlow, ClientCredentials},
-    console::{
-        self,
-        provider::{CachedNodeInfo, ConsoleReqExtra},
-    },
     stream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -15,11 +11,8 @@ use tracing::{info, warn};
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
 pub async fn cleartext_hack(
-    api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
-    creds: &mut ClientCredentials<'_>,
     client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
     warn!("cleartext auth flow override is enabled, proceeding");
     let password = AuthFlow::new(client)
         .begin(auth::CleartextPassword)
@@ -27,24 +20,19 @@ pub async fn cleartext_hack(
         .authenticate()
         .await?;

-    let mut node = api.wake_compute(extra, creds).await?;
-    node.config.password(password);

     // Report tentative success; compute node will check the password anyway.
     Ok(AuthSuccess {
         reported_auth_ok: false,
-        value: node,
+        value: ComputeCredentials::Password(password),
     })
 }

 /// Workaround for clients which don't provide an endpoint (project) name.
 /// Very similar to [`cleartext_hack`], but there's a specific password format.
 pub async fn password_hack(
-    api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
-    creds: &mut ClientCredentials<'_>,
     client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
     warn!("project not specified, resorting to the password hack auth flow");
     let payload = AuthFlow::new(client)
         .begin(auth::PasswordHack)
@@ -55,12 +43,9 @@ pub async fn password_hack(
     info!(project = &payload.endpoint, "received missing parameter");
     creds.project = Some(payload.endpoint);

-    let mut node = api.wake_compute(extra, creds).await?;
-    node.config.password(payload.password);
-
     // Report tentative success; compute node will check the password anyway.
     Ok(AuthSuccess {
         reported_auth_ok: false,
-        value: node,
+        value: ComputeCredentials::Password(payload.password),
     })
 }
@@ -90,7 +90,11 @@ pub mod errors {
                 status: http::StatusCode::LOCKED,
                 ref text,
             } => {
-                !text.contains("written data quota exceeded")
+                // written data quota exceeded
+                // data transfer quota exceeded
+                // compute time quota exceeded
+                // logical size quota exceeded
+                !text.contains("quota exceeded")
                     && !text.contains("the limit for current plan reached")
             }
             // retry server errors
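The broadened check treats any "... quota exceeded" message as non-retriable instead of matching only the written-data variant. A small sketch of the predicate against the messages the new comments enumerate (the function name and the final "retriable" message are ours, not the proxy's):

```rust
// LOCKED (423) responses are retriable unless they indicate an exhausted quota or plan limit.
fn locked_response_could_retry(text: &str) -> bool {
    !text.contains("quota exceeded") && !text.contains("the limit for current plan reached")
}

fn main() {
    for msg in [
        "written data quota exceeded",
        "data transfer quota exceeded",
        "compute time quota exceeded",
        "logical size quota exceeded",
        "the limit for current plan reached",
    ] {
        assert!(!locked_response_could_retry(msg));
    }
    // Some other LOCKED message (made up here) stays retriable.
    assert!(locked_response_could_retry("project is temporarily locked"));
}
```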
@@ -486,16 +486,20 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):


 def evict_all_layers(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
-    timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
-    initial_local_layers = sorted(
-        list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
-    )
     client = env.pageserver.http_client()
-    for layer in initial_local_layers:
-        if "ephemeral" in layer.name or "temp" in layer.name:

+    layer_map = client.layer_map_info(tenant_id, timeline_id)
+
+    for layer in layer_map.historic_layers:
+        if layer.remote:
+            log.info(
+                f"Skipping trying to evict remote layer {tenant_id}/{timeline_id} {layer.layer_file_name}"
+            )
+            continue
-        log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}")
-        client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)
+        log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.layer_file_name}")
+        client.evict_layer(
+            tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.layer_file_name
+        )


 def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
@@ -757,12 +757,14 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
         create_thread.join()


-# Regression test for a race condition where L0 layers are compacted before the upload,
-# resulting in the uploading complaining about the file not being found
-# https://github.com/neondatabase/neon/issues/4526
-def test_compaction_delete_before_upload(
+def test_compaction_waits_for_upload(
     neon_env_builder: NeonEnvBuilder,
 ):
+    """
+    Compaction waits for outstanding uploads to complete, so that it avoids deleting layers
+    files that have not yet been uploaded. This test forces a race between upload and
+    compaction.
+    """
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

     env = neon_env_builder.init_start(
@@ -792,50 +794,81 @@ def test_compaction_delete_before_upload(
     wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

     # Now make the flushing hang and update one small piece of data
-    client.configure_failpoints(("flush-frozen-pausable", "pause"))
+    client.configure_failpoints(("before-upload-layer-pausable", "pause"))

     endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1")

     wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

-    q: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
-    barrier = threading.Barrier(2)
+    checkpoint_result: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
+    compact_result: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
+    compact_barrier = threading.Barrier(2)

     def checkpoint_in_background():
-        barrier.wait()
         try:
+            log.info("Checkpoint starting")
             client.timeline_checkpoint(tenant_id, timeline_id)
-            q.put(None)
+            log.info("Checkpoint complete")
+            checkpoint_result.put(None)
         except PageserverApiException as e:
-            q.put(e)
+            log.info("Checkpoint errored: {e}")
+            checkpoint_result.put(e)

-    create_thread = threading.Thread(target=checkpoint_in_background)
-    create_thread.start()
+    def compact_in_background():
+        compact_barrier.wait()
+        try:
+            log.info("Compaction starting")
+            client.timeline_compact(tenant_id, timeline_id)
+            log.info("Compaction complete")
+            compact_result.put(None)
+        except PageserverApiException as e:
+            log.info("Compaction errored: {e}")
+            compact_result.put(e)
+
+    checkpoint_thread = threading.Thread(target=checkpoint_in_background)
+    checkpoint_thread.start()
+
+    compact_thread = threading.Thread(target=compact_in_background)
+    compact_thread.start()

     try:
-        barrier.wait()
+        # Start the checkpoint, see that it blocks
+        log.info("Waiting to see checkpoint hang...")
+        time.sleep(5)
+        assert checkpoint_result.empty()

-        time.sleep(4)
-        client.timeline_compact(tenant_id, timeline_id)
+        # Start the compaction, see that it finds work to do but blocks
+        compact_barrier.wait()
+        log.info("Waiting to see compaction hang...")
+        time.sleep(5)
+        assert compact_result.empty()

-        client.configure_failpoints(("flush-frozen-pausable", "off"))
+        # This is logged once compaction is started, but before we wait for operations to complete
+        assert env.pageserver.log_contains("compact_level0_phase1 stats available.")

-        conflict = q.get()
+        # Once we unblock uploads the compaction should complete successfully
+        log.info("Disabling failpoint")
+        client.configure_failpoints(("before-upload-layer-pausable", "off"))
+        log.info("Awaiting compaction result")
+        assert compact_result.get(timeout=10) is None
+        log.info("Awaiting checkpoint result")
+        assert checkpoint_result.get(timeout=10) is None

-        assert conflict is None
+    except Exception:
+        # Log the actual failure's backtrace here, before we proceed to join threads
+        log.exception("Failure, cleaning up...")
+        raise
     finally:
-        create_thread.join()
+        compact_barrier.abort()

-        # Add a delay for the uploads to run into either the file not found or the
-        time.sleep(4)
+        checkpoint_thread.join()
+        compact_thread.join()

     # Ensure that this actually terminates
     wait_upload_queue_empty(client, tenant_id, timeline_id)

-    # For now we are hitting this message.
-    # Maybe in the future the underlying race condition will be fixed,
-    # but until then, ensure that this message is hit instead.
-    assert env.pageserver.log_contains(
+    # We should not have hit the error handling path in uploads where the remote file is gone
+    assert not env.pageserver.log_contains(
         "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."
     )