Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-15 17:32:56 +00:00

Merge WITH CONFLICTS 2025-03-11 main commit '158db414bf881fb358494e3215d192c8fa420a53' into yuchen/direct-io-delta-image-layer-write

Conflicts:
    pageserver/src/virtual_file.rs
    pageserver/src/virtual_file/owned_buffers_io/write/flush.rs

.github/workflows/build_and_test.yml (vendored): 4
@@ -1175,7 +1175,7 @@ jobs:
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageBroker=false \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}} \
@@ -1183,7 +1183,7 @@ jobs:

gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageBroker=false \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}}

@@ -31,6 +31,10 @@ reason = "the marvin attack only affects private key decryption, not public key
id = "RUSTSEC-2024-0436"
reason = "The paste crate is a build-only dependency with no runtime components. It is unlikely to have any security impact."

[[advisories.ignore]]
id = "RUSTSEC-2025-0014"
reason = "The humantime is widely used and is not easy to replace right now. It is unmaintained, but it has no known vulnerabilities to care about. #11179"

# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html

@@ -1476,8 +1476,14 @@ pub struct TenantScanRemoteStorageResponse {
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "snake_case")]
pub enum TenantSorting {
/// Total size of layers on local disk for all timelines in a shard.
ResidentSize,
/// The logical size of the largest timeline within a _tenant_ (not shard). Only tracked on
/// shard 0, contains the sum across all shards.
MaxLogicalSize,
/// The logical size of the largest timeline within a _tenant_ (not shard), divided by number of
/// shards. Only tracked on shard 0, and estimates the per-shard logical size.
MaxLogicalSizePerShard,
}

impl Default for TenantSorting {
@@ -1507,14 +1513,20 @@ pub struct TopTenantShardsRequest {
pub struct TopTenantShardItem {
pub id: TenantShardId,

/// Total size of layers on local disk for all timelines in this tenant
/// Total size of layers on local disk for all timelines in this shard.
pub resident_size: u64,

/// Total size of layers in remote storage for all timelines in this tenant
/// Total size of layers in remote storage for all timelines in this shard.
pub physical_size: u64,

/// The largest logical size of a timeline within this tenant
/// The largest logical size of a timeline within this _tenant_ (not shard). This is only
/// tracked on shard 0, and contains the sum of the logical size across all shards.
pub max_logical_size: u64,

/// The largest logical size of a timeline within this _tenant_ (not shard) divided by number of
/// shards. This is only tracked on shard 0, and is only an estimate as we divide it evenly by
/// shard count, rounded up.
pub max_logical_size_per_shard: u64,
}

#[derive(Serialize, Deserialize, Debug, Default)]
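
A minimal, hypothetical sketch (not part of this commit) of how the snake_case rename plays out for the new variant when it crosses the API boundary; the assertion only restates standard serde behaviour:

    use serde::{Deserialize, Serialize};

    #[derive(Serialize, Deserialize, Debug)]
    #[serde(rename_all = "snake_case")]
    enum TenantSorting {
        ResidentSize,
        MaxLogicalSize,
        MaxLogicalSizePerShard,
    }

    fn main() {
        // Serializes as "max_logical_size_per_shard", which is what API callers send.
        let s = serde_json::to_string(&TenantSorting::MaxLogicalSizePerShard).unwrap();
        assert_eq!(s, "\"max_logical_size_per_shard\"");
    }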

@@ -284,6 +284,18 @@ impl Client {
simple_query::batch_execute(self.inner(), query).await
}

pub async fn discard_all(&self) -> Result<ReadyForQueryStatus, Error> {
// clear the prepared statements that are about to be nuked from the postgres session
{
let mut typeinfo = self.inner.cached_typeinfo.lock();
typeinfo.typeinfo = None;
typeinfo.typeinfo_composite = None;
typeinfo.typeinfo_enum = None;
}

self.batch_execute("discard all").await
}

/// Begins a new database transaction.
///
/// The transaction will roll back by default - use the `commit` method to commit it.
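
A hedged usage sketch of the new helper: callers that previously issued `batch_execute("discard all")` themselves can call `discard_all()`, which also clears the client-side typeinfo cache before the server drops its session state. The `reset_session` wrapper below is illustrative only, not code from this commit:

    // Illustrative only; `postgres_client` is the client crate used elsewhere in this diff.
    async fn reset_session(client: &postgres_client::Client) -> Result<(), postgres_client::Error> {
        // DISCARD ALL resets GUCs, temp tables, plans and prepared statements server-side;
        // discard_all() additionally clears the cached type information first.
        let _status = client.discard_all().await?;
        Ok(())
    }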

@@ -3223,6 +3223,7 @@ async fn post_top_tenants(
match order_by {
TenantSorting::ResidentSize => sizes.resident_size,
TenantSorting::MaxLogicalSize => sizes.max_logical_size,
TenantSorting::MaxLogicalSizePerShard => sizes.max_logical_size_per_shard,
}
}

@@ -3842,6 +3842,7 @@ impl Tenant {
resident_size: 0,
physical_size: 0,
max_logical_size: 0,
max_logical_size_per_shard: 0,
};

for timeline in self.timelines.lock().unwrap().values() {
@@ -3858,6 +3859,10 @@ impl Tenant {
);
}

result.max_logical_size_per_shard = result
.max_logical_size
.div_ceil(self.tenant_shard_id.shard_count.count() as u64);

result
}
}
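
The per-shard estimate divides the tenant-wide max logical size evenly across shards, rounding up. A small, self-contained illustration of the arithmetic (the numbers are made up):

    fn main() {
        let max_logical_size: u64 = 10 * 1024 * 1024 * 1024 + 1; // ~10 GiB, tracked on shard 0 only
        let shard_count: u64 = 8;
        // div_ceil rounds up, so the per-shard estimate never undercounts.
        let per_shard = max_logical_size.div_ceil(shard_count);
        assert_eq!(per_shard, 1_342_177_281); // (10 GiB + 1) / 8, rounded up
    }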

@@ -175,6 +175,7 @@ impl BlobWriter {
start_offset: u64,
gate: &utils::sync::gate::Gate,
ctx: &RequestContext,
flush_task_span: tracing::Span,
) -> anyhow::Result<Self> {
Ok(Self {
io_buf: Some(BytesMut::new()),
@@ -184,6 +185,7 @@ impl BlobWriter {
|| IoBufferMut::with_capacity(Self::CAPACITY),
gate.enter()?,
ctx,
flush_task_span,
),
offset: start_offset,
})
@@ -331,6 +333,7 @@ pub(crate) mod tests {
use camino::Utf8PathBuf;
use camino_tempfile::Utf8TempDir;
use rand::{Rng, SeedableRng};
use tracing::info_span;

use super::*;
use crate::context::DownloadBehavior;
@@ -354,7 +357,7 @@ pub(crate) mod tests {
let mut offsets = Vec::new();
{
let file = Arc::new(VirtualFile::create_v2(pathbuf.as_path(), ctx).await?);
let mut wtr = BlobWriter::new(file, 0, &gate, ctx).unwrap();
let mut wtr = BlobWriter::new(file, 0, &gate, ctx, info_span!("test")).unwrap();
for blob in blobs.iter() {
let (_, res) = if compression {
let res = wtr

@@ -9,7 +9,7 @@ use camino::Utf8PathBuf;
use num_traits::Num;
use pageserver_api::shard::TenantShardId;
use tokio_epoll_uring::{BoundedBuf, Slice};
use tracing::error;
use tracing::{error, info_span};
use utils::id::TimelineId;

use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
@@ -77,6 +77,7 @@ impl EphemeralFile {
|| IoBufferMut::with_capacity(TAIL_SZ),
gate.enter()?,
ctx,
info_span!(parent: None, "ephemeral_file_buffered_writer", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %filename),
),
_gate_guard: gate.enter()?,
})
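
The writers' background flush tasks now get a tracing span created with `parent: None`, so their logs carry tenant/timeline fields even though the task outlives the request-scoped span that created it. A minimal sketch of that pattern, independent of the pageserver types (the span name and field are illustrative):

    use tracing::{info_span, Instrument};

    #[tokio::main]
    async fn main() {
        tracing_subscriber::fmt::init();
        // A detached span: not a child of the current span, but it still carries context fields.
        let span = info_span!(parent: None, "background_flush", path = %"some/file");
        tokio::spawn(
            async move {
                tracing::info!("flushing in the background");
            }
            .instrument(span),
        )
        .await
        .unwrap();
    }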

@@ -18,7 +18,7 @@ use tokio::fs::{self, File, OpenOptions};
use tokio::io::{AsyncSeekExt, AsyncWriteExt};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::warn;
use tracing::{info_span, warn};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use utils::{backoff, pausable_failpoint};
@@ -230,6 +230,7 @@ async fn download_object(
|| IoBufferMut::with_capacity(super::BUFFER_SIZE),
gate.enter().map_err(|_| DownloadError::Cancelled)?,
ctx,
info_span!(parent: None, "download_object_buffered_writer", %dst_path),
);

// TODO: use vectored write (writev) once supported by tokio-epoll-uring.

@@ -433,7 +433,13 @@ impl DeltaLayerWriterInner {
let file = Arc::new(VirtualFile::create_v2(&path, ctx).await?);

// Start at PAGE_SZ, make room for the header block
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, ctx)?;
let blob_writer = BlobWriter::new(
file,
PAGE_SZ as u64,
gate,
ctx,
info_span!(parent: None, "delta_layer_writer_flush_task", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %path),
)?;

// Initialize the b-tree index builder
let block_buf = BlockBuf::new();

@@ -796,7 +796,13 @@ impl ImageLayerWriterInner {
};

// Start at `PAGE_SZ` to make room for the header block.
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, ctx)?;
let blob_writer = BlobWriter::new(
file,
PAGE_SZ as u64,
gate,
ctx,
info_span!(parent: None, "image_layer_writer_flush_task", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %path),
)?;

// Initialize the b-tree index builder
let block_buf = BlockBuf::new();

@@ -393,6 +393,9 @@ impl GcCompactionQueue {
if job.dry_run {
flags |= CompactFlags::DryRun;
}
if options.flags.contains(CompactFlags::NoYield) {
flags |= CompactFlags::NoYield;
}
let options = CompactOptions {
flags,
sub_compaction: false,
@@ -1092,7 +1095,7 @@ impl Timeline {
let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();

tracing::info!(
"latest_gc_cutoff: {}, pitr cutoff {}",
"starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}",
*latest_gc_cutoff,
self.gc_info.read().unwrap().cutoffs.time
);
@@ -1121,6 +1124,7 @@ impl Timeline {
// Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being
// wrong. If ShardedRange claims the local page count is zero, then no keys in this layer
// should be !is_key_disposable()
// TODO: exclude sparse keyspace from this check, otherwise it will infinitely loop.
let range = layer_desc.get_key_range();
let mut key = range.start;
while key < range.end {
@@ -2619,6 +2623,7 @@ impl Timeline {
) -> Result<CompactionOutcome, CompactionError> {
let sub_compaction = options.sub_compaction;
let job = GcCompactJob::from_compact_options(options.clone());
let no_yield = options.flags.contains(CompactFlags::NoYield);
if sub_compaction {
info!(
"running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"
@@ -2633,14 +2638,15 @@ impl Timeline {
idx + 1,
jobs_len
);
self.compact_with_gc_inner(cancel, job, ctx).await?;
self.compact_with_gc_inner(cancel, job, ctx, no_yield)
.await?;
}
if jobs_len == 0 {
info!("no jobs to run, skipping gc bottom-most compaction");
}
return Ok(CompactionOutcome::Done);
}
self.compact_with_gc_inner(cancel, job, ctx).await
self.compact_with_gc_inner(cancel, job, ctx, no_yield).await
}

async fn compact_with_gc_inner(
@@ -2648,6 +2654,7 @@ impl Timeline {
cancel: &CancellationToken,
job: GcCompactJob,
ctx: &RequestContext,
no_yield: bool,
) -> Result<CompactionOutcome, CompactionError> {
// Block other compaction/GC tasks from running for now. GC-compaction could run along
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
@@ -2917,14 +2924,18 @@ impl Timeline {
if cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
}
let should_yield = self
.l0_compaction_trigger
.notified()
.now_or_never()
.is_some();
if should_yield {
tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers");
return Ok(CompactionOutcome::YieldForL0);
if !no_yield {
let should_yield = self
.l0_compaction_trigger
.notified()
.now_or_never()
.is_some();
if should_yield {
tracing::info!(
"preempt gc-compaction when downloading layers: too many L0 layers"
);
return Ok(CompactionOutcome::YieldForL0);
}
}
let resident_layer = layer
.download_and_keep_resident(ctx)
@@ -3058,16 +3069,21 @@ impl Timeline {
if cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
}
keys_processed += 1;
if keys_processed % 1000 == 0 {
let should_yield = self
.l0_compaction_trigger
.notified()
.now_or_never()
.is_some();
if should_yield {
tracing::info!("preempt gc-compaction in the main loop: too many L0 layers");
return Ok(CompactionOutcome::YieldForL0);

if !no_yield {
keys_processed += 1;
if keys_processed % 1000 == 0 {
let should_yield = self
.l0_compaction_trigger
.notified()
.now_or_never()
.is_some();
if should_yield {
tracing::info!(
"preempt gc-compaction in the main loop: too many L0 layers"
);
return Ok(CompactionOutcome::YieldForL0);
}
}
}
if self.shard_identity.is_key_disposable(&key) {
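
The yield checks above poll a notification source without awaiting it: `notified().now_or_never()` returns `Some(())` only if an L0-compaction wakeup is already pending, so the long-running job can bail out without ever blocking. A small standalone sketch of the same pattern (the names and the loop are illustrative):

    use futures::FutureExt; // for now_or_never()
    use std::sync::Arc;
    use tokio::sync::Notify;

    #[tokio::main]
    async fn main() {
        let trigger = Arc::new(Notify::new());

        // Simulate the "too many L0 layers" trigger firing while the job runs.
        trigger.notify_one();

        for step in 0..10_000u32 {
            // Non-blocking check: yield only if someone has already signalled us.
            if step % 1000 == 0 && trigger.notified().now_or_never().is_some() {
                println!("preempting at step {step}");
                return;
            }
        }
        println!("finished without yielding");
    }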

@@ -1289,10 +1289,8 @@ impl OwnedAsyncWriter for VirtualFile {
buf: FullSlice<Buf>,
offset: u64,
ctx: &RequestContext,
) -> std::io::Result<FullSlice<Buf>> {
let (buf, res) = VirtualFile::write_all_at(self, buf, offset, ctx).await;
res?;
Ok(buf)
) -> (FullSlice<Buf>, std::io::Result<()>) {
VirtualFile::write_all_at(self, buf, offset, ctx).await
}
}

@@ -33,7 +33,7 @@ pub trait OwnedAsyncWriter {
buf: FullSlice<Buf>,
offset: u64,
ctx: &RequestContext,
) -> impl std::future::Future<Output = std::io::Result<FullSlice<Buf>>> + Send;
) -> impl std::future::Future<Output = (FullSlice<Buf>, std::io::Result<()>)> + Send;
}
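
The trait now hands the owned buffer back alongside the io::Result instead of consuming it inside the Result, so a failed write leaves the caller holding the buffer and able to retry. A simplified, synchronous stand-in for the idea (plain Vec instead of FullSlice/VirtualFile; not the real trait):

    // Simplified stand-in: same return shape, fewer types.
    trait OwnedWriter {
        // Return the buffer in both the success and the error case, so callers can retry.
        fn write_all_at(&mut self, buf: Vec<u8>, offset: u64) -> (Vec<u8>, std::io::Result<()>);
    }

    struct MemWriter(Vec<u8>);

    impl OwnedWriter for MemWriter {
        fn write_all_at(&mut self, buf: Vec<u8>, offset: u64) -> (Vec<u8>, std::io::Result<()>) {
            let end = offset as usize + buf.len();
            if self.0.len() < end {
                self.0.resize(end, 0);
            }
            self.0[offset as usize..end].copy_from_slice(&buf);
            (buf, Ok(())) // buffer ownership always flows back to the caller
        }
    }

    fn main() {
        let mut w = MemWriter(Vec::new());
        let (buf, res) = w.write_all_at(b"hello".to_vec(), 3);
        res.unwrap();
        assert_eq!(buf, b"hello"); // still usable, e.g. for a retry
        assert_eq!(w.0.len(), 8);
    }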

/// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch
@@ -69,6 +69,7 @@ where
buf_new: impl Fn() -> B,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext,
flush_task_span: tracing::Span,
) -> Self {
Self {
writer: writer.clone(),
@@ -78,6 +79,7 @@ where
buf_new(),
gate_guard,
ctx.attached_child(),
flush_task_span,
),
submit_offset: start_offset,
}
@@ -121,9 +123,8 @@ where
let mut bytes_amount = submit_offset;
if let Some(buf) = handle_tail(buf) {
bytes_amount += buf.pending() as u64;
let _ = writer
.write_all_at(buf.flush(), submit_offset, &ctx)
.await?;
let (_, res) = writer.write_all_at(buf.flush(), submit_offset, &ctx).await;
let _: () = res?;
}
Ok((bytes_amount, writer))
}
@@ -299,12 +300,12 @@ mod tests {
buf: FullSlice<Buf>,
offset: u64,
_: &RequestContext,
) -> std::io::Result<FullSlice<Buf>> {
) -> (FullSlice<Buf>, std::io::Result<()>) {
self.writes
.lock()
.unwrap()
.push((Vec::from(&buf[..]), offset));
Ok(buf)
(buf, Ok(()))
}
}

@@ -324,6 +325,7 @@ mod tests {
|| IoBufferMut::with_capacity(2),
gate.enter()?,
ctx,
tracing::Span::none(),
);

writer.write_buffered_borrowed(b"abc", ctx).await?;

@@ -1,9 +1,14 @@
use std::ops::ControlFlow;
use std::{marker::PhantomData, sync::Arc};

use once_cell::sync::Lazy;
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, info, info_span, warn};
use utils::sync::duplex;

use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter};
use crate::context::RequestContext;
use crate::virtual_file::MaybeFatalIo;
use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAligned;
use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;

@@ -120,6 +125,7 @@ where
buf: B,
gate_guard: utils::sync::gate::GateGuard,
ctx: RequestContext,
span: tracing::Span,
) -> Self
where
B: Buffer<IoBuf = Buf> + Send + 'static,
@@ -127,11 +133,14 @@ where
// It is fine to buffer up to only 1 message. We only 1 message in-flight at a time.
let (front, back) = duplex::mpsc::channel(1);

let join_handle = tokio::spawn(async move {
FlushBackgroundTask::new(back, file, gate_guard, ctx)
.run(buf.flush())
.await
});
let join_handle = tokio::spawn(
async move {
FlushBackgroundTask::new(back, file, gate_guard, ctx)
.run(buf.flush())
.await
}
.instrument(span),
);

FlushHandle {
inner: Some(FlushHandleInner {
@@ -240,6 +249,7 @@ where
/// The passed in slice is immediately sent back to the flush handle through the duplex channel.
async fn run(mut self, slice: FullSlice<Buf>) -> std::io::Result<RequestContext> {
// Sends the extra buffer back to the handle.
// TODO: can this ever await and or fail? I think not.
self.channel.send(slice).await.map_err(|_| {
std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early")
})?;
@@ -255,10 +265,47 @@ where
}

// Write slice to disk at `offset`.
let slice = self
.writer
.write_all_at(request.slice, request.offset, &self.ctx)
.await?;
//
// Error handling happens according to the current policy of crashing
// on fatal IO errors and retrying in place otherwise (deeming all other errors retryable).
// (The upper layers of the Pageserver write path are not equipped to retry write errors
// becasuse they often deallocate the buffers that were already written).
//
// TODO: cancellation sensitiity.
// Without it, if we hit a bug where retrying is never successful,
// then we can't shut down the timeline/tenant/pageserver cleanly because
// layers of the Pageserver write path are holding the gate open for EphemeralFile.
//
// TODO: use utils::backoff::retry once async closures are actually usable
//
let mut slice_storage = Some(request.slice);
for attempt in 1.. {
let result = async {
if attempt > 1 {
info!("retrying flush");
}
let slice = slice_storage.take().expect(
"likely previous invocation of this future didn't get polled to completion",
);
let (slice, res) = self.writer.write_all_at(slice, request.offset, &self.ctx).await;
slice_storage = Some(slice);
let res = res.maybe_fatal_err("owned_buffers_io flush");
let Err(err) = res else {
return ControlFlow::Break(());
};
warn!(%err, "error flushing buffered writer buffer to disk, retrying after backoff");
static NO_CANCELLATION: Lazy<CancellationToken> = Lazy::new(CancellationToken::new);
utils::backoff::exponential_backoff(attempt, 1.0, 10.0, &NO_CANCELLATION).await;
ControlFlow::Continue(())
}
.instrument(info_span!("flush_attempt", %attempt))
.await;
match result {
ControlFlow::Break(()) => break,
ControlFlow::Continue(()) => continue,
}
}
let slice = slice_storage.expect("loop must have run at least once");

#[cfg(test)]
{
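
The new flush path retries in place: the buffer is parked in an Option, each attempt takes it out, and on failure it is put back and the task backs off before trying again. A simplified, synchronous sketch of that control flow (the real code is async, uses the repo's utils::backoff helper, and treats fatal IO errors separately; the failure injection below is purely illustrative):

    use std::time::Duration;

    fn flaky_write(attempt: u32, _buf: &[u8]) -> std::io::Result<()> {
        // Illustrative failure injection: fail the first two attempts.
        if attempt < 3 {
            Err(std::io::Error::other("transient write error"))
        } else {
            Ok(())
        }
    }

    fn main() {
        let mut buf_storage = Some(b"dirty page".to_vec());
        for attempt in 1u32.. {
            let buf = buf_storage.take().expect("previous attempt must return the buffer");
            let res = flaky_write(attempt, &buf);
            buf_storage = Some(buf); // hand the buffer back even on error, so we can retry
            match res {
                Ok(()) => break,
                Err(err) => {
                    eprintln!("flush attempt {attempt} failed: {err}, backing off");
                    // Capped exponential backoff, loosely mirroring exponential_backoff(attempt, 1.0, 10.0, ..).
                    let secs = f64::min(10.0, 2f64.powi(attempt as i32 - 1));
                    std::thread::sleep(Duration::from_secs_f64(secs));
                }
            }
        }
        let _buf = buf_storage.expect("loop ran at least once");
        println!("flush succeeded after retries");
    }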

@@ -57,10 +57,11 @@ static void SendProposerGreeting(Safekeeper *sk);
static void RecvAcceptorGreeting(Safekeeper *sk);
static void SendVoteRequest(Safekeeper *sk);
static void RecvVoteResponse(Safekeeper *sk);
static bool VotesCollected(WalProposer *wp);
static void HandleElectedProposer(WalProposer *wp);
static term_t GetHighestTerm(TermHistory *th);
static term_t GetEpoch(Safekeeper *sk);
static void DetermineEpochStartLsn(WalProposer *wp);
static term_t GetLastLogTerm(Safekeeper *sk);
static void ProcessPropStartPos(WalProposer *wp);
static void SendProposerElected(Safekeeper *sk);
static void StartStreaming(Safekeeper *sk);
static void SendMessageToNode(Safekeeper *sk);
@@ -97,6 +98,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
wp = palloc0(sizeof(WalProposer));
wp->config = config;
wp->api = api;
wp->state = WPS_COLLECTING_TERMS;

wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);

@@ -518,7 +520,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
* nodes are transferred from SS_VOTING to sending actual vote
* requests.
*/
case SS_VOTING:
case SS_WAIT_VOTING:
wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
@@ -547,7 +549,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
/*
* Idle state for waiting votes from quorum.
*/
case SS_IDLE:
case SS_WAIT_ELECTED:
wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
@@ -721,6 +723,15 @@ SendProposerGreeting(Safekeeper *sk)
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
}

/*
* Have we received greeting from enough (quorum) safekeepers to start voting?
*/
static bool
TermsCollected(WalProposer *wp)
{
return wp->n_connected >= wp->quorum;
}

static void
RecvAcceptorGreeting(Safekeeper *sk)
{
@@ -754,7 +765,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
}

/* Protocol is all good, move to voting. */
sk->state = SS_VOTING;
sk->state = SS_WAIT_VOTING;

/*
* Note: it would be better to track the counter on per safekeeper basis,
@@ -762,17 +773,18 @@ RecvAcceptorGreeting(Safekeeper *sk)
* as is for now.
*/
++wp->n_connected;
if (wp->n_connected <= wp->quorum)
if (wp->state == WPS_COLLECTING_TERMS)
{
/* We're still collecting terms from the majority. */
wp->propTerm = Max(sk->greetResponse.term, wp->propTerm);

/* Quorum is acquried, prepare the vote request. */
if (wp->n_connected == wp->quorum)
if (TermsCollected(wp))
{
wp->propTerm++;
wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);

wp->state = WPS_CAMPAIGN;
wp->voteRequest.pam.tag = 'v';
wp->voteRequest.generation = wp->mconf.generation;
wp->voteRequest.term = wp->propTerm;
@@ -787,12 +799,10 @@ RecvAcceptorGreeting(Safekeeper *sk)
}

/*
* Check if we have quorum. If there aren't enough safekeepers, wait and
* do nothing. We'll eventually get a task when the election starts.
*
* If we do have quorum, we can start an election.
* If we have quorum, start (or just send vote request to newly connected
* node) election, otherwise wait until we have more greetings.
*/
if (wp->n_connected < wp->quorum)
if (wp->state == WPS_COLLECTING_TERMS)
{
/*
* SS_VOTING is an idle state; read-ready indicates the connection
@@ -807,11 +817,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
*/
for (int j = 0; j < wp->n_safekeepers; j++)
{
/*
* Remember: SS_VOTING indicates that the safekeeper is
* participating in voting, but hasn't sent anything yet.
*/
if (wp->safekeeper[j].state == SS_VOTING)
if (wp->safekeeper[j].state == SS_WAIT_VOTING)
SendVoteRequest(&wp->safekeeper[j]);
}
}
@@ -838,6 +844,8 @@ RecvVoteResponse(Safekeeper *sk)
{
WalProposer *wp = sk->wp;

Assert(wp->state >= WPS_CAMPAIGN);

sk->voteResponse.apm.tag = 'v';
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse))
return;
@@ -856,7 +864,7 @@ RecvVoteResponse(Safekeeper *sk)
* we are not elected yet and thus need the vote.
*/
if ((!sk->voteResponse.voteGiven) &&
(sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
(sk->voteResponse.term > wp->propTerm || wp->state == WPS_CAMPAIGN))
{
wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
sk->host, sk->port,
@@ -864,38 +872,83 @@ RecvVoteResponse(Safekeeper *sk)
}
Assert(sk->voteResponse.term == wp->propTerm);

/* Handshake completed, do we have quorum? */
/* ready for elected message */
sk->state = SS_WAIT_ELECTED;

wp->n_votes++;
if (wp->n_votes < wp->quorum)
/* Are we already elected? */
if (wp->state == WPS_CAMPAIGN)
{
sk->state = SS_IDLE; /* can't do much yet, no quorum */
}
else if (wp->n_votes > wp->quorum)
{
/* already elected, start streaming */
SendProposerElected(sk);
/* no; check if this vote makes us elected */
if (VotesCollected(wp))
{
wp->state = WPS_ELECTED;
HandleElectedProposer(wp);
}
else
{
/* can't do much yet, no quorum */
return;
}
}
else
{
sk->state = SS_IDLE;
/* Idle state waits for read-ready events */
wp->api.update_event_set(sk, WL_SOCKET_READABLE);

HandleElectedProposer(sk->wp);
Assert(wp->state == WPS_ELECTED);
/* send elected only to this sk */
SendProposerElected(sk);
}
}

/*
* Checks if enough votes has been collected to get elected and if that's the
* case finds the highest vote, setting donor, donorLastLogTerm,
* propTermStartLsn fields. Also sets truncateLsn.
*/
static bool
VotesCollected(WalProposer *wp)
{
int n_ready = 0;

/* assumed to be called only when not elected yet */
Assert(wp->state == WPS_CAMPAIGN);

wp->propTermStartLsn = InvalidXLogRecPtr;
wp->donorLastLogTerm = 0;
wp->truncateLsn = InvalidXLogRecPtr;

for (int i = 0; i < wp->n_safekeepers; i++)
{
if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
{
n_ready++;

if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn))
{
wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
wp->donor = i;
}
wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
}
}

return n_ready >= wp->quorum;
}
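
VotesCollected picks the donor as the safekeeper with the lexicographically largest (last_log_term, flushLsn) pair among those that have voted, and succeeds only once at least a quorum of them is ready. A compact Rust rendering of that selection rule (an illustration of the algorithm only; the repository code above is C):

    #[derive(Debug)]
    struct Vote {
        last_log_term: u64,
        flush_lsn: u64,
    }

    /// Returns (donor index, term-start LSN) if at least `quorum` votes were collected.
    fn pick_donor(votes: &[Option<Vote>], quorum: usize) -> Option<(usize, u64)> {
        let ready: Vec<(usize, &Vote)> = votes
            .iter()
            .enumerate()
            .filter_map(|(i, v)| v.as_ref().map(|v| (i, v)))
            .collect();
        if ready.len() < quorum {
            return None;
        }
        // Highest last_log_term wins; flush LSN breaks ties.
        let (donor, best) = ready
            .into_iter()
            .max_by_key(|(_, v)| (v.last_log_term, v.flush_lsn))?;
        Some((donor, best.flush_lsn))
    }

    fn main() {
        let votes = vec![
            Some(Vote { last_log_term: 3, flush_lsn: 0x1000 }),
            None, // this safekeeper has not voted
            Some(Vote { last_log_term: 3, flush_lsn: 0x2000 }),
        ];
        assert_eq!(pick_donor(&votes, 2), Some((2, 0x2000)));
    }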

/*
* Called once a majority of acceptors have voted for us and current proposer
* has been elected.
*
* Sends ProposerElected message to all acceptors in SS_IDLE state and starts
* Sends ProposerElected message to all acceptors in SS_WAIT_ELECTED state and starts
* replication from walsender.
*/
static void
HandleElectedProposer(WalProposer *wp)
{
DetermineEpochStartLsn(wp);
ProcessPropStartPos(wp);
Assert(wp->propTermStartLsn != InvalidXLogRecPtr);

/*
* Synchronously download WAL from the most advanced safekeeper. We do
@@ -907,40 +960,24 @@ HandleElectedProposer(WalProposer *wp)
wp_log(FATAL, "failed to download WAL for logical replicaiton");
}

/*
* Zero propEpochStartLsn means majority of safekeepers doesn't have any
* WAL, timeline was just created. Compute bumps it to basebackup LSN,
* otherwise we must be sync-safekeepers and we have nothing to do then.
*
* Proceeding is not only pointless but harmful, because we'd give
* safekeepers term history starting with 0/0. These hacks will go away
* once we disable implicit timeline creation on safekeepers and create it
* with non zero LSN from the start.
*/
if (wp->propEpochStartLsn == InvalidXLogRecPtr)
{
Assert(wp->config->syncSafekeepers);
wp_log(LOG, "elected with zero propEpochStartLsn in sync-safekeepers, exiting");
wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn);
}

if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers)
if (wp->truncateLsn == wp->propTermStartLsn && wp->config->syncSafekeepers)
{
/* Sync is not needed: just exit */
wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn);
wp->api.finish_sync_safekeepers(wp, wp->propTermStartLsn);
/* unreachable */
}

for (int i = 0; i < wp->n_safekeepers; i++)
{
if (wp->safekeeper[i].state == SS_IDLE)
if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
SendProposerElected(&wp->safekeeper[i]);
}

/*
* The proposer has been elected, and there will be no quorum waiting
* after this point. There will be no safekeeper with state SS_IDLE also,
* because that state is used only for quorum waiting.
* after this point. There will be no safekeeper with state
* SS_WAIT_ELECTED also, because that state is used only for quorum
* waiting.
*/

if (wp->config->syncSafekeepers)
@@ -957,7 +994,7 @@ HandleElectedProposer(WalProposer *wp)
return;
}

wp->api.start_streaming(wp, wp->propEpochStartLsn);
wp->api.start_streaming(wp, wp->propTermStartLsn);
/* Should not return here */
}

@@ -970,7 +1007,7 @@ GetHighestTerm(TermHistory *th)

/* safekeeper's epoch is the term of the highest entry in the log */
static term_t
GetEpoch(Safekeeper *sk)
GetLastLogTerm(Safekeeper *sk)
{
return GetHighestTerm(&sk->voteResponse.termHistory);
}
@@ -991,72 +1028,52 @@ SkipXLogPageHeader(WalProposer *wp, XLogRecPtr lsn)
}

/*
* Called after majority of acceptors gave votes, it calculates the most
* advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since
* which we'll write WAL in our term.
*
* Sets truncateLsn along the way (though it is not of much use at this point --
* only for skipping recovery).
* Called after quorum gave votes and proposer starting position (highest vote
* term + flush LSN) -- is determined (VotesCollected true), this function
* adopts it: pushes LSN to shmem, sets wp term history, verifies that the
* basebackup matches.
*/
static void
DetermineEpochStartLsn(WalProposer *wp)
ProcessPropStartPos(WalProposer *wp)
{
TermHistory *dth;
int n_ready = 0;
WalproposerShmemState *walprop_shared;

wp->propEpochStartLsn = InvalidXLogRecPtr;
wp->donorEpoch = 0;
wp->truncateLsn = InvalidXLogRecPtr;

for (int i = 0; i < wp->n_safekeepers; i++)
{
if (wp->safekeeper[i].state == SS_IDLE)
{
n_ready++;

if (GetEpoch(&wp->safekeeper[i]) > wp->donorEpoch ||
(GetEpoch(&wp->safekeeper[i]) == wp->donorEpoch &&
wp->safekeeper[i].voteResponse.flushLsn > wp->propEpochStartLsn))
{
wp->donorEpoch = GetEpoch(&wp->safekeeper[i]);
wp->propEpochStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
wp->donor = i;
}
wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
}
}

if (n_ready < wp->quorum)
{
/*
* This is a rare case that can be triggered if safekeeper has voted
* and disconnected. In this case, its state will not be SS_IDLE and
* its vote cannot be used, because we clean up `voteResponse` in
* `ShutdownConnection`.
*/
wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready);
}
/* must have collected votes */
Assert(wp->state == WPS_ELECTED);

/*
* If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are
* bootstrapping and nothing was committed yet. Start streaming then from
* the basebackup LSN.
* If propTermStartLsn is 0, it means flushLsn is 0 everywhere, we are
* bootstrapping and nothing was committed yet. Start streaming from the
* basebackup LSN then.
*
* In case of sync-safekeepers just exit: proceeding is not only pointless
* but harmful, because we'd give safekeepers term history starting with
* 0/0. These hacks will go away once we disable implicit timeline
* creation on safekeepers and create it with non zero LSN from the start.
*/
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
if (wp->propTermStartLsn == InvalidXLogRecPtr)
{
wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp);
wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
if (!wp->config->syncSafekeepers)
{
wp->propTermStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp);
wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propTermStartLsn));
}
else
{
wp_log(LOG, "elected with zero propTermStartLsn in sync-safekeepers, exiting");
wp->api.finish_sync_safekeepers(wp, wp->propTermStartLsn);
}
}
pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn);
pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propTermStartLsn);

Assert(wp->truncateLsn != InvalidXLogRecPtr || wp->config->syncSafekeepers);

/*
* We will be generating WAL since propEpochStartLsn, so we should set
* We will be generating WAL since propTermStartLsn, so we should set
* availableLsn to mark this LSN as the latest available position.
*/
wp->availableLsn = wp->propEpochStartLsn;
wp->availableLsn = wp->propTermStartLsn;

/*
* Proposer's term history is the donor's + its own entry.
@@ -1067,12 +1084,12 @@ DetermineEpochStartLsn(WalProposer *wp)
if (dth->n_entries > 0)
memcpy(wp->propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries);
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn;

wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
wp->quorum,
wp->propTerm,
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
LSN_FORMAT_ARGS(wp->propTermStartLsn),
wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
LSN_FORMAT_ARGS(wp->truncateLsn));

@@ -1090,7 +1107,7 @@ DetermineEpochStartLsn(WalProposer *wp)
* Safekeepers don't skip header as they need continious stream of
* data, so correct LSN for comparison.
*/
if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
if (SkipXLogPageHeader(wp, wp->propTermStartLsn) != wp->api.get_redo_start_lsn(wp))
{
/*
* However, allow to proceed if last_log_term on the node which
@@ -1111,8 +1128,8 @@ DetermineEpochStartLsn(WalProposer *wp)
*/
disable_core_dump();
wp_log(PANIC,
"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
"collected propTermStartLsn %X/%X, but basebackup LSN %X/%X",
LSN_FORMAT_ARGS(wp->propTermStartLsn),
LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
}
}
@@ -1623,7 +1640,7 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
* Like in Raft, we aren't allowed to commit entries from previous
* terms, so ignore reported LSN until it gets to epochStartLsn.
*/
responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propEpochStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
}
qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);

@@ -1656,10 +1673,10 @@ UpdateDonorShmem(WalProposer *wp)
* about its position immediately after election before any feedbacks are
* sent.
*/
if (wp->safekeeper[wp->donor].state >= SS_IDLE)
if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED)
{
donor = &wp->safekeeper[wp->donor];
donor_lsn = wp->propEpochStartLsn;
donor_lsn = wp->propTermStartLsn;
}

/*
@@ -1748,7 +1765,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
for (int i = 0; i < wp->n_safekeepers; i++)
{
Safekeeper *sk = &wp->safekeeper[i];
bool synced = sk->appendResponse.commitLsn >= wp->propEpochStartLsn;
bool synced = sk->appendResponse.commitLsn >= wp->propTermStartLsn;

/* alive safekeeper which is not synced yet; wait for it */
if (sk->state != SS_OFFLINE && !synced)
@@ -1772,7 +1789,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
*/
BroadcastAppendRequest(wp);

wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn);
wp->api.finish_sync_safekeepers(wp, wp->propTermStartLsn);
/* unreachable */
}
}
@@ -2378,7 +2395,7 @@ FormatSafekeeperState(Safekeeper *sk)
case SS_HANDSHAKE_RECV:
return_val = "handshake (receiving)";
break;
case SS_VOTING:
case SS_WAIT_VOTING:
return_val = "voting";
break;
case SS_WAIT_VERDICT:
@@ -2387,7 +2404,7 @@ FormatSafekeeperState(Safekeeper *sk)
case SS_SEND_ELECTED_FLUSH:
return_val = "send-announcement-flush";
break;
case SS_IDLE:
case SS_WAIT_ELECTED:
return_val = "idle";
break;
case SS_ACTIVE:
@@ -2476,8 +2493,8 @@ SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_even
* Idle states use read-readiness as a sign that the connection
* has been disconnected.
*/
case SS_VOTING:
case SS_IDLE:
case SS_WAIT_VOTING:
case SS_WAIT_ELECTED:
*sk_events = WL_SOCKET_READABLE;
return;

@@ -73,12 +73,12 @@ typedef enum
* Moved externally by execution of SS_HANDSHAKE_RECV, when we received a
* quorum of handshakes.
*/
SS_VOTING,
SS_WAIT_VOTING,

/*
* Already sent voting information, waiting to receive confirmation from
* the node. After receiving, moves to SS_IDLE, if the quorum isn't
* reached yet.
* the node. After receiving, moves to SS_WAIT_ELECTED, if the quorum
* isn't reached yet.
*/
SS_WAIT_VERDICT,

@@ -91,7 +91,7 @@ typedef enum
*
* Moves to SS_ACTIVE only by call to StartStreaming.
*/
SS_IDLE,
SS_WAIT_ELECTED,

/*
* Active phase, when we acquired quorum and have WAL to send or feedback
@@ -751,6 +751,15 @@ typedef struct WalProposerConfig
#endif
} WalProposerConfig;

typedef enum
{
/* collecting greetings to determine term to campaign for */
WPS_COLLECTING_TERMS,
/* campaing started, waiting for votes */
WPS_CAMPAIGN,
/* successfully elected */
WPS_ELECTED,
} WalProposerState;
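
The new proposer-level state machine makes the phases explicit: collect greetings (terms) from a quorum, campaign for votes, then act as the elected proposer. A simplified Rust sketch of the intended transitions (an illustration of the enum above, not the repository's C implementation; the real code counts safekeepers in SS_WAIT_ELECTED via VotesCollected rather than a bare vote counter):

    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    enum WalProposerState {
        CollectingTerms, // collecting greetings to determine the term to campaign for
        Campaign,        // campaign started, waiting for votes
        Elected,         // successfully elected
    }

    struct Proposer {
        state: WalProposerState,
        quorum: usize,
        n_connected: usize,
        n_votes: usize,
    }

    impl Proposer {
        fn on_greeting(&mut self) {
            self.n_connected += 1;
            if self.state == WalProposerState::CollectingTerms && self.n_connected >= self.quorum {
                self.state = WalProposerState::Campaign; // bump the term and send vote requests here
            }
        }
        fn on_vote(&mut self) {
            self.n_votes += 1;
            if self.state == WalProposerState::Campaign && self.n_votes >= self.quorum {
                self.state = WalProposerState::Elected; // HandleElectedProposer() in the C code
            }
        }
    }

    fn main() {
        let mut p = Proposer { state: WalProposerState::CollectingTerms, quorum: 2, n_connected: 0, n_votes: 0 };
        p.on_greeting();
        p.on_greeting();
        assert_eq!(p.state, WalProposerState::Campaign);
        p.on_vote();
        p.on_vote();
        assert_eq!(p.state, WalProposerState::Elected);
    }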

/*
* WAL proposer state.
@@ -758,6 +767,7 @@ typedef struct WalProposerConfig
typedef struct WalProposer
{
WalProposerConfig *config;
WalProposerState state;
/* Current walproposer membership configuration */
MembershipConfiguration mconf;

@@ -813,10 +823,10 @@ typedef struct WalProposer
TermHistory propTermHistory;

/* epoch start lsn of the proposer */
XLogRecPtr propEpochStartLsn;
XLogRecPtr propTermStartLsn;

/* Most advanced acceptor epoch */
term_t donorEpoch;
term_t donorLastLogTerm;

/* Most advanced acceptor */
int donor;

@@ -1496,7 +1496,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk)

snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
Assert(!sk->xlogreader);
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, log_prefix);
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix);
if (sk->xlogreader == NULL)
wpg_log(FATAL, "failed to allocate xlog reader");
}

@@ -290,7 +290,7 @@ impl ConnCfg {
"connected to compute node at {host} ({socket_addr}) sslmode={:?}, latency={}, query_id={}",
self.0.get_ssl_mode(),
ctx.get_proxy_latency(),
ctx.get_testodrome_id(),
ctx.get_testodrome_id().unwrap_or_default(),
);

// NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.

@@ -272,6 +272,13 @@ impl RequestContext {
.set_user_agent(user_agent);
}

pub(crate) fn set_testodrome_id(&self, query_id: String) {
self.0
.try_lock()
.expect("should not deadlock")
.set_testodrome_id(query_id);
}

pub(crate) fn set_auth_method(&self, auth_method: AuthMethod) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.auth_method = Some(auth_method);
@@ -371,13 +378,12 @@ impl RequestContext {
.accumulated()
}

pub(crate) fn get_testodrome_id(&self) -> String {
pub(crate) fn get_testodrome_id(&self) -> Option<String> {
self.0
.try_lock()
.expect("should not deadlock")
.testodrome_query_id
.clone()
.unwrap_or_default()
}

pub(crate) fn success(&self) {

@@ -571,6 +571,11 @@ impl ConnectMechanism for TokioMechanism {
"compute_id",
tracing::field::display(&node_info.aux.compute_id),
);

if let Some(query_id) = ctx.get_testodrome_id() {
info!("latency={}, query_id={}", ctx.get_proxy_latency(), query_id);
}

Ok(poll_client(
self.pool.clone(),
ctx,
@@ -628,6 +633,10 @@ impl ConnectMechanism for HyperMechanism {
tracing::field::display(&node_info.aux.compute_id),
);

if let Some(query_id) = ctx.get_testodrome_id() {
info!("latency={}, query_id={}", ctx.get_proxy_latency(), query_id);
}

Ok(poll_http2_client(
self.pool.clone(),
ctx,

@@ -35,6 +35,7 @@ use super::conn_pool_lib::{
Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, DbUserConn,
EndpointConnPool,
};
use super::sql_over_http::SqlOverHttpError;
use crate::context::RequestContext;
use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
use crate::metrics::Metrics;
@@ -274,18 +275,23 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
}

impl ClientInnerCommon<postgres_client::Client> {
pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> {
pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), SqlOverHttpError> {
if let ClientDataEnum::Local(local_data) = &mut self.data {
local_data.jti += 1;
let token = resign_jwt(&local_data.key, payload, local_data.jti)?;

// discard all cannot run in a transaction. must be executed alone.
self.inner.batch_execute("discard all").await?;
self.inner
.discard_all()
.await
.map_err(SqlOverHttpError::InternalPostgres)?;

// initiates the auth session
// this is safe from query injections as the jwt format free of any escape characters.
let query = format!("select auth.jwt_session_init('{token}')");
self.inner.batch_execute(&query).await?;
self.inner
.batch_execute(&query)
.await
.map_err(SqlOverHttpError::InternalPostgres)?;

let pid = self.inner.get_process_id();
info!(pid, jti = local_data.jti, "user session state init");

@@ -446,6 +446,15 @@ async fn request_handler(
.map(Into::into),
);

let testodrome_id = request
.headers()
.get("X-Neon-Query-ID")
.map(|value| value.to_str().unwrap_or_default().to_string());

if let Some(query_id) = testodrome_id {
ctx.set_testodrome_id(query_id);
}

let span = ctx.span();
info!(parent: &span, "performing websocket upgrade");

@@ -412,8 +412,12 @@ pub(crate) enum SqlOverHttpError {
ResponseTooLarge(usize),
#[error("invalid isolation level")]
InvalidIsolationLevel,
/// for queries our customers choose to run
#[error("{0}")]
Postgres(#[from] postgres_client::Error),
Postgres(#[source] postgres_client::Error),
/// for queries we choose to run
#[error("{0}")]
InternalPostgres(#[source] postgres_client::Error),
#[error("{0}")]
JsonConversion(#[from] JsonConversionError),
#[error("{0}")]
@@ -429,6 +433,13 @@ impl ReportableError for SqlOverHttpError {
SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User,
SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User,
SqlOverHttpError::Postgres(p) => p.get_error_kind(),
SqlOverHttpError::InternalPostgres(p) => {
if p.as_db_error().is_some() {
ErrorKind::Service
} else {
ErrorKind::Compute
}
}
SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres,
SqlOverHttpError::Cancelled(c) => c.get_error_kind(),
}
@@ -444,6 +455,7 @@ impl UserFacingError for SqlOverHttpError {
SqlOverHttpError::ResponseTooLarge(_) => self.to_string(),
SqlOverHttpError::InvalidIsolationLevel => self.to_string(),
SqlOverHttpError::Postgres(p) => p.to_string(),
SqlOverHttpError::InternalPostgres(p) => p.to_string(),
SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(),
SqlOverHttpError::Cancelled(_) => self.to_string(),
}
@@ -462,6 +474,7 @@ impl HttpCodeError for SqlOverHttpError {
SqlOverHttpError::ResponseTooLarge(_) => StatusCode::INSUFFICIENT_STORAGE,
SqlOverHttpError::InvalidIsolationLevel => StatusCode::BAD_REQUEST,
SqlOverHttpError::Postgres(_) => StatusCode::BAD_REQUEST,
SqlOverHttpError::InternalPostgres(_) => StatusCode::INTERNAL_SERVER_ERROR,
SqlOverHttpError::JsonConversion(_) => StatusCode::INTERNAL_SERVER_ERROR,
SqlOverHttpError::Cancelled(_) => StatusCode::INTERNAL_SERVER_ERROR,
}
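
Splitting Postgres errors into a user-facing Postgres variant and an InternalPostgres variant removes the blanket From conversion, so call sites classify each failure explicitly with map_err and the HTTP layer can map them to 400 vs 500. A small thiserror-based sketch of the same pattern (simplified error types, not the proxy's real ones):

    use thiserror::Error;

    #[derive(Debug, Error)]
    #[error("db says no")]
    struct DbError;

    #[derive(Debug, Error)]
    enum ApiError {
        /// for queries the customer chose to run
        #[error("{0}")]
        Postgres(#[source] DbError),
        /// for queries the service runs on the customer's behalf
        #[error("{0}")]
        InternalPostgres(#[source] DbError),
    }

    fn run_user_query() -> Result<(), DbError> { Err(DbError) }
    fn run_internal_query() -> Result<(), DbError> { Err(DbError) }

    fn handler() -> Result<(), ApiError> {
        // No #[from] on either variant, so each call site picks the classification.
        run_internal_query().map_err(ApiError::InternalPostgres)?;
        run_user_query().map_err(ApiError::Postgres)?;
        Ok(())
    }

    fn main() {
        let status = match handler().unwrap_err() {
            ApiError::Postgres(_) => 400,
            ApiError::InternalPostgres(_) => 500,
        };
        println!("status = {status}");
    }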

@@ -671,16 +684,14 @@ async fn handle_db_inner(
let authenticate_and_connect = Box::pin(
async {
let keys = match auth {
AuthData::Password(pw) => {
backend
.authenticate_with_password(ctx, &conn_info.user_info, &pw)
.await?
}
AuthData::Jwt(jwt) => {
backend
.authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
.await?
}
AuthData::Password(pw) => backend
.authenticate_with_password(ctx, &conn_info.user_info, &pw)
.await
.map_err(HttpConnError::AuthError)?,
AuthData::Jwt(jwt) => backend
.authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
.await
.map_err(HttpConnError::AuthError)?,
};

let client = match keys.keys {
@@ -703,7 +714,7 @@ async fn handle_db_inner(
// not strictly necessary to mark success here,
// but it's just insurance for if we forget it somewhere else
ctx.success();
Ok::<_, HttpConnError>(client)
Ok::<_, SqlOverHttpError>(client)
}
.map_err(SqlOverHttpError::from),
);
@@ -933,11 +944,15 @@ impl BatchQueryData {
builder = builder.deferrable(true);
}

let transaction = builder.start().await.inspect_err(|_| {
// if we cannot start a transaction, we should return immediately
// and not return to the pool. connection is clearly broken
discard.discard();
})?;
let transaction = builder
.start()
.await
.inspect_err(|_| {
// if we cannot start a transaction, we should return immediately
// and not return to the pool. connection is clearly broken
discard.discard();
})
.map_err(SqlOverHttpError::Postgres)?;

let json_output = match query_batch(
config,
@@ -950,11 +965,15 @@ impl BatchQueryData {
{
Ok(json_output) => {
info!("commit");
let status = transaction.commit().await.inspect_err(|_| {
// if we cannot commit - for now don't return connection to pool
// TODO: get a query status from the error
discard.discard();
})?;
let status = transaction
.commit()
.await
.inspect_err(|_| {
// if we cannot commit - for now don't return connection to pool
// TODO: get a query status from the error
discard.discard();
})
.map_err(SqlOverHttpError::Postgres)?;
discard.check_idle(status);
json_output
}
@@ -969,11 +988,15 @@ impl BatchQueryData {
}
Err(err) => {
info!("rollback");
let status = transaction.rollback().await.inspect_err(|_| {
// if we cannot rollback - for now don't return connection to pool
// TODO: get a query status from the error
discard.discard();
})?;
let status = transaction
.rollback()
.await
.inspect_err(|_| {
// if we cannot rollback - for now don't return connection to pool
// TODO: get a query status from the error
discard.discard();
})
.map_err(SqlOverHttpError::Postgres)?;
discard.check_idle(status);
return Err(err);
}
@@ -1032,7 +1055,12 @@ async fn query_to_json<T: GenericClient>(
let query_start = Instant::now();

let query_params = data.params;
let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?);
let mut row_stream = std::pin::pin!(
client
.query_raw_txt(&data.query, query_params)
.await
.map_err(SqlOverHttpError::Postgres)?
);
let query_acknowledged = Instant::now();

// Manually drain the stream into a vector to leave row_stream hanging
@@ -1040,7 +1068,7 @@ async fn query_to_json<T: GenericClient>(
// big.
let mut rows: Vec<postgres_client::Row> = Vec::new();
while let Some(row) = row_stream.next().await {
let row = row?;
let row = row.map_err(SqlOverHttpError::Postgres)?;
*current_size += row.body_len();
rows.push(row);
// we don't have a streaming response support yet so this is to prevent OOM
@@ -1091,7 +1119,14 @@ async fn query_to_json<T: GenericClient>(
"dataTypeModifier": c.type_modifier(),
"format": "text",
}));
columns.push(client.get_type(c.type_oid()).await?);

match client.get_type(c.type_oid()).await {
Ok(t) => columns.push(t),
Err(err) => {
tracing::warn!(?err, "unable to query type information");
return Err(SqlOverHttpError::InternalPostgres(err));
}
}
}

let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);

@@ -511,8 +511,7 @@ impl ApiImpl for SimulationApi {
// collected quorum with lower term, then got rejected by next connected safekeeper
executor::exit(1, msg.to_owned());
}
if msg.contains("collected propEpochStartLsn") && msg.contains(", but basebackup LSN ")
{
if msg.contains("collected propTermStartLsn") && msg.contains(", but basebackup LSN ") {
// sync-safekeepers collected wrong quorum, walproposer collected another quorum
executor::exit(1, msg.to_owned());
}
@@ -529,7 +528,7 @@ impl ApiImpl for SimulationApi {
}

fn after_election(&self, wp: &mut walproposer::bindings::WalProposer) {
let prop_lsn = wp.propEpochStartLsn;
let prop_lsn = wp.propTermStartLsn;
let prop_term = wp.propTerm;

let mut prev_lsn: u64 = 0;
@@ -612,7 +611,7 @@ impl ApiImpl for SimulationApi {
sk: &mut walproposer::bindings::Safekeeper,
) -> bool {
let mut startpos = wp.truncateLsn;
let endpos = wp.propEpochStartLsn;
let endpos = wp.propTermStartLsn;

if startpos == endpos {
debug!("recovery_download: nothing to download");

@@ -152,10 +152,8 @@ impl TenantRefAccumulator {
}
}

if !ancestor_refs.is_empty() {
tracing::info!(%ttid, "Found {} ancestor refs", ancestor_refs.len());
self.ancestor_ref_shards.update(ttid, ancestor_refs);
}
tracing::info!(%ttid, "Found {} ancestor refs", ancestor_refs.len());
self.ancestor_ref_shards.update(ttid, ancestor_refs);
}

/// Consume Self and return a vector of ancestor tenant shards that should be GC'd, and map of referenced ancestor layers to preserve
@@ -779,7 +777,7 @@ pub async fn pageserver_physical_gc(

let mut summary = GcSummary::default();
{
let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
let timelines = timelines.try_buffered(CONCURRENCY);
let timelines = timelines.try_flatten();

let timelines = timelines.map_ok(|(ttid, tenant_manifest_arc)| {
@@ -793,8 +791,8 @@ pub async fn pageserver_physical_gc(
tenant_manifest_arc,
)
});
let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));

let timelines = timelines.try_buffered(CONCURRENCY);
let mut timelines = std::pin::pin!(timelines);
// Drain futures for per-shard GC, populating accumulator as a side effect
while let Some(i) = timelines.next().await {
summary.merge(i?);
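
The change above separates buffering from pinning: try_buffered bounds how many per-shard GC futures run concurrently, and the resulting stream is not Unpin, so it has to be pinned before next() can be polled. A self-contained sketch of the same combinators, with toy futures standing in for the scrubber's GC work:

    use futures::{StreamExt, TryStreamExt};

    #[tokio::main]
    async fn main() -> anyhow::Result<()> {
        // A stream of fallible futures; Ok-wrapping makes it a TryStream of TryFutures.
        let work = futures::stream::iter(
            (1..=6u64).map(|i| async move { anyhow::Ok(i * 10) }).map(Ok::<_, anyhow::Error>),
        );

        // Run at most 3 of them at a time, preserving the order of results.
        let results = work.try_buffered(3);
        // try_buffered's output is !Unpin: pin it before polling with next().
        let mut results = std::pin::pin!(results);

        let mut total = 0;
        while let Some(v) = results.next().await {
            total += v?;
        }
        assert_eq!(total, 210);
        Ok(())
    }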