mirror of
https://github.com/neondatabase/neon.git
synced 2026-07-04 04:30:38 +00:00
Compare commits
13 Commits
wp-neon-wa
...
control-pl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cd277a1420 | ||
|
|
ff34746d21 | ||
|
|
699049b8f3 | ||
|
|
2c544343e0 | ||
|
|
193e60e2b8 | ||
|
|
1bbd6cae24 | ||
|
|
65f48c7002 | ||
|
|
d9d8e9afc7 | ||
|
|
7914eaf1e6 | ||
|
|
37fdbc3aaa | ||
|
|
7aa1e58301 | ||
|
|
f2892d3798 | ||
|
|
b492cedf51 |
3
Cargo.lock
generated
3
Cargo.lock
generated
@@ -3864,6 +3864,7 @@ dependencies = [
|
||||
"bytes",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"http-types",
|
||||
"hyper",
|
||||
@@ -4291,6 +4292,7 @@ dependencies = [
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"url",
|
||||
@@ -5220,6 +5222,7 @@ checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
|
||||
@@ -420,6 +420,12 @@ impl ShardIdentity {
|
||||
String::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Convenience for checking if this identity is the 0th shard in a tenant,
|
||||
/// for special cases on shard 0 such as ingesting relation sizes.
|
||||
pub fn is_zero(&self) -> bool {
|
||||
self.number == ShardNumber(0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for ShardIndex {
|
||||
|
||||
@@ -16,10 +16,11 @@ aws-credential-types.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
hyper = { workspace = true, features = ["stream"] }
|
||||
futures.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
||||
tokio-util.workspace = true
|
||||
tokio-util = { workspace = true, features = ["compat"] }
|
||||
toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
scopeguard.workspace = true
|
||||
|
||||
@@ -1,21 +1,24 @@
|
||||
//! Azure Blob Storage wrapper
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::num::NonZeroU32;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::{borrow::Cow, io::Cursor};
|
||||
|
||||
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
|
||||
use anyhow::Result;
|
||||
use azure_core::request_options::{MaxResults, Metadata, Range};
|
||||
use azure_core::RetryOptions;
|
||||
use azure_identity::DefaultAzureCredential;
|
||||
use azure_storage::StorageCredentials;
|
||||
use azure_storage_blobs::prelude::ClientBuilder;
|
||||
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use futures_util::StreamExt;
|
||||
use http_types::StatusCode;
|
||||
use tokio::io::AsyncRead;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::s3_bucket::RequestKind;
|
||||
@@ -49,7 +52,8 @@ impl AzureBlobStorage {
|
||||
StorageCredentials::token_credential(Arc::new(token_credential))
|
||||
};
|
||||
|
||||
let builder = ClientBuilder::new(account, credentials);
|
||||
// we have an outer retry
|
||||
let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());
|
||||
|
||||
let client = builder.container_client(azure_config.container_name.to_owned());
|
||||
|
||||
@@ -116,7 +120,8 @@ impl AzureBlobStorage {
|
||||
let mut metadata = HashMap::new();
|
||||
// TODO give proper streaming response instead of buffering into RAM
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let mut bufs = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part.map_err(to_download_error)?;
|
||||
if let Some(blob_meta) = part.blob.metadata {
|
||||
@@ -127,10 +132,10 @@ impl AzureBlobStorage {
|
||||
.collect()
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
||||
buf.extend_from_slice(&data.slice(..));
|
||||
bufs.push(data);
|
||||
}
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(Cursor::new(buf)),
|
||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||
metadata: Some(StorageMetadata(metadata)),
|
||||
})
|
||||
}
|
||||
@@ -217,9 +222,10 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
@@ -227,13 +233,12 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
let _permit = self.permit(RequestKind::Put).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
|
||||
// TODO FIX THIS UGLY HACK and don't buffer the entire object
|
||||
// into RAM here, but use the streaming interface. For that,
|
||||
// we'd have to change the interface though...
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
let mut buf = Vec::with_capacity(data_size_bytes);
|
||||
tokio::io::copy(&mut from, &mut buf).await?;
|
||||
let body = azure_core::Body::Bytes(buf.into());
|
||||
let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
|
||||
Box::pin(from);
|
||||
|
||||
let from = NonSeekableStream::new(from, data_size_bytes);
|
||||
|
||||
let body = azure_core::Body::SeekableStream(Box::new(from));
|
||||
|
||||
let mut builder = blob_client.put_block_blob(body);
|
||||
|
||||
@@ -312,3 +317,153 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
/// Hack to work around not being able to stream once with azure sdk.
|
||||
///
|
||||
/// Azure sdk clones streams around with the assumption that they are like
|
||||
/// `Arc<tokio::fs::File>` (except not supporting tokio), however our streams are not like
|
||||
/// that. For example for an `index_part.json` we just have a single chunk of [`Bytes`]
|
||||
/// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially"
|
||||
/// seekable, but we can also just re-try the request easier.
|
||||
#[project = NonSeekableStreamProj]
|
||||
enum NonSeekableStream<S> {
|
||||
/// A stream wrappers initial form.
|
||||
///
|
||||
/// Mutex exists to allow moving when cloning. If the sdk changes to do less than 1
|
||||
/// clone before first request, then this must be changed.
|
||||
Initial {
|
||||
inner: std::sync::Mutex<Option<tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>>>,
|
||||
len: usize,
|
||||
},
|
||||
/// The actually readable variant, produced by cloning the Initial variant.
|
||||
///
|
||||
/// The sdk currently always clones once, even without retry policy.
|
||||
Actual {
|
||||
#[pin]
|
||||
inner: tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>,
|
||||
len: usize,
|
||||
read_any: bool,
|
||||
},
|
||||
/// Most likely unneeded, but left to make life easier, in case more clones are added.
|
||||
Cloned {
|
||||
len_was: usize,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> NonSeekableStream<S>
|
||||
where
|
||||
S: Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
{
|
||||
fn new(inner: S, len: usize) -> NonSeekableStream<S> {
|
||||
use tokio_util::compat::TokioAsyncReadCompatExt;
|
||||
|
||||
let inner = tokio_util::io::StreamReader::new(inner).compat();
|
||||
let inner = Some(inner);
|
||||
let inner = std::sync::Mutex::new(inner);
|
||||
NonSeekableStream::Initial { inner, len }
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> std::fmt::Debug for NonSeekableStream<S> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Initial { len, .. } => f.debug_struct("Initial").field("len", len).finish(),
|
||||
Self::Actual { len, .. } => f.debug_struct("Actual").field("len", len).finish(),
|
||||
Self::Cloned { len_was, .. } => f.debug_struct("Cloned").field("len", len_was).finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> futures::io::AsyncRead for NonSeekableStream<S>
|
||||
where
|
||||
S: Stream<Item = std::io::Result<Bytes>>,
|
||||
{
|
||||
fn poll_read(
|
||||
self: std::pin::Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut [u8],
|
||||
) -> std::task::Poll<std::io::Result<usize>> {
|
||||
match self.project() {
|
||||
NonSeekableStreamProj::Actual {
|
||||
inner, read_any, ..
|
||||
} => {
|
||||
*read_any = true;
|
||||
inner.poll_read(cx, buf)
|
||||
}
|
||||
// NonSeekableStream::Initial does not support reading because it is just much easier
|
||||
// to have the mutex in place where one does not poll the contents, or that's how it
|
||||
// seemed originally. If there is a version upgrade which changes the cloning, then
|
||||
// that support needs to be hacked in.
|
||||
//
|
||||
// including {self:?} into the message would be useful, but unsure how to unproject.
|
||||
_ => std::task::Poll::Ready(Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"cloned or initial values cannot be read",
|
||||
))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> Clone for NonSeekableStream<S> {
|
||||
/// Weird clone implementation exists to support the sdk doing cloning before issuing the first
|
||||
/// request, see type documentation.
|
||||
fn clone(&self) -> Self {
|
||||
use NonSeekableStream::*;
|
||||
|
||||
match self {
|
||||
Initial { inner, len } => {
|
||||
if let Some(inner) = inner.lock().unwrap().take() {
|
||||
Actual {
|
||||
inner,
|
||||
len: *len,
|
||||
read_any: false,
|
||||
}
|
||||
} else {
|
||||
Self::Cloned { len_was: *len }
|
||||
}
|
||||
}
|
||||
Actual { len, .. } => Cloned { len_was: *len },
|
||||
Cloned { len_was } => Cloned { len_was: *len_was },
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<S> azure_core::SeekableStream for NonSeekableStream<S>
|
||||
where
|
||||
S: Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync + 'static,
|
||||
{
|
||||
async fn reset(&mut self) -> azure_core::error::Result<()> {
|
||||
use NonSeekableStream::*;
|
||||
|
||||
let msg = match self {
|
||||
Initial { inner, .. } => {
|
||||
if inner.get_mut().unwrap().is_some() {
|
||||
return Ok(());
|
||||
} else {
|
||||
"reset after first clone is not supported"
|
||||
}
|
||||
}
|
||||
Actual { read_any, .. } if !*read_any => return Ok(()),
|
||||
Actual { .. } => "reset after reading is not supported",
|
||||
Cloned { .. } => "reset after second clone is not supported",
|
||||
};
|
||||
Err(azure_core::error::Error::new(
|
||||
azure_core::error::ErrorKind::Io,
|
||||
std::io::Error::new(std::io::ErrorKind::Other, msg),
|
||||
))
|
||||
}
|
||||
|
||||
// Note: it is not documented if this should be the total or remaining length, total passes the
|
||||
// tests.
|
||||
fn len(&self) -> usize {
|
||||
use NonSeekableStream::*;
|
||||
match self {
|
||||
Initial { len, .. } => *len,
|
||||
Actual { len, .. } => *len,
|
||||
Cloned { len_was, .. } => *len_was,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,8 +19,10 @@ use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::A
|
||||
use anyhow::{bail, Context};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::{io, sync::Semaphore};
|
||||
use tokio::sync::Semaphore;
|
||||
use toml_edit::Item;
|
||||
use tracing::info;
|
||||
|
||||
@@ -179,7 +181,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
data_size_bytes: usize,
|
||||
@@ -206,7 +208,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
}
|
||||
|
||||
pub struct Download {
|
||||
pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
|
||||
pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
|
||||
/// Extra key-value data, associated with the current remote file.
|
||||
pub metadata: Option<StorageMetadata>,
|
||||
}
|
||||
@@ -300,7 +302,7 @@ impl GenericRemoteStorage {
|
||||
|
||||
pub async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
@@ -398,7 +400,7 @@ impl GenericRemoteStorage {
|
||||
/// this path is used for the remote object id conversion only.
|
||||
pub async fn upload_storage_object(
|
||||
&self,
|
||||
from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
) -> anyhow::Result<()> {
|
||||
|
||||
@@ -7,11 +7,14 @@
|
||||
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use bytes::Bytes;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::stream::Stream;
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||
};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
@@ -219,7 +222,7 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
@@ -252,9 +255,12 @@ impl RemoteStorage for LocalFs {
|
||||
);
|
||||
|
||||
let from_size_bytes = data_size_bytes as u64;
|
||||
let data = tokio_util::io::StreamReader::new(data);
|
||||
let data = std::pin::pin!(data);
|
||||
let mut buffer_to_read = data.take(from_size_bytes);
|
||||
|
||||
let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
|
||||
// alternatively we could just write the bytes to a file, but local_fs is a testing utility
|
||||
let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
@@ -308,7 +314,7 @@ impl RemoteStorage for LocalFs {
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let source = io::BufReader::new(
|
||||
let source = ReaderStream::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
@@ -348,16 +354,14 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let mut source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
let mut source = tokio::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
source
|
||||
.seek(io::SeekFrom::Start(start_inclusive))
|
||||
.await
|
||||
@@ -371,11 +375,13 @@ impl RemoteStorage for LocalFs {
|
||||
Ok(match end_exclusive {
|
||||
Some(end_exclusive) => Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
|
||||
download_stream: Box::pin(ReaderStream::new(
|
||||
source.take(end_exclusive - start_inclusive),
|
||||
)),
|
||||
},
|
||||
None => Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(source),
|
||||
download_stream: Box::pin(ReaderStream::new(source)),
|
||||
},
|
||||
})
|
||||
} else {
|
||||
@@ -475,7 +481,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
|
||||
mod fs_tests {
|
||||
use super::*;
|
||||
|
||||
use bytes::Bytes;
|
||||
use camino_tempfile::tempdir;
|
||||
use futures_util::Stream;
|
||||
use std::{collections::HashMap, io::Write};
|
||||
|
||||
async fn read_and_assert_remote_file_contents(
|
||||
@@ -485,7 +493,7 @@ mod fs_tests {
|
||||
remote_storage_path: &RemotePath,
|
||||
expected_metadata: Option<&StorageMetadata>,
|
||||
) -> anyhow::Result<String> {
|
||||
let mut download = storage
|
||||
let download = storage
|
||||
.download(remote_storage_path)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
|
||||
@@ -494,13 +502,9 @@ mod fs_tests {
|
||||
"Unexpected metadata returned for the downloaded file"
|
||||
);
|
||||
|
||||
let mut contents = String::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_string(&mut contents)
|
||||
.await
|
||||
.context("Failed to read remote file contents into string")?;
|
||||
Ok(contents)
|
||||
let contents = aggregate(download.download_stream).await?;
|
||||
|
||||
String::from_utf8(contents).map_err(anyhow::Error::new)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -529,25 +533,26 @@ mod fs_tests {
|
||||
let storage = create_storage()?;
|
||||
|
||||
let id = RemotePath::new(Utf8Path::new("dummy"))?;
|
||||
let content = std::io::Cursor::new(b"12345");
|
||||
let content = Bytes::from_static(b"12345");
|
||||
let content = move || futures::stream::once(futures::future::ready(Ok(content.clone())));
|
||||
|
||||
// Check that you get an error if the size parameter doesn't match the actual
|
||||
// size of the stream.
|
||||
storage
|
||||
.upload(Box::new(content.clone()), 0, &id, None)
|
||||
.upload(content(), 0, &id, None)
|
||||
.await
|
||||
.expect_err("upload with zero size succeeded");
|
||||
storage
|
||||
.upload(Box::new(content.clone()), 4, &id, None)
|
||||
.upload(content(), 4, &id, None)
|
||||
.await
|
||||
.expect_err("upload with too short size succeeded");
|
||||
storage
|
||||
.upload(Box::new(content.clone()), 6, &id, None)
|
||||
.upload(content(), 6, &id, None)
|
||||
.await
|
||||
.expect_err("upload with too large size succeeded");
|
||||
|
||||
// Correct size is 5, this should succeed.
|
||||
storage.upload(Box::new(content), 5, &id, None).await?;
|
||||
storage.upload(content(), 5, &id, None).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -595,7 +600,7 @@ mod fs_tests {
|
||||
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
|
||||
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
|
||||
|
||||
let mut first_part_download = storage
|
||||
let first_part_download = storage
|
||||
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
|
||||
.await?;
|
||||
assert!(
|
||||
@@ -603,21 +608,13 @@ mod fs_tests {
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
io::copy(
|
||||
&mut first_part_download.download_stream,
|
||||
&mut first_part_remote,
|
||||
)
|
||||
.await?;
|
||||
first_part_remote.flush().await?;
|
||||
let first_part_remote = first_part_remote.into_inner().into_inner();
|
||||
let first_part_remote = aggregate(first_part_download.download_stream).await?;
|
||||
assert_eq!(
|
||||
first_part_local,
|
||||
first_part_remote.as_slice(),
|
||||
first_part_local, first_part_remote,
|
||||
"First part bytes should be returned when requested"
|
||||
);
|
||||
|
||||
let mut second_part_download = storage
|
||||
let second_part_download = storage
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
first_part_local.len() as u64,
|
||||
@@ -629,17 +626,9 @@ mod fs_tests {
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
|
||||
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
io::copy(
|
||||
&mut second_part_download.download_stream,
|
||||
&mut second_part_remote,
|
||||
)
|
||||
.await?;
|
||||
second_part_remote.flush().await?;
|
||||
let second_part_remote = second_part_remote.into_inner().into_inner();
|
||||
let second_part_remote = aggregate(second_part_download.download_stream).await?;
|
||||
assert_eq!(
|
||||
second_part_local,
|
||||
second_part_remote.as_slice(),
|
||||
second_part_local, second_part_remote,
|
||||
"Second part bytes should be returned when requested"
|
||||
);
|
||||
|
||||
@@ -729,17 +718,10 @@ mod fs_tests {
|
||||
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
|
||||
let (first_part_local, _) = uploaded_bytes.split_at(3);
|
||||
|
||||
let mut partial_download_with_metadata = storage
|
||||
let partial_download_with_metadata = storage
|
||||
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
|
||||
.await?;
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
io::copy(
|
||||
&mut partial_download_with_metadata.download_stream,
|
||||
&mut first_part_remote,
|
||||
)
|
||||
.await?;
|
||||
first_part_remote.flush().await?;
|
||||
let first_part_remote = first_part_remote.into_inner().into_inner();
|
||||
let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
|
||||
assert_eq!(
|
||||
first_part_local,
|
||||
first_part_remote.as_slice(),
|
||||
@@ -815,16 +797,16 @@ mod fs_tests {
|
||||
)
|
||||
})?;
|
||||
|
||||
storage
|
||||
.upload(Box::new(file), size, &relative_path, metadata)
|
||||
.await?;
|
||||
let file = tokio_util::io::ReaderStream::new(file);
|
||||
|
||||
storage.upload(file, size, &relative_path, metadata).await?;
|
||||
Ok(relative_path)
|
||||
}
|
||||
|
||||
async fn create_file_for_upload(
|
||||
path: &Utf8Path,
|
||||
contents: &str,
|
||||
) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
|
||||
) -> anyhow::Result<(fs::File, usize)> {
|
||||
std::fs::create_dir_all(path.parent().unwrap())?;
|
||||
let mut file_for_writing = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
@@ -834,7 +816,7 @@ mod fs_tests {
|
||||
drop(file_for_writing);
|
||||
let file_size = path.metadata()?.len() as usize;
|
||||
Ok((
|
||||
io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
|
||||
fs::OpenOptions::new().read(true).open(&path).await?,
|
||||
file_size,
|
||||
))
|
||||
}
|
||||
@@ -848,4 +830,16 @@ mod fs_tests {
|
||||
files.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
async fn aggregate(
|
||||
stream: impl Stream<Item = std::io::Result<Bytes>>,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
use futures::stream::StreamExt;
|
||||
let mut out = Vec::new();
|
||||
let mut stream = std::pin::pin!(stream);
|
||||
while let Some(res) = stream.next().await {
|
||||
out.extend_from_slice(&res?[..]);
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,9 +4,14 @@
|
||||
//! allowing multiple api users to independently work with the same S3 bucket, if
|
||||
//! their bucket prefixes are both specified and different.
|
||||
|
||||
use std::{borrow::Cow, sync::Arc};
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
task::{Context, Poll},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::Context as _;
|
||||
use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider,
|
||||
@@ -28,11 +33,10 @@ use aws_smithy_async::rt::sleep::TokioSleep;
|
||||
|
||||
use aws_smithy_types::body::SdkBody;
|
||||
use aws_smithy_types::byte_stream::ByteStream;
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use hyper::Body;
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio::io::{self, AsyncRead};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
@@ -63,7 +67,7 @@ struct GetObjectRequest {
|
||||
impl S3Bucket {
|
||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
||||
debug!(
|
||||
tracing::debug!(
|
||||
"Creating s3 remote storage for S3 bucket {}",
|
||||
aws_config.bucket_name
|
||||
);
|
||||
@@ -225,12 +229,15 @@ impl S3Bucket {
|
||||
match get_object {
|
||||
Ok(object_output) => {
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
|
||||
let body = object_output.body;
|
||||
let body = ByteStreamAsStream::from(body);
|
||||
let body = PermitCarrying::new(permit, body);
|
||||
let body = TimedDownload::new(started_at, body);
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
|
||||
started_at,
|
||||
RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
|
||||
))),
|
||||
download_stream: Box::pin(body),
|
||||
})
|
||||
}
|
||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
|
||||
@@ -243,29 +250,55 @@ impl S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
struct ByteStreamAsStream {
|
||||
#[pin]
|
||||
inner: aws_smithy_types::byte_stream::ByteStream
|
||||
}
|
||||
}
|
||||
|
||||
impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
|
||||
fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
|
||||
ByteStreamAsStream { inner }
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for ByteStreamAsStream {
|
||||
type Item = std::io::Result<Bytes>;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
// this does the std::io::ErrorKind::Other conversion
|
||||
self.project().inner.poll_next(cx).map_err(|x| x.into())
|
||||
}
|
||||
|
||||
// cannot implement size_hint because inner.size_hint is remaining size in bytes, which makes
|
||||
// sense and Stream::size_hint does not really
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
/// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
|
||||
struct RatelimitedAsyncRead<S> {
|
||||
struct PermitCarrying<S> {
|
||||
permit: tokio::sync::OwnedSemaphorePermit,
|
||||
#[pin]
|
||||
inner: S,
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead> RatelimitedAsyncRead<S> {
|
||||
impl<S> PermitCarrying<S> {
|
||||
fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
|
||||
RatelimitedAsyncRead { permit, inner }
|
||||
Self { permit, inner }
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
|
||||
fn poll_read(
|
||||
self: std::pin::Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut io::ReadBuf<'_>,
|
||||
) -> std::task::Poll<std::io::Result<()>> {
|
||||
let this = self.project();
|
||||
this.inner.poll_read(cx, buf)
|
||||
impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
|
||||
type Item = <S as Stream>::Item;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
self.project().inner.poll_next(cx)
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.inner.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -285,7 +318,7 @@ pin_project_lite::pin_project! {
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead> TimedDownload<S> {
|
||||
impl<S> TimedDownload<S> {
|
||||
fn new(started_at: std::time::Instant, inner: S) -> Self {
|
||||
TimedDownload {
|
||||
started_at,
|
||||
@@ -295,25 +328,26 @@ impl<S: AsyncRead> TimedDownload<S> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
|
||||
fn poll_read(
|
||||
self: std::pin::Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut io::ReadBuf<'_>,
|
||||
) -> std::task::Poll<std::io::Result<()>> {
|
||||
impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
|
||||
type Item = <S as Stream>::Item;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
use std::task::ready;
|
||||
|
||||
let this = self.project();
|
||||
let before = buf.filled().len();
|
||||
let read = std::task::ready!(this.inner.poll_read(cx, buf));
|
||||
|
||||
let read_eof = buf.filled().len() == before;
|
||||
|
||||
match read {
|
||||
Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
|
||||
Ok(()) => { /* still in progress */ }
|
||||
Err(_) => *this.outcome = AttemptOutcome::Err,
|
||||
let res = ready!(this.inner.poll_next(cx));
|
||||
match &res {
|
||||
Some(Ok(_)) => {}
|
||||
Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
|
||||
None => *this.outcome = metrics::AttemptOutcome::Ok,
|
||||
}
|
||||
|
||||
std::task::Poll::Ready(read)
|
||||
Poll::Ready(res)
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.inner.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -403,7 +437,7 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
@@ -413,7 +447,7 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let body = Body::wrap_stream(ReaderStream::new(from));
|
||||
let body = Body::wrap_stream(from);
|
||||
let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
|
||||
|
||||
let res = self
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
//! This module provides a wrapper around a real RemoteStorage implementation that
|
||||
//! causes the first N attempts at each upload or download operatio to fail. For
|
||||
//! testing purposes.
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Mutex;
|
||||
@@ -108,7 +110,7 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
data_size_bytes: usize,
|
||||
|
||||
@@ -7,7 +7,9 @@ use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::stream::Stream;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{
|
||||
AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
@@ -180,23 +182,14 @@ async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Resu
|
||||
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let data1 = "remote blob data1".as_bytes();
|
||||
let data1_len = data1.len();
|
||||
let data2 = "remote blob data2".as_bytes();
|
||||
let data2_len = data2.len();
|
||||
let data3 = "remote blob data3".as_bytes();
|
||||
let data3_len = data3.len();
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
|
||||
.await?;
|
||||
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path1, None).await?;
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
|
||||
.await?;
|
||||
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None).await?;
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data3), data3_len, &path3, None)
|
||||
.await?;
|
||||
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path3, None).await?;
|
||||
|
||||
ctx.client.delete_objects(&[path1, path2]).await?;
|
||||
|
||||
@@ -219,53 +212,56 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
|
||||
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let data = "remote blob data here".as_bytes();
|
||||
let data_len = data.len() as u64;
|
||||
let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data), data.len(), &path, None)
|
||||
.await?;
|
||||
let (data, len) = wrap_stream(orig.clone());
|
||||
|
||||
async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
|
||||
ctx.client.upload(data, len, &path, None).await?;
|
||||
|
||||
async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
|
||||
let mut buf = Vec::new();
|
||||
tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
|
||||
tokio::io::copy_buf(
|
||||
&mut tokio_util::io::StreamReader::new(dl.download_stream),
|
||||
&mut buf,
|
||||
)
|
||||
.await?;
|
||||
Ok(buf)
|
||||
}
|
||||
// Normal download request
|
||||
let dl = ctx.client.download(&path).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data);
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// Full range (end specified)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 0, Some(data_len))
|
||||
.download_byte_range(&path, 0, Some(len as u64))
|
||||
.await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data);
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// partial range (end specified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data[4..10]);
|
||||
assert_eq!(&buf, &orig[4..10]);
|
||||
|
||||
// partial range (end beyond real end)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 8, Some(data_len * 100))
|
||||
.download_byte_range(&path, 8, Some(len as u64 * 100))
|
||||
.await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data[8..]);
|
||||
assert_eq!(&buf, &orig[8..]);
|
||||
|
||||
// Partial range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data[4..]);
|
||||
assert_eq!(&buf, &orig[4..]);
|
||||
|
||||
// Full range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data);
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
debug!("Cleanup: deleting file at path {path:?}");
|
||||
ctx.client
|
||||
@@ -504,11 +500,8 @@ async fn upload_azure_data(
|
||||
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
|
||||
.await?;
|
||||
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
@@ -589,11 +582,8 @@ async fn upload_simple_azure_data(
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
|
||||
.await?;
|
||||
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
@@ -622,3 +612,32 @@ async fn upload_simple_azure_data(
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: copypasted from test_real_s3, can't remember how to share a module which is not compiled
|
||||
// to binary
|
||||
fn upload_stream(
|
||||
content: std::borrow::Cow<'static, [u8]>,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
use std::borrow::Cow;
|
||||
|
||||
let content = match content {
|
||||
Cow::Borrowed(x) => Bytes::from_static(x),
|
||||
Cow::Owned(vec) => Bytes::from(vec),
|
||||
};
|
||||
wrap_stream(content)
|
||||
}
|
||||
|
||||
fn wrap_stream(
|
||||
content: bytes::Bytes,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
let len = content.len();
|
||||
let content = futures::future::ready(Ok(content));
|
||||
|
||||
(futures::stream::once(content), len)
|
||||
}
|
||||
|
||||
@@ -7,7 +7,9 @@ use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::stream::Stream;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
@@ -176,23 +178,14 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
|
||||
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let data1 = "remote blob data1".as_bytes();
|
||||
let data1_len = data1.len();
|
||||
let data2 = "remote blob data2".as_bytes();
|
||||
let data2_len = data2.len();
|
||||
let data3 = "remote blob data3".as_bytes();
|
||||
let data3_len = data3.len();
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
|
||||
.await?;
|
||||
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path1, None).await?;
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
|
||||
.await?;
|
||||
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None).await?;
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data3), data3_len, &path3, None)
|
||||
.await?;
|
||||
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path3, None).await?;
|
||||
|
||||
ctx.client.delete_objects(&[path1, path2]).await?;
|
||||
|
||||
@@ -432,11 +425,9 @@ async fn upload_s3_data(
|
||||
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
|
||||
.await?;
|
||||
let (data, data_len) =
|
||||
upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, data_len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
@@ -517,11 +508,9 @@ async fn upload_simple_s3_data(
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
|
||||
.await?;
|
||||
let (data, data_len) =
|
||||
upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, data_len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
@@ -550,3 +539,30 @@ async fn upload_simple_s3_data(
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
|
||||
fn upload_stream(
|
||||
content: std::borrow::Cow<'static, [u8]>,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
use std::borrow::Cow;
|
||||
|
||||
let content = match content {
|
||||
Cow::Borrowed(x) => Bytes::from_static(x),
|
||||
Cow::Owned(vec) => Bytes::from(vec),
|
||||
};
|
||||
wrap_stream(content)
|
||||
}
|
||||
|
||||
fn wrap_stream(
|
||||
content: bytes::Bytes,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
let len = content.len();
|
||||
let content = futures::future::ready(Ok(content));
|
||||
|
||||
(futures::stream::once(content), len)
|
||||
}
|
||||
|
||||
@@ -2,9 +2,8 @@
|
||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||
//! a neon Timeline.
|
||||
//!
|
||||
use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::pin::Pin;
|
||||
use std::task::{self, Poll};
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use async_compression::tokio::bufread::ZstdDecoder;
|
||||
@@ -13,7 +12,8 @@ use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::StreamExt;
|
||||
use nix::NixPath;
|
||||
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
|
||||
use tokio::fs::{File, OpenOptions};
|
||||
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
|
||||
use tokio_tar::Archive;
|
||||
use tokio_tar::Builder;
|
||||
use tokio_tar::HeaderMode;
|
||||
@@ -629,70 +629,16 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
|
||||
Ok(Bytes::from(buf))
|
||||
}
|
||||
|
||||
/// An in-memory buffer implementing `AsyncWrite`, inserting yields every now and then
|
||||
///
|
||||
/// The number of yields is bounded by above by the number of times poll_write is called,
|
||||
/// so calling it with 8 KB chunks and 8 MB chunks gives the same number of yields in total.
|
||||
/// This is an explicit choice as the `YieldingVec` is meant to give the async executor
|
||||
/// breathing room between units of CPU intensive preparation of buffers to be written.
|
||||
/// Once a write call is issued, the whole buffer has been prepared already, so there is no
|
||||
/// gain in splitting up the memcopy further.
|
||||
struct YieldingVec {
|
||||
yield_budget: usize,
|
||||
// the buffer written into
|
||||
buf: Vec<u8>,
|
||||
}
|
||||
pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
|
||||
let file = OpenOptions::new()
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(&tmp_path)
|
||||
.await
|
||||
.with_context(|| format!("tempfile creation {tmp_path}"))?;
|
||||
|
||||
impl YieldingVec {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
yield_budget: 0,
|
||||
buf: Vec::new(),
|
||||
}
|
||||
}
|
||||
// Whether we should yield for a read operation of given size
|
||||
fn should_yield(&mut self, add_buf_len: usize) -> bool {
|
||||
// Set this limit to a small value so that we are a
|
||||
// good async citizen and yield repeatedly (but not
|
||||
// too often for many small writes to cause many yields)
|
||||
const YIELD_DIST: usize = 1024;
|
||||
|
||||
let target_buf_len = self.buf.len() + add_buf_len;
|
||||
let ret = self.yield_budget / YIELD_DIST < target_buf_len / YIELD_DIST;
|
||||
if self.yield_budget < target_buf_len {
|
||||
self.yield_budget += add_buf_len;
|
||||
}
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncWrite for YieldingVec {
|
||||
fn poll_write(
|
||||
mut self: Pin<&mut Self>,
|
||||
cx: &mut task::Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<std::io::Result<usize>> {
|
||||
if self.should_yield(buf.len()) {
|
||||
cx.waker().wake_by_ref();
|
||||
return Poll::Pending;
|
||||
}
|
||||
self.get_mut().buf.extend_from_slice(buf);
|
||||
Poll::Ready(Ok(buf.len()))
|
||||
}
|
||||
|
||||
fn poll_flush(self: Pin<&mut Self>, _cx: &mut task::Context<'_>) -> Poll<std::io::Result<()>> {
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
|
||||
fn poll_shutdown(
|
||||
self: Pin<&mut Self>,
|
||||
_cx: &mut task::Context<'_>,
|
||||
) -> Poll<std::io::Result<()>> {
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
|
||||
let mut paths = Vec::new();
|
||||
for entry in WalkDir::new(pgdata_path) {
|
||||
let entry = entry?;
|
||||
@@ -707,7 +653,7 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
|
||||
// Do a sort to get a more consistent listing
|
||||
paths.sort_unstable();
|
||||
let zstd = ZstdEncoder::with_quality_and_params(
|
||||
YieldingVec::new(),
|
||||
file,
|
||||
Level::Default,
|
||||
&[CParameter::enable_long_distance_matching(true)],
|
||||
);
|
||||
@@ -725,13 +671,14 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
|
||||
}
|
||||
let mut zstd = builder.into_inner().await?;
|
||||
zstd.shutdown().await?;
|
||||
let compressed = zstd.into_inner();
|
||||
let compressed_len = compressed.buf.len();
|
||||
const INITDB_TAR_ZST_WARN_LIMIT: usize = 2_000_000;
|
||||
let mut compressed = zstd.into_inner();
|
||||
let compressed_len = compressed.metadata().await?.len();
|
||||
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
|
||||
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
|
||||
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
|
||||
}
|
||||
Ok(compressed.buf)
|
||||
compressed.seek(SeekFrom::Start(0)).await?;
|
||||
Ok((compressed, compressed_len))
|
||||
}
|
||||
|
||||
pub async fn extract_tar_zst(
|
||||
|
||||
@@ -774,6 +774,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
)]
|
||||
pub(crate) enum StorageIoOperation {
|
||||
Open,
|
||||
OpenAfterReplace,
|
||||
Close,
|
||||
CloseByReplace,
|
||||
Read,
|
||||
@@ -787,6 +788,7 @@ impl StorageIoOperation {
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
StorageIoOperation::Open => "open",
|
||||
StorageIoOperation::OpenAfterReplace => "open-after-replace",
|
||||
StorageIoOperation::Close => "close",
|
||||
StorageIoOperation::CloseByReplace => "close-by-replace",
|
||||
StorageIoOperation::Read => "read",
|
||||
@@ -840,7 +842,6 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Debug)]
|
||||
struct GlobalAndPerTimelineHistogram {
|
||||
global: Histogram,
|
||||
@@ -1167,6 +1168,30 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) struct WalIngestMetrics {
|
||||
pub(crate) records_received: IntCounter,
|
||||
pub(crate) records_committed: IntCounter,
|
||||
pub(crate) records_filtered: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
|
||||
records_received: register_int_counter!(
|
||||
"pageserver_wal_ingest_records_received",
|
||||
"Number of WAL records received from safekeepers"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
records_committed: register_int_counter!(
|
||||
"pageserver_wal_ingest_records_committed",
|
||||
"Number of WAL records which resulted in writes to pageserver storage"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
records_filtered: register_int_counter!(
|
||||
"pageserver_wal_ingest_records_filtered",
|
||||
"Number of WAL records filtered out due to sharding"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum RemoteOpKind {
|
||||
Upload,
|
||||
|
||||
@@ -1368,6 +1368,10 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn is_empty(&self) -> bool {
|
||||
self.pending_updates.is_empty() && self.pending_deletions.is_empty()
|
||||
}
|
||||
|
||||
// Internal helper functions to batch the modifications
|
||||
|
||||
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
//!
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use bytes::Bytes;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use enumset::EnumSet;
|
||||
use futures::stream::FuturesUnordered;
|
||||
@@ -69,6 +68,7 @@ use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::metadata::load_metadata;
|
||||
pub use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
|
||||
use crate::tenant::remote_timeline_client::INITDB_PATH;
|
||||
use crate::tenant::storage_layer::DeltaLayer;
|
||||
use crate::tenant::storage_layer::ImageLayer;
|
||||
use crate::InitializationOrder;
|
||||
@@ -2949,10 +2949,10 @@ impl Tenant {
|
||||
};
|
||||
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
|
||||
// temporary directory for basebackup files for the given timeline.
|
||||
|
||||
let timelines_path = self.conf.timelines_path(&self.tenant_shard_id);
|
||||
let pgdata_path = path_with_suffix_extension(
|
||||
self.conf
|
||||
.timelines_path(&self.tenant_shard_id)
|
||||
.join(format!("basebackup-{timeline_id}")),
|
||||
timelines_path.join(format!("basebackup-{timeline_id}")),
|
||||
TEMP_FILE_SUFFIX,
|
||||
);
|
||||
|
||||
@@ -2983,31 +2983,43 @@ impl Tenant {
|
||||
)
|
||||
.await
|
||||
.context("download initdb tar")?;
|
||||
let buf_read = Box::pin(BufReader::new(initdb_tar_zst));
|
||||
let buf_read =
|
||||
BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
|
||||
import_datadir::extract_tar_zst(&pgdata_path, buf_read)
|
||||
.await
|
||||
.context("extract initdb tar")?;
|
||||
|
||||
if initdb_tar_zst_path.exists() {
|
||||
tokio::fs::remove_file(&initdb_tar_zst_path)
|
||||
.await
|
||||
.context("tempfile removal")?;
|
||||
}
|
||||
tokio::fs::remove_file(&initdb_tar_zst_path)
|
||||
.await
|
||||
.or_else(|e| {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
// If something else already removed the file, ignore the error
|
||||
Ok(())
|
||||
} else {
|
||||
Err(e)
|
||||
}
|
||||
})
|
||||
.with_context(|| format!("tempfile removal {initdb_tar_zst_path}"))?;
|
||||
} else {
|
||||
// Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path
|
||||
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
|
||||
|
||||
// Upload the created data dir to S3
|
||||
if let Some(storage) = &self.remote_storage {
|
||||
let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?;
|
||||
let pgdata_zstd = Bytes::from(pgdata_zstd);
|
||||
let temp_path = timelines_path.join(format!(
|
||||
"{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
|
||||
));
|
||||
|
||||
let (pgdata_zstd, tar_zst_size) =
|
||||
import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?;
|
||||
backoff::retry(
|
||||
|| async {
|
||||
self::remote_timeline_client::upload_initdb_dir(
|
||||
storage,
|
||||
&self.tenant_shard_id.tenant_id,
|
||||
&timeline_id,
|
||||
pgdata_zstd.clone(),
|
||||
pgdata_zstd.try_clone().await?,
|
||||
tar_zst_size,
|
||||
)
|
||||
.await
|
||||
},
|
||||
@@ -3019,6 +3031,18 @@ impl Tenant {
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await?;
|
||||
|
||||
tokio::fs::remove_file(&temp_path)
|
||||
.await
|
||||
.or_else(|e| {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
// If something else already removed the file, ignore the error
|
||||
Ok(())
|
||||
} else {
|
||||
Err(e)
|
||||
}
|
||||
})
|
||||
.with_context(|| format!("tempfile removal {temp_path}"))?;
|
||||
}
|
||||
}
|
||||
let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
|
||||
|
||||
@@ -77,8 +77,10 @@ async fn create_remote_delete_mark(
|
||||
let data: &[u8] = &[];
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let data = bytes::Bytes::from_static(data);
|
||||
let stream = futures::stream::once(futures::future::ready(Ok(data)));
|
||||
remote_storage
|
||||
.upload(data, 0, &remote_mark_path, None)
|
||||
.upload(stream, 0, &remote_mark_path, None)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
|
||||
@@ -254,6 +254,9 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
|
||||
pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
|
||||
|
||||
/// Default buffer size when interfacing with [`tokio::fs::File`].
|
||||
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
|
||||
|
||||
pub enum MaybeDeletedIndexPart {
|
||||
IndexPart(IndexPart),
|
||||
Deleted(IndexPart),
|
||||
|
||||
@@ -75,12 +75,11 @@ pub async fn download_layer_file<'a>(
|
||||
|
||||
let (mut destination_file, bytes_amount) = download_retry(
|
||||
|| async {
|
||||
// TODO: this doesn't use the cached fd for some reason?
|
||||
let mut destination_file = fs::File::create(&temp_file_path)
|
||||
let destination_file = tokio::fs::File::create(&temp_file_path)
|
||||
.await
|
||||
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
let mut download = storage
|
||||
let download = storage
|
||||
.download(&remote_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
@@ -90,9 +89,14 @@ pub async fn download_layer_file<'a>(
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut destination_file =
|
||||
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
|
||||
|
||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
let bytes_amount = tokio::time::timeout(
|
||||
MAX_DOWNLOAD_DURATION,
|
||||
tokio::io::copy(&mut download.download_stream, &mut destination_file),
|
||||
tokio::io::copy_buf(&mut reader, &mut destination_file),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
|
||||
@@ -103,6 +107,8 @@ pub async fn download_layer_file<'a>(
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let destination_file = destination_file.into_inner();
|
||||
|
||||
Ok((destination_file, bytes_amount))
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
@@ -220,20 +226,22 @@ async fn do_download_index_part(
|
||||
index_generation: Generation,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
use futures::stream::StreamExt;
|
||||
|
||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
|
||||
|
||||
let index_part_bytes = download_retry_forever(
|
||||
|| async {
|
||||
let mut index_part_download = storage.download(&remote_path).await?;
|
||||
let index_part_download = storage.download(&remote_path).await?;
|
||||
|
||||
let mut index_part_bytes = Vec::new();
|
||||
tokio::io::copy(
|
||||
&mut index_part_download.download_stream,
|
||||
&mut index_part_bytes,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("download index part at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
let mut stream = std::pin::pin!(index_part_download.download_stream);
|
||||
while let Some(chunk) = stream.next().await {
|
||||
let chunk = chunk
|
||||
.with_context(|| format!("download index part at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
index_part_bytes.extend_from_slice(&chunk[..]);
|
||||
}
|
||||
Ok(index_part_bytes)
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
@@ -394,11 +402,13 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
.with_context(|| format!("timeline dir creation {timeline_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
}
|
||||
let temp_path = timeline_path.join(format!("{INITDB_PATH}-{timeline_id}.{TEMP_FILE_SUFFIX}"));
|
||||
let temp_path = timeline_path.join(format!(
|
||||
"{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
|
||||
));
|
||||
|
||||
let file = download_retry(
|
||||
|| async {
|
||||
let mut file = OpenOptions::new()
|
||||
let file = OpenOptions::new()
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.read(true)
|
||||
@@ -408,13 +418,17 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
.with_context(|| format!("tempfile creation {temp_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut download = storage.download(&remote_path).await?;
|
||||
let download = storage.download(&remote_path).await?;
|
||||
let mut download = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
|
||||
|
||||
tokio::io::copy(&mut download.download_stream, &mut file)
|
||||
tokio::io::copy_buf(&mut download, &mut writer)
|
||||
.await
|
||||
.with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut file = writer.into_inner();
|
||||
|
||||
file.seek(std::io::SeekFrom::Start(0))
|
||||
.await
|
||||
.with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
|
||||
@@ -426,10 +440,10 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
if temp_path.exists() {
|
||||
// Do a best-effort attempt at deleting the temporary file upon encountering an error.
|
||||
// We don't have async here nor do we want to pile on any extra errors.
|
||||
if let Err(e) = std::fs::remove_file(&temp_path) {
|
||||
// Do a best-effort attempt at deleting the temporary file upon encountering an error.
|
||||
// We don't have async here nor do we want to pile on any extra errors.
|
||||
if let Err(e) = std::fs::remove_file(&temp_path) {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
warn!("error deleting temporary file {temp_path}: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
//! Helper functions to upload files to remote storage with a RemoteStorage
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use fail::fail_point;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::io::ErrorKind;
|
||||
use tokio::fs;
|
||||
use tokio::fs::{self, File};
|
||||
|
||||
use super::Generation;
|
||||
use crate::{
|
||||
@@ -41,11 +40,15 @@ pub(super) async fn upload_index_part<'a>(
|
||||
.to_s3_bytes()
|
||||
.context("serialize index part file into bytes")?;
|
||||
let index_part_size = index_part_bytes.len();
|
||||
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
|
||||
let index_part_bytes = bytes::Bytes::from(index_part_bytes);
|
||||
|
||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
|
||||
storage
|
||||
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
|
||||
.upload_storage_object(
|
||||
futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
|
||||
index_part_size,
|
||||
&remote_path,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
|
||||
}
|
||||
@@ -101,8 +104,10 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
let fs_size = usize::try_from(fs_size)
|
||||
.with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
|
||||
|
||||
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
|
||||
|
||||
storage
|
||||
.upload(source_file, fs_size, &storage_path, None)
|
||||
.upload(reader, fs_size, &storage_path, None)
|
||||
.await
|
||||
.with_context(|| format!("upload layer from local path '{source_path}'"))?;
|
||||
|
||||
@@ -114,16 +119,16 @@ pub(crate) async fn upload_initdb_dir(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
initdb_dir: Bytes,
|
||||
initdb_tar_zst: File,
|
||||
size: u64,
|
||||
) -> anyhow::Result<()> {
|
||||
tracing::trace!("uploading initdb dir");
|
||||
|
||||
let size = initdb_dir.len();
|
||||
let bytes = tokio::io::BufReader::new(std::io::Cursor::new(initdb_dir));
|
||||
let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
|
||||
|
||||
let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
|
||||
storage
|
||||
.upload_storage_object(bytes, size, &remote_path)
|
||||
.upload_storage_object(file, size as usize, &remote_path)
|
||||
.await
|
||||
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
@@ -2524,7 +2524,7 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn finish_write(&self, new_lsn: Lsn) {
|
||||
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
|
||||
assert!(new_lsn.is_aligned());
|
||||
|
||||
self.metrics.last_record_gauge.set(new_lsn.0 as i64);
|
||||
|
||||
@@ -288,6 +288,9 @@ impl VirtualFile {
|
||||
}
|
||||
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
|
||||
|
||||
// NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
|
||||
// where our caller doesn't get to use the returned VirtualFile before its
|
||||
// slot gets re-used by someone else.
|
||||
let file = STORAGE_IO_TIME_METRIC
|
||||
.get(StorageIoOperation::Open)
|
||||
.observe_closure_duration(|| open_options.open(path))?;
|
||||
@@ -311,6 +314,9 @@ impl VirtualFile {
|
||||
timeline_id,
|
||||
};
|
||||
|
||||
// TODO: Under pressure, it's likely the slot will get re-used and
|
||||
// the underlying file closed before they get around to using it.
|
||||
// => https://github.com/neondatabase/neon/issues/6065
|
||||
slot_guard.file.replace(file);
|
||||
|
||||
Ok(vfile)
|
||||
@@ -421,9 +427,12 @@ impl VirtualFile {
|
||||
// now locked in write-mode. Find a free slot to put it in.
|
||||
let (handle, mut slot_guard) = open_files.find_victim_slot();
|
||||
|
||||
// Open the physical file
|
||||
// Re-open the physical file.
|
||||
// NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this
|
||||
// case from StorageIoOperation::Open. This helps with identifying thrashing
|
||||
// of the virtual file descriptor cache.
|
||||
let file = STORAGE_IO_TIME_METRIC
|
||||
.get(StorageIoOperation::Open)
|
||||
.get(StorageIoOperation::OpenAfterReplace)
|
||||
.observe_closure_duration(|| self.open_options.open(&self.path))?;
|
||||
|
||||
// Perform the requested operation on it
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
//! redo Postgres process, but some records it can handle directly with
|
||||
//! bespoken Rust code.
|
||||
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
|
||||
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
|
||||
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
|
||||
@@ -30,6 +31,7 @@ use bytes::{Buf, Bytes, BytesMut};
|
||||
use tracing::*;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Timeline;
|
||||
@@ -46,6 +48,7 @@ use postgres_ffi::BLCKSZ;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub struct WalIngest<'a> {
|
||||
shard: ShardIdentity,
|
||||
timeline: &'a Timeline,
|
||||
|
||||
checkpoint: CheckPoint,
|
||||
@@ -65,6 +68,7 @@ impl<'a> WalIngest<'a> {
|
||||
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
|
||||
|
||||
Ok(WalIngest {
|
||||
shard: *timeline.get_shard_identity(),
|
||||
timeline,
|
||||
checkpoint,
|
||||
checkpoint_modified: false,
|
||||
@@ -87,6 +91,8 @@ impl<'a> WalIngest<'a> {
|
||||
decoded: &mut DecodedWALRecord,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
WAL_INGEST.records_received.inc();
|
||||
|
||||
modification.lsn = lsn;
|
||||
decode_wal_record(recdata, decoded, self.timeline.pg_version)?;
|
||||
|
||||
@@ -355,6 +361,33 @@ impl<'a> WalIngest<'a> {
|
||||
// Iterate through all the blocks that the record modifies, and
|
||||
// "put" a separate copy of the record for each block.
|
||||
for blk in decoded.blocks.iter() {
|
||||
let rel = RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum,
|
||||
};
|
||||
|
||||
let key = rel_block_to_key(rel, blk.blkno);
|
||||
let key_is_local = self.shard.is_key_local(&key);
|
||||
|
||||
tracing::debug!(
|
||||
lsn=%lsn,
|
||||
key=%key,
|
||||
"ingest: shard decision {} (checkpoint={})",
|
||||
if !key_is_local { "drop" } else { "keep" },
|
||||
self.checkpoint_modified
|
||||
);
|
||||
|
||||
if !key_is_local {
|
||||
if self.shard.is_zero() {
|
||||
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
|
||||
// its blkno in case it implicitly extends a relation.
|
||||
self.observe_decoded_block(modification, blk, ctx).await?;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
self.ingest_decoded_block(modification, lsn, decoded, blk, ctx)
|
||||
.await?;
|
||||
}
|
||||
@@ -367,13 +400,38 @@ impl<'a> WalIngest<'a> {
|
||||
self.checkpoint_modified = false;
|
||||
}
|
||||
|
||||
if modification.is_empty() {
|
||||
tracing::debug!("ingest: filtered out record @ LSN {lsn}");
|
||||
WAL_INGEST.records_filtered.inc();
|
||||
modification.tline.finish_write(lsn);
|
||||
} else {
|
||||
WAL_INGEST.records_committed.inc();
|
||||
modification.commit(ctx).await?;
|
||||
}
|
||||
|
||||
// Now that this record has been fully handled, including updating the
|
||||
// checkpoint data, let the repository know that it is up-to-date to this LSN
|
||||
modification.commit(ctx).await?;
|
||||
// checkpoint data, let the repository know that it is up-to-date to this LSN.
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Do not store this block, but observe it for the purposes of updating our relation size state.
|
||||
async fn observe_decoded_block(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
blk: &DecodedBkpBlock,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), PageReconstructError> {
|
||||
let rel = RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum,
|
||||
};
|
||||
self.handle_rel_extend(modification, rel, blk.blkno, ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn ingest_decoded_block(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
@@ -1465,8 +1523,15 @@ impl<'a> WalIngest<'a> {
|
||||
//info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
|
||||
modification.put_rel_extend(rel, new_nblocks, ctx).await?;
|
||||
|
||||
let mut key = rel_block_to_key(rel, blknum);
|
||||
// fill the gap with zeros
|
||||
for gap_blknum in old_nblocks..blknum {
|
||||
key.field6 = gap_blknum;
|
||||
|
||||
if self.shard.get_shard_number(&key) != self.shard.number {
|
||||
continue;
|
||||
}
|
||||
|
||||
modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@ OBJS = \
|
||||
libpagestore.o \
|
||||
neon.o \
|
||||
neon_utils.o \
|
||||
neon_walreader.o \
|
||||
pagestore_smgr.o \
|
||||
relsize_cache.o \
|
||||
walproposer.o \
|
||||
|
||||
@@ -576,11 +576,10 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
* If cache is full then evict some other page.
|
||||
*/
|
||||
void
|
||||
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
char *buffer)
|
||||
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer)
|
||||
#else
|
||||
const void *buffer)
|
||||
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer)
|
||||
#endif
|
||||
{
|
||||
BufferTag tag;
|
||||
@@ -626,13 +625,19 @@ void
|
||||
{
|
||||
/*
|
||||
* We have two choices if all cache pages are pinned (i.e. used in IO
|
||||
* operations): 1. Wait until some of this operation is completed and
|
||||
* pages is unpinned 2. Allocate one more chunk, so that specified
|
||||
* cache size is more recommendation than hard limit. As far as
|
||||
* probability of such event (that all pages are pinned) is considered
|
||||
* to be very very small: there are should be very large number of
|
||||
* concurrent IO operations and them are limited by max_connections,
|
||||
* we prefer not to complicate code and use second approach.
|
||||
* operations):
|
||||
*
|
||||
* 1) Wait until some of this operation is completed and pages is
|
||||
* unpinned.
|
||||
*
|
||||
* 2) Allocate one more chunk, so that specified cache size is more
|
||||
* recommendation than hard limit.
|
||||
*
|
||||
* As far as probability of such event (that all pages are pinned) is
|
||||
* considered to be very very small: there are should be very large
|
||||
* number of concurrent IO operations and them are limited by
|
||||
* max_connections, we prefer not to complicate code and use second
|
||||
* approach.
|
||||
*/
|
||||
if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
|
||||
{
|
||||
|
||||
@@ -1,96 +0,0 @@
|
||||
/*
|
||||
* Interface to set of libpq wrappers walproposer and neon_walreader need.
|
||||
* Similar to libpqwalreceiver, but it has blocking connection establishment and
|
||||
* pqexec which don't fit us. Implementation is at walproposer_pg.c.
|
||||
*/
|
||||
#ifndef ___LIBPQWALPROPOSER_H__
|
||||
#define ___LIBPQWALPROPOSER_H__
|
||||
|
||||
/* Re-exported and modified ExecStatusType */
|
||||
typedef enum
|
||||
{
|
||||
/* We received a single CopyBoth result */
|
||||
WP_EXEC_SUCCESS_COPYBOTH,
|
||||
|
||||
/*
|
||||
* Any success result other than a single CopyBoth was received. The
|
||||
* specifics of the result were already logged, but it may be useful to
|
||||
* provide an error message indicating which safekeeper messed up.
|
||||
*
|
||||
* Do not expect PQerrorMessage to be appropriately set.
|
||||
*/
|
||||
WP_EXEC_UNEXPECTED_SUCCESS,
|
||||
|
||||
/*
|
||||
* No result available at this time. Wait until read-ready, then call
|
||||
* again. Internally, this is returned when PQisBusy indicates that
|
||||
* PQgetResult would block.
|
||||
*/
|
||||
WP_EXEC_NEEDS_INPUT,
|
||||
/* Catch-all failure. Check PQerrorMessage. */
|
||||
WP_EXEC_FAILED,
|
||||
} WalProposerExecStatusType;
|
||||
|
||||
/* Possible return values from walprop_async_read */
|
||||
typedef enum
|
||||
{
|
||||
/* The full read was successful. buf now points to the data */
|
||||
PG_ASYNC_READ_SUCCESS,
|
||||
|
||||
/*
|
||||
* The read is ongoing. Wait until the connection is read-ready, then try
|
||||
* again.
|
||||
*/
|
||||
PG_ASYNC_READ_TRY_AGAIN,
|
||||
/* Reading failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_READ_FAIL,
|
||||
} PGAsyncReadResult;
|
||||
|
||||
/* Possible return values from walprop_async_write */
|
||||
typedef enum
|
||||
{
|
||||
/* The write fully completed */
|
||||
PG_ASYNC_WRITE_SUCCESS,
|
||||
|
||||
/*
|
||||
* The write started, but you'll need to call PQflush some more times to
|
||||
* finish it off. We just tried, so it's best to wait until the connection
|
||||
* is read- or write-ready to try again.
|
||||
*
|
||||
* If it becomes read-ready, call PQconsumeInput and flush again. If it
|
||||
* becomes write-ready, just call PQflush.
|
||||
*/
|
||||
PG_ASYNC_WRITE_TRY_FLUSH,
|
||||
/* Writing failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_WRITE_FAIL,
|
||||
} PGAsyncWriteResult;
|
||||
|
||||
/*
|
||||
* This header is included by walproposer.h to define walproposer_api; if we're
|
||||
* building walproposer without pg, ignore libpq part, leaving only interface
|
||||
* types.
|
||||
*/
|
||||
#ifndef WALPROPOSER_LIB
|
||||
|
||||
#include "libpq-fe.h"
|
||||
|
||||
/*
|
||||
* Sometimes working directly with underlying PGconn is simpler, export the
|
||||
* whole thing for simplicity.
|
||||
*/
|
||||
typedef struct WalProposerConn
|
||||
{
|
||||
PGconn *pg_conn;
|
||||
bool is_nonblocking; /* whether the connection is non-blocking */
|
||||
char *recvbuf; /* last received CopyData message from
|
||||
* walprop_async_read */
|
||||
} WalProposerConn;
|
||||
|
||||
extern WalProposerConn *libpqwp_connect_start(char *conninfo);
|
||||
extern bool libpqwp_send_query(WalProposerConn *conn, char *query);
|
||||
extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn);
|
||||
extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount);
|
||||
extern void libpqwp_disconnect(WalProposerConn *conn);
|
||||
|
||||
#endif /* WALPROPOSER_LIB */
|
||||
#endif /* ___LIBPQWALPROPOSER_H__ */
|
||||
@@ -1,731 +0,0 @@
|
||||
/*
|
||||
* Like WALRead, but when WAL segment doesn't exist locally instead of throwing
|
||||
* ERROR asynchronously tries to fetch it from the most advanced safekeeper.
|
||||
*
|
||||
* We can't use libpqwalreceiver as it blocks during connection establishment
|
||||
* (and waiting for PQExec result), so use libpqwalproposer instead.
|
||||
*
|
||||
* TODO: keepalives are currently never sent, so the other side can close the
|
||||
* connection prematurely.
|
||||
*
|
||||
* TODO: close conn if reading takes too long to prevent stuck connections.
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "access/xlog_internal.h"
|
||||
#include "access/xlogdefs.h"
|
||||
#include "access/xlogreader.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "storage/fd.h"
|
||||
#include "utils/wait_event.h"
|
||||
|
||||
#include "libpq-fe.h"
|
||||
|
||||
#include "neon_walreader.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
#define NEON_WALREADER_ERR_MSG_LEN 512
|
||||
|
||||
/*
|
||||
* Can be called where NeonWALReader *state is available in the context, adds log_prefix.
|
||||
*/
|
||||
#define nwr_log(elevel, fmt, ...) elog(elevel, "%s" fmt, state->log_prefix, ## __VA_ARGS__)
|
||||
|
||||
static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
|
||||
static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state);
|
||||
static void NeonWALReaderResetRemote(NeonWALReader *state);
|
||||
static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
|
||||
static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
|
||||
static void neon_wal_segment_close(NeonWALReader *state);
|
||||
static bool is_wal_segment_exists(XLogSegNo segno, int segsize,
|
||||
TimeLineID tli);
|
||||
|
||||
/*
|
||||
* State of connection to donor safekeeper.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
/* no remote connection */
|
||||
RS_NONE,
|
||||
/* doing PQconnectPoll, need readable socket */
|
||||
RS_CONNECTING_READ,
|
||||
/* doing PQconnectPoll, need writable socket */
|
||||
RS_CONNECTING_WRITE,
|
||||
/* Waiting for START_REPLICATION result */
|
||||
RS_WAIT_EXEC_RESULT,
|
||||
/* replication stream established */
|
||||
RS_ESTABLISHED,
|
||||
} NeonWALReaderRemoteState;
|
||||
|
||||
struct NeonWALReader
|
||||
{
|
||||
/*
|
||||
* LSN before which we assume WAL is not available locally. Exists because
|
||||
* though first segment after startup always exists, part before
|
||||
* basebackup LSN is filled with zeros.
|
||||
*/
|
||||
XLogRecPtr available_lsn;
|
||||
WALSegmentContext segcxt;
|
||||
WALOpenSegment seg;
|
||||
int wre_errno;
|
||||
/* Explains failure to read, static for simplicity. */
|
||||
char err_msg[NEON_WALREADER_ERR_MSG_LEN];
|
||||
|
||||
/*
|
||||
* Saved info about request in progress, used to check validity of
|
||||
* arguments after resume and remember how far we accomplished it. req_lsn
|
||||
* is 0 if there is no request in progress.
|
||||
*/
|
||||
XLogRecPtr req_lsn;
|
||||
Size req_len;
|
||||
Size req_progress;
|
||||
WalProposer *wp; /* we learn donor through walproposer */
|
||||
char donor_name[64]; /* saved donor safekeeper name for logging */
|
||||
/* state of connection to safekeeper */
|
||||
NeonWALReaderRemoteState rem_state;
|
||||
WalProposerConn *wp_conn;
|
||||
|
||||
/*
|
||||
* position in wp_conn recvbuf from which we'll copy WAL next time, or
|
||||
* NULL if there is no unprocessed message
|
||||
*/
|
||||
char *wal_ptr;
|
||||
Size wal_rem_len; /* how many unprocessed bytes left in recvbuf */
|
||||
|
||||
/*
|
||||
* LSN of wal_ptr position according to walsender to cross check against
|
||||
* read request
|
||||
*/
|
||||
XLogRecPtr rem_lsn;
|
||||
|
||||
/* prepended to lines logged by neon_walreader, if provided */
|
||||
char log_prefix[64];
|
||||
};
|
||||
|
||||
/* palloc and initialize NeonWALReader */
|
||||
NeonWALReader *
|
||||
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix)
|
||||
{
|
||||
NeonWALReader *reader;
|
||||
|
||||
reader = (NeonWALReader *)
|
||||
palloc_extended(sizeof(NeonWALReader),
|
||||
MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO);
|
||||
if (!reader)
|
||||
return NULL;
|
||||
|
||||
reader->available_lsn = available_lsn;
|
||||
reader->seg.ws_file = -1;
|
||||
reader->seg.ws_segno = 0;
|
||||
reader->seg.ws_tli = 0;
|
||||
reader->segcxt.ws_segsize = wal_segment_size;
|
||||
|
||||
reader->wp = wp;
|
||||
|
||||
reader->rem_state = RS_NONE;
|
||||
|
||||
if (log_prefix)
|
||||
strncpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix));
|
||||
|
||||
return reader;
|
||||
}
|
||||
|
||||
void
|
||||
NeonWALReaderFree(NeonWALReader *state)
|
||||
{
|
||||
if (state->seg.ws_file != -1)
|
||||
neon_wal_segment_close(state);
|
||||
if (state->wp_conn)
|
||||
libpqwp_disconnect(state->wp_conn);
|
||||
pfree(state);
|
||||
}
|
||||
|
||||
/*
|
||||
* Like vanilla WALRead, but if requested position is before available_lsn or
|
||||
* WAL segment doesn't exist on disk, it tries to fetch needed segment from the
|
||||
* advanced safekeeper.
|
||||
*
|
||||
* Read 'count' bytes into 'buf', starting at location 'startptr', from WAL
|
||||
* fetched from timeline 'tli'.
|
||||
*
|
||||
* Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error
|
||||
* occurs, in which case 'err' has the desciption. Error always closes remote
|
||||
* connection, if there was any, so socket subscription should be removed.
|
||||
*
|
||||
* NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with
|
||||
* NeonWALReaderSocket and call NeonWALRead again with exactly the same
|
||||
* arguments when NeonWALReaderEvents happen on the socket. Note that per libpq
|
||||
* docs during connection establishment (before first successful read) socket
|
||||
* underneath might change.
|
||||
*
|
||||
* Also, eventually walreader should switch from remote to local read; caller
|
||||
* should remove subscription to socket then by checking NeonWALReaderEvents
|
||||
* after successful read (otherwise next read might reopen the connection with
|
||||
* different socket).
|
||||
*
|
||||
* Reading not monotonically is not supported and will result in error.
|
||||
*
|
||||
* Caller should be sure that WAL up to requested LSN exists, otherwise
|
||||
* NEON_WALREAD_WOULDBLOCK might be always returned.
|
||||
*/
|
||||
NeonWALReadResult
|
||||
NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
|
||||
{
|
||||
/*
|
||||
* If requested data is before known available basebackup lsn or there is
|
||||
* already active remote state, do remote read.
|
||||
*/
|
||||
if (startptr < state->available_lsn || state->rem_state != RS_NONE)
|
||||
{
|
||||
return NeonWALReadRemote(state, buf, startptr, count, tli);
|
||||
}
|
||||
if (NeonWALReadLocal(state, buf, startptr, count, tli))
|
||||
{
|
||||
return NEON_WALREAD_SUCCESS;
|
||||
}
|
||||
else if (state->wre_errno == ENOENT)
|
||||
{
|
||||
nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
|
||||
LSN_FORMAT_ARGS(startptr));
|
||||
return NeonWALReadRemote(state, buf, startptr, count, tli);
|
||||
}
|
||||
else
|
||||
{
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/* Do the read from remote safekeeper. */
|
||||
static NeonWALReadResult
|
||||
NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
|
||||
{
|
||||
if (state->rem_state == RS_NONE)
|
||||
{
|
||||
XLogRecPtr donor_lsn;
|
||||
|
||||
/* no connection yet; start one */
|
||||
Safekeeper *donor = GetDonor(state->wp, &donor_lsn);
|
||||
|
||||
if (donor == NULL)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"failed to establish remote connection to fetch WAL: no donor available");
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port);
|
||||
nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL",
|
||||
state->donor_name, LSN_FORMAT_ARGS(donor_lsn));
|
||||
state->wp_conn = libpqwp_connect_start(donor->conninfo);
|
||||
if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"failed to connect to %s to fetch WAL: immediately failed with %s",
|
||||
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
/* we'll poll immediately */
|
||||
state->rem_state = RS_CONNECTING_READ;
|
||||
}
|
||||
|
||||
if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE)
|
||||
{
|
||||
switch (PQconnectPoll(state->wp_conn->pg_conn))
|
||||
{
|
||||
case PGRES_POLLING_FAILED:
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"failed to connect to %s to fetch WAL: poll error: %s",
|
||||
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
case PGRES_POLLING_READING:
|
||||
state->rem_state = RS_CONNECTING_READ;
|
||||
return NEON_WALREAD_WOULDBLOCK;
|
||||
case PGRES_POLLING_WRITING:
|
||||
state->rem_state = RS_CONNECTING_WRITE;
|
||||
return NEON_WALREAD_WOULDBLOCK;
|
||||
case PGRES_POLLING_OK:
|
||||
{
|
||||
/* connection successfully established */
|
||||
char start_repl_query[128];
|
||||
|
||||
snprintf(start_repl_query, sizeof(start_repl_query),
|
||||
"START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')",
|
||||
LSN_FORMAT_ARGS(startptr), state->wp->propTerm);
|
||||
nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s",
|
||||
state->donor_name, start_repl_query);
|
||||
if (!libpqwp_send_query(state->wp_conn, start_repl_query))
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"failed to send %s query to %s: %s",
|
||||
start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
state->rem_state = RS_WAIT_EXEC_RESULT;
|
||||
break;
|
||||
}
|
||||
|
||||
default: /* there is unused PGRES_POLLING_ACTIVE */
|
||||
Assert(false);
|
||||
return NEON_WALREAD_ERROR; /* keep the compiler quiet */
|
||||
}
|
||||
}
|
||||
|
||||
if (state->rem_state == RS_WAIT_EXEC_RESULT)
|
||||
{
|
||||
switch (libpqwp_get_query_result(state->wp_conn))
|
||||
{
|
||||
case WP_EXEC_SUCCESS_COPYBOTH:
|
||||
state->rem_state = RS_ESTABLISHED;
|
||||
break;
|
||||
case WP_EXEC_NEEDS_INPUT:
|
||||
return NEON_WALREAD_WOULDBLOCK;
|
||||
case WP_EXEC_FAILED:
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"get START_REPLICATION result from %s failed: %s",
|
||||
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
default: /* can't happen */
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"get START_REPLICATION result from %s: unexpected result",
|
||||
state->donor_name);
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
Assert(state->rem_state == RS_ESTABLISHED);
|
||||
|
||||
/*
|
||||
* If we had the request before, verify args are the same and advance the
|
||||
* result ptr according to the progress; otherwise register the request.
|
||||
*/
|
||||
if (state->req_lsn != InvalidXLogRecPtr)
|
||||
{
|
||||
if (state->req_lsn != startptr || state->req_len != count)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"args changed during request, was %X/%X %zu, now %X/%X %zu",
|
||||
LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count);
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu",
|
||||
LSN_FORMAT_ARGS(startptr),
|
||||
count,
|
||||
state->req_progress);
|
||||
buf += state->req_progress;
|
||||
}
|
||||
else
|
||||
{
|
||||
state->req_lsn = startptr;
|
||||
state->req_len = count;
|
||||
state->req_progress = 0;
|
||||
nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu",
|
||||
LSN_FORMAT_ARGS(startptr),
|
||||
count);
|
||||
}
|
||||
|
||||
while (true)
|
||||
{
|
||||
Size to_copy;
|
||||
|
||||
/*
|
||||
* If we have no ready data, receive new message.
|
||||
*/
|
||||
if (state->wal_rem_len == 0 &&
|
||||
|
||||
/*
|
||||
* check for the sake of 0 length reads; walproposer does these for
|
||||
* heartbeats, though generally they shouldn't hit remote source.
|
||||
*/
|
||||
state->req_len - state->req_progress > 0)
|
||||
{
|
||||
NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state);
|
||||
|
||||
if (read_msg_res != NEON_WALREAD_SUCCESS)
|
||||
return read_msg_res;
|
||||
}
|
||||
|
||||
if (state->req_lsn + state->req_progress != state->rem_lsn)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu",
|
||||
LSN_FORMAT_ARGS(state->req_lsn + state->req_progress),
|
||||
LSN_FORMAT_ARGS(state->rem_lsn),
|
||||
LSN_FORMAT_ARGS(state->req_lsn),
|
||||
state->req_len);
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
|
||||
/* We can copy min of (available, requested) bytes. */
|
||||
to_copy =
|
||||
Min(state->req_len - state->req_progress, state->wal_rem_len);
|
||||
memcpy(buf, state->wal_ptr, to_copy);
|
||||
state->wal_ptr += to_copy;
|
||||
state->wal_rem_len -= to_copy;
|
||||
state->rem_lsn += to_copy;
|
||||
if (state->wal_rem_len == 0)
|
||||
state->wal_ptr = NULL; /* freed by libpqwalproposer */
|
||||
buf += to_copy;
|
||||
state->req_progress += to_copy;
|
||||
if (state->req_progress == state->req_len)
|
||||
{
|
||||
XLogSegNo next_segno;
|
||||
XLogSegNo req_segno;
|
||||
|
||||
XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize);
|
||||
XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize);
|
||||
|
||||
/*
|
||||
* Request completed. If there is a chance of serving next one
|
||||
* locally, close the connection.
|
||||
*/
|
||||
if (state->req_lsn < state->available_lsn &&
|
||||
state->rem_lsn >= state->available_lsn)
|
||||
{
|
||||
nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally",
|
||||
LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
}
|
||||
else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno &&
|
||||
is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli))
|
||||
{
|
||||
nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists",
|
||||
LSN_FORMAT_ARGS(state->rem_lsn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
}
|
||||
state->req_lsn = InvalidXLogRecPtr;
|
||||
state->req_len = 0;
|
||||
state->req_progress = 0;
|
||||
return NEON_WALREAD_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Read one WAL message from the stream, sets state->wal_ptr in case of success.
|
||||
* Resets remote state in case of failure.
|
||||
*/
|
||||
static NeonWALReadResult
|
||||
NeonWALReaderReadMsg(NeonWALReader *state)
|
||||
{
|
||||
while (true) /* loop until we get 'w' */
|
||||
{
|
||||
char *copydata_ptr;
|
||||
int copydata_size;
|
||||
StringInfoData s;
|
||||
char msg_type;
|
||||
int hdrlen;
|
||||
|
||||
Assert(state->rem_state == RS_ESTABLISHED);
|
||||
Assert(state->wal_ptr == NULL && state->wal_rem_len == 0);
|
||||
|
||||
switch (libpqwp_async_read(state->wp_conn,
|
||||
©data_ptr,
|
||||
©data_size))
|
||||
{
|
||||
case PG_ASYNC_READ_SUCCESS:
|
||||
break;
|
||||
case PG_ASYNC_READ_TRY_AGAIN:
|
||||
return NEON_WALREAD_WOULDBLOCK;
|
||||
case PG_ASYNC_READ_FAIL:
|
||||
snprintf(state->err_msg,
|
||||
sizeof(state->err_msg),
|
||||
"req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s",
|
||||
LSN_FORMAT_ARGS(state->req_lsn),
|
||||
state->req_len,
|
||||
state->req_progress,
|
||||
PQerrorMessage(state->wp_conn->pg_conn));
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* put data on StringInfo to parse */
|
||||
s.data = copydata_ptr;
|
||||
s.len = copydata_size;
|
||||
s.cursor = 0;
|
||||
s.maxlen = -1;
|
||||
|
||||
if (copydata_size == 0)
|
||||
{
|
||||
snprintf(state->err_msg,
|
||||
sizeof(state->err_msg),
|
||||
"zero length copydata received");
|
||||
goto err;
|
||||
}
|
||||
msg_type = pq_getmsgbyte(&s);
|
||||
switch (msg_type)
|
||||
{
|
||||
case 'w':
|
||||
{
|
||||
XLogRecPtr start_lsn;
|
||||
|
||||
hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64);
|
||||
if (s.len - s.cursor < hdrlen)
|
||||
{
|
||||
snprintf(state->err_msg,
|
||||
sizeof(state->err_msg),
|
||||
"invalid WAL message received from primary");
|
||||
goto err;
|
||||
}
|
||||
|
||||
start_lsn = pq_getmsgint64(&s);
|
||||
pq_getmsgint64(&s); /* XLogRecPtr end_lsn; */
|
||||
pq_getmsgint64(&s); /* TimestampTz send_time */
|
||||
|
||||
state->rem_lsn = start_lsn;
|
||||
state->wal_rem_len = (Size) (s.len - s.cursor);
|
||||
state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor);
|
||||
nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu",
|
||||
LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len);
|
||||
|
||||
return NEON_WALREAD_SUCCESS;
|
||||
}
|
||||
case 'k':
|
||||
{
|
||||
XLogRecPtr end_lsn;
|
||||
bool reply_requested;
|
||||
|
||||
hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char);
|
||||
if (s.len - s.cursor < hdrlen)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"invalid keepalive message received from primary");
|
||||
goto err;
|
||||
}
|
||||
|
||||
end_lsn = pq_getmsgint64(&s);
|
||||
pq_getmsgint64(&s); /* TimestampTz timestamp; */
|
||||
reply_requested = pq_getmsgbyte(&s);
|
||||
nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d",
|
||||
LSN_FORMAT_ARGS(end_lsn),
|
||||
reply_requested);
|
||||
if (end_lsn < state->req_lsn + state->req_len)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X",
|
||||
LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn));
|
||||
goto err;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
default:
|
||||
nwr_log(WARNING, "invalid replication message type %d", msg_type);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
err:
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
|
||||
/* reset remote connection and request in progress */
|
||||
static void
|
||||
NeonWALReaderResetRemote(NeonWALReader *state)
|
||||
{
|
||||
state->req_lsn = InvalidXLogRecPtr;
|
||||
state->req_len = 0;
|
||||
state->req_progress = 0;
|
||||
state->rem_state = RS_NONE;
|
||||
if (state->wp_conn)
|
||||
{
|
||||
libpqwp_disconnect(state->wp_conn);
|
||||
state->wp_conn = NULL;
|
||||
}
|
||||
state->donor_name[0] = '\0';
|
||||
state->wal_ptr = NULL;
|
||||
state->wal_rem_len = 0;
|
||||
state->rem_lsn = InvalidXLogRecPtr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return socket of connection to remote source. Must be called only when
|
||||
* connection exists (NeonWALReaderEvents returns non zero).
|
||||
*/
|
||||
pgsocket
|
||||
NeonWALReaderSocket(NeonWALReader *state)
|
||||
{
|
||||
if (!state->wp_conn)
|
||||
nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection");
|
||||
return PQsocket(state->wp_conn->pg_conn);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns events user should wait on connection socket or 0 if remote
|
||||
* connection is not active.
|
||||
*/
|
||||
extern uint32
|
||||
NeonWALReaderEvents(NeonWALReader *state)
|
||||
{
|
||||
switch (state->rem_state)
|
||||
{
|
||||
case RS_NONE:
|
||||
return 0;
|
||||
case RS_CONNECTING_READ:
|
||||
return WL_SOCKET_READABLE;
|
||||
case RS_CONNECTING_WRITE:
|
||||
return WL_SOCKET_WRITEABLE;
|
||||
case RS_WAIT_EXEC_RESULT:
|
||||
case RS_ESTABLISHED:
|
||||
return WL_SOCKET_READABLE;
|
||||
default:
|
||||
Assert(false);
|
||||
return 0; /* make compiler happy */
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
|
||||
{
|
||||
char *p;
|
||||
XLogRecPtr recptr;
|
||||
Size nbytes;
|
||||
|
||||
p = buf;
|
||||
recptr = startptr;
|
||||
nbytes = count;
|
||||
|
||||
while (nbytes > 0)
|
||||
{
|
||||
uint32 startoff;
|
||||
int segbytes;
|
||||
int readbytes;
|
||||
|
||||
startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
|
||||
|
||||
/*
|
||||
* If the data we want is not in a segment we have open, close what we
|
||||
* have (if anything) and open the next one, using the caller's
|
||||
* provided openSegment callback.
|
||||
*/
|
||||
if (state->seg.ws_file < 0 ||
|
||||
!XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) ||
|
||||
tli != state->seg.ws_tli)
|
||||
{
|
||||
XLogSegNo nextSegNo;
|
||||
|
||||
neon_wal_segment_close(state);
|
||||
|
||||
XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize);
|
||||
if (!neon_wal_segment_open(state, nextSegNo, &tli))
|
||||
{
|
||||
char fname[MAXFNAMELEN];
|
||||
|
||||
state->wre_errno = errno;
|
||||
|
||||
XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize);
|
||||
snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s",
|
||||
fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno));
|
||||
return false;
|
||||
}
|
||||
|
||||
/* This shouldn't happen -- indicates a bug in segment_open */
|
||||
Assert(state->seg.ws_file >= 0);
|
||||
|
||||
/* Update the current segment info. */
|
||||
state->seg.ws_tli = tli;
|
||||
state->seg.ws_segno = nextSegNo;
|
||||
}
|
||||
|
||||
/* How many bytes are within this segment? */
|
||||
if (nbytes > (state->segcxt.ws_segsize - startoff))
|
||||
segbytes = state->segcxt.ws_segsize - startoff;
|
||||
else
|
||||
segbytes = nbytes;
|
||||
|
||||
#ifndef FRONTEND
|
||||
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
|
||||
#endif
|
||||
|
||||
/* Reset errno first; eases reporting non-errno-affecting errors */
|
||||
errno = 0;
|
||||
readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff);
|
||||
|
||||
#ifndef FRONTEND
|
||||
pgstat_report_wait_end();
|
||||
#endif
|
||||
|
||||
if (readbytes <= 0)
|
||||
{
|
||||
char fname[MAXFNAMELEN];
|
||||
|
||||
XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize);
|
||||
|
||||
if (readbytes < 0)
|
||||
{
|
||||
state->wre_errno = errno;
|
||||
snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s",
|
||||
fname, startoff, strerror(state->wre_errno));
|
||||
}
|
||||
else
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF",
|
||||
fname, startoff);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Update state for read */
|
||||
recptr += readbytes;
|
||||
nbytes -= readbytes;
|
||||
p += readbytes;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy of vanilla wal_segment_open, but returns false in case of error instead
|
||||
* of ERROR, with errno set.
|
||||
*
|
||||
* XLogReaderRoutine->segment_open callback for local pg_wal files
|
||||
*/
|
||||
static bool
|
||||
neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo,
|
||||
TimeLineID *tli_p)
|
||||
{
|
||||
TimeLineID tli = *tli_p;
|
||||
char path[MAXPGPATH];
|
||||
|
||||
XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize);
|
||||
nwr_log(LOG, "opening %s", path);
|
||||
state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
|
||||
if (state->seg.ws_file >= 0)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli)
|
||||
{
|
||||
struct stat stat_buffer;
|
||||
char path[MAXPGPATH];
|
||||
|
||||
XLogFilePath(path, tli, segno, segsize);
|
||||
return stat(path, &stat_buffer) == 0;
|
||||
}
|
||||
|
||||
/* copy of vanilla wal_segment_close with NeonWALReader */
|
||||
static void
|
||||
neon_wal_segment_close(NeonWALReader *state)
|
||||
{
|
||||
if (state->seg.ws_file >= 0)
|
||||
{
|
||||
close(state->seg.ws_file);
|
||||
/* need to check errno? */
|
||||
state->seg.ws_file = -1;
|
||||
}
|
||||
}
|
||||
|
||||
char *
|
||||
NeonWALReaderErrMsg(NeonWALReader *state)
|
||||
{
|
||||
return state->err_msg;
|
||||
}
|
||||
@@ -1,29 +0,0 @@
|
||||
#ifndef __NEON_WALREADER_H__
|
||||
#define __NEON_WALREADER_H__
|
||||
|
||||
#include "access/xlogdefs.h"
|
||||
|
||||
/* forward declare so we don't have to expose the struct to the public */
|
||||
struct NeonWALReader;
|
||||
typedef struct NeonWALReader NeonWALReader;
|
||||
|
||||
/* avoid including walproposer.h as it includes us */
|
||||
struct WalProposer;
|
||||
typedef struct WalProposer WalProposer;
|
||||
|
||||
/* NeonWALRead return value */
|
||||
typedef enum
|
||||
{
|
||||
NEON_WALREAD_SUCCESS,
|
||||
NEON_WALREAD_WOULDBLOCK,
|
||||
NEON_WALREAD_ERROR,
|
||||
} NeonWALReadResult;
|
||||
|
||||
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix);
|
||||
extern void NeonWALReaderFree(NeonWALReader *state);
|
||||
extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
|
||||
extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
|
||||
extern uint32 NeonWALReaderEvents(NeonWALReader *state);
|
||||
extern char *NeonWALReaderErrMsg(NeonWALReader *state);
|
||||
|
||||
#endif /* __NEON_WALREADER_H__ */
|
||||
@@ -370,9 +370,8 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
uint64 end,
|
||||
nfree = newsize;
|
||||
PrefetchState *newPState;
|
||||
Size newprfs_size = offsetof(PrefetchState, prf_buffer) + (
|
||||
sizeof(PrefetchRequest) * newsize
|
||||
);
|
||||
Size newprfs_size = offsetof(PrefetchState, prf_buffer) +
|
||||
(sizeof(PrefetchRequest) * newsize);
|
||||
|
||||
/* don't try to re-initialize if we haven't initialized yet */
|
||||
if (MyPState == NULL)
|
||||
@@ -1243,11 +1242,10 @@ PageIsEmptyHeapPage(char *buffer)
|
||||
}
|
||||
|
||||
static void
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
char *buffer, bool force)
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
|
||||
#else
|
||||
const char *buffer, bool force)
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force)
|
||||
#endif
|
||||
{
|
||||
XLogRecPtr lsn = PageGetLSN((Page) buffer);
|
||||
@@ -1901,6 +1899,7 @@ bool
|
||||
neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
{
|
||||
uint64 ring_index PG_USED_FOR_ASSERTS_ONLY;
|
||||
BufferTag tag;
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -1919,11 +1918,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
|
||||
return false;
|
||||
|
||||
BufferTag tag =
|
||||
{
|
||||
.forkNum = forknum,
|
||||
.blockNum = blocknum
|
||||
};
|
||||
tag.forkNum = forknum;
|
||||
tag.blockNum = blocknum;
|
||||
|
||||
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
|
||||
|
||||
@@ -1977,12 +1973,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
* While function is defined in the neon extension it's used within neon_test_utils directly.
|
||||
* To avoid breaking tests in the runtime please keep function signature in sync.
|
||||
*/
|
||||
void
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
void PGDLLEXPORT
|
||||
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, bool request_latest, char *buffer)
|
||||
#else
|
||||
void PGDLLEXPORT
|
||||
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, bool request_latest, void *buffer)
|
||||
#endif
|
||||
|
||||
@@ -43,6 +43,7 @@
|
||||
|
||||
/* Prototypes for private functions */
|
||||
static void WalProposerLoop(WalProposer *wp);
|
||||
static void HackyRemoveWalProposerEvent(Safekeeper *to_remove);
|
||||
static void ShutdownConnection(Safekeeper *sk);
|
||||
static void ResetConnection(Safekeeper *sk);
|
||||
static long TimeToReconnect(WalProposer *wp, TimestampTz now);
|
||||
@@ -75,9 +76,10 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper
|
||||
static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state);
|
||||
static bool AsyncFlush(Safekeeper *sk);
|
||||
static int CompareLsn(const void *a, const void *b);
|
||||
static char *FormatSafekeeperState(SafekeeperState state, SafekeeperActiveState active_state);
|
||||
static char *FormatSafekeeperState(SafekeeperState state);
|
||||
static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
|
||||
static char *FormatEvents(uint32 events);
|
||||
static uint32 SafekeeperStateDesiredEvents(SafekeeperState state);
|
||||
static char *FormatEvents(WalProposer *wp, uint32 events);
|
||||
|
||||
WalProposer *
|
||||
WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
@@ -123,7 +125,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
}
|
||||
|
||||
initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf);
|
||||
wp->safekeeper[wp->n_safekeepers].xlogreader = NULL;
|
||||
wp->api.wal_reader_allocate(&wp->safekeeper[wp->n_safekeepers]);
|
||||
wp->safekeeper[wp->n_safekeepers].flushWrite = false;
|
||||
wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
|
||||
wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr;
|
||||
wp->n_safekeepers += 1;
|
||||
@@ -272,7 +275,7 @@ WalProposerPoll(WalProposer *wp)
|
||||
wp->config->safekeeper_connection_timeout))
|
||||
{
|
||||
walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state, sk->active_state), wp->config->safekeeper_connection_timeout);
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
|
||||
ShutdownConnection(sk);
|
||||
}
|
||||
}
|
||||
@@ -300,6 +303,43 @@ WalProposerLoop(WalProposer *wp)
|
||||
WalProposerPoll(wp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Hack: provides a way to remove the event corresponding to an individual walproposer from the set.
|
||||
*
|
||||
* Note: Internally, this completely reconstructs the event set. It should be avoided if possible.
|
||||
*/
|
||||
static void
|
||||
HackyRemoveWalProposerEvent(Safekeeper *to_remove)
|
||||
{
|
||||
WalProposer *wp = to_remove->wp;
|
||||
|
||||
/* Remove the existing event set, assign sk->eventPos = -1 */
|
||||
wp->api.free_event_set(wp);
|
||||
/* Re-initialize it without adding any safekeeper events */
|
||||
wp->api.init_event_set(wp);
|
||||
|
||||
/*
|
||||
* loop through the existing safekeepers. If they aren't the one we're
|
||||
* removing, and if they have a socket we can use, re-add the applicable
|
||||
* events.
|
||||
*/
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
uint32 desired_events = WL_NO_EVENTS;
|
||||
Safekeeper *sk = &wp->safekeeper[i];
|
||||
|
||||
if (sk == to_remove)
|
||||
continue;
|
||||
|
||||
/* If this safekeeper isn't offline, add an event for it! */
|
||||
if (sk->state != SS_OFFLINE)
|
||||
{
|
||||
desired_events = SafekeeperStateDesiredEvents(sk->state);
|
||||
/* will set sk->eventPos */
|
||||
wp->api.add_safekeeper_event_set(sk, desired_events);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */
|
||||
static void
|
||||
@@ -307,13 +347,14 @@ ShutdownConnection(Safekeeper *sk)
|
||||
{
|
||||
sk->wp->api.conn_finish(sk);
|
||||
sk->state = SS_OFFLINE;
|
||||
sk->flushWrite = false;
|
||||
sk->streamingAt = InvalidXLogRecPtr;
|
||||
|
||||
if (sk->voteResponse.termHistory.entries)
|
||||
pfree(sk->voteResponse.termHistory.entries);
|
||||
sk->voteResponse.termHistory.entries = NULL;
|
||||
|
||||
sk->wp->api.rm_safekeeper_event_set(sk);
|
||||
HackyRemoveWalProposerEvent(sk);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -431,6 +472,8 @@ ReconnectSafekeepers(WalProposer *wp)
|
||||
static void
|
||||
AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
/*
|
||||
* Sanity check. We assume further down that the operations don't block
|
||||
* because the socket is ready.
|
||||
@@ -482,7 +525,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
*/
|
||||
case SS_VOTING:
|
||||
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state, sk->active_state));
|
||||
sk->port, FormatSafekeeperState(sk->state));
|
||||
ResetConnection(sk);
|
||||
return;
|
||||
|
||||
@@ -511,7 +554,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
*/
|
||||
case SS_IDLE:
|
||||
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state, sk->active_state));
|
||||
sk->port, FormatSafekeeperState(sk->state));
|
||||
ResetConnection(sk);
|
||||
return;
|
||||
|
||||
@@ -577,7 +620,7 @@ HandleConnectionEvent(Safekeeper *sk)
|
||||
* Because PQconnectPoll can change the socket, we have to un-register the
|
||||
* old event and re-register an event on the new socket.
|
||||
*/
|
||||
wp->api.rm_safekeeper_event_set(sk);
|
||||
HackyRemoveWalProposerEvent(sk);
|
||||
wp->api.add_safekeeper_event_set(sk, new_events);
|
||||
|
||||
/* If we successfully connected, send START_WAL_PUSH query */
|
||||
@@ -1061,10 +1104,6 @@ SendProposerElected(Safekeeper *sk)
|
||||
term_t lastCommonTerm;
|
||||
int i;
|
||||
|
||||
/* Now that we are ready to send it's a good moment to create WAL reader */
|
||||
Assert(!sk->xlogreader);
|
||||
wp->api.wal_reader_allocate(sk);
|
||||
|
||||
/*
|
||||
* Determine start LSN by comparing safekeeper's log term switch history
|
||||
* and proposer's, searching for the divergence point.
|
||||
@@ -1184,7 +1223,6 @@ StartStreaming(Safekeeper *sk)
|
||||
* once for a connection.
|
||||
*/
|
||||
sk->state = SS_ACTIVE;
|
||||
sk->active_state = SS_ACTIVE_SEND;
|
||||
sk->streamingAt = sk->startStreamingAt;
|
||||
|
||||
/* event set will be updated inside SendMessageToNode */
|
||||
@@ -1243,13 +1281,9 @@ HandleActiveState(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
/*
|
||||
* Note: we don't known which socket awoke us (sk or nwr). However, as
|
||||
* SendAppendRequests always tries to send at least one msg in
|
||||
* SS_ACTIVE_SEND be careful not to go there if are only after sk
|
||||
* response, otherwise it'd create busy loop of pings.
|
||||
*/
|
||||
if (events & WL_SOCKET_WRITEABLE || sk->active_state == SS_ACTIVE_READ_WAL)
|
||||
uint32 newEvents = WL_SOCKET_READABLE;
|
||||
|
||||
if (events & WL_SOCKET_WRITEABLE)
|
||||
if (!SendAppendRequests(sk))
|
||||
return;
|
||||
|
||||
@@ -1257,26 +1291,28 @@ HandleActiveState(Safekeeper *sk, uint32 events)
|
||||
if (!RecvAppendResponses(sk))
|
||||
return;
|
||||
|
||||
if (events & WL_SOCKET_CLOSED)
|
||||
{
|
||||
walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
|
||||
sk->host, sk->port);
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
}
|
||||
/*
|
||||
* We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data
|
||||
* in the buffer.
|
||||
*
|
||||
* LSN comparison checks if we have pending unsent messages. This check
|
||||
* isn't necessary now, because we always send append messages immediately
|
||||
* after arrival. But it's good to have it here in case we change this
|
||||
* behavior in the future.
|
||||
*/
|
||||
if (sk->streamingAt != wp->availableLsn || sk->flushWrite)
|
||||
newEvents |= WL_SOCKET_WRITEABLE;
|
||||
|
||||
/* configures event set for yield whatever is the substate */
|
||||
wp->api.active_state_update_event_set(sk);
|
||||
wp->api.update_event_set(sk, newEvents);
|
||||
}
|
||||
|
||||
/*
|
||||
* Send WAL messages starting from sk->streamingAt until the end or non-writable
|
||||
* socket or neon_walreader blocks, whichever comes first; active_state is
|
||||
* updated accordingly. Caller should take care of updating event set. Even if
|
||||
* no unsent WAL is available, at least one empty message will be sent as a
|
||||
* heartbeat, if socket is ready.
|
||||
* socket, whichever comes first. Caller should take care of updating event set.
|
||||
* Even if no unsent WAL is available, at least one empty message will be sent
|
||||
* as a heartbeat, if socket is ready.
|
||||
*
|
||||
* Resets state and kills the connections if any error on them is encountered.
|
||||
* Can change state if Async* functions encounter errors and reset connection.
|
||||
* Returns false in this case, true otherwise.
|
||||
*/
|
||||
static bool
|
||||
@@ -1284,11 +1320,11 @@ SendAppendRequests(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
XLogRecPtr endLsn;
|
||||
AppendRequestHeader *req;
|
||||
PGAsyncWriteResult writeResult;
|
||||
bool sentAnything = false;
|
||||
AppendRequestHeader *req;
|
||||
|
||||
if (sk->active_state == SS_ACTIVE_FLUSH)
|
||||
if (sk->flushWrite)
|
||||
{
|
||||
if (!AsyncFlush(sk))
|
||||
|
||||
@@ -1299,99 +1335,76 @@ SendAppendRequests(Safekeeper *sk)
|
||||
return sk->state == SS_ACTIVE;
|
||||
|
||||
/* Event set will be updated in the end of HandleActiveState */
|
||||
sk->active_state = SS_ACTIVE_SEND;
|
||||
sk->flushWrite = false;
|
||||
}
|
||||
|
||||
while (sk->streamingAt != wp->availableLsn || !sentAnything)
|
||||
{
|
||||
if (sk->active_state == SS_ACTIVE_SEND)
|
||||
sentAnything = true;
|
||||
|
||||
endLsn = sk->streamingAt;
|
||||
endLsn += MAX_SEND_SIZE;
|
||||
|
||||
/* if we went beyond available WAL, back off */
|
||||
if (endLsn > wp->availableLsn)
|
||||
{
|
||||
sentAnything = true;
|
||||
|
||||
endLsn = sk->streamingAt;
|
||||
endLsn += MAX_SEND_SIZE;
|
||||
|
||||
/* if we went beyond available WAL, back off */
|
||||
if (endLsn > wp->availableLsn)
|
||||
{
|
||||
endLsn = wp->availableLsn;
|
||||
}
|
||||
|
||||
req = &sk->appendRequest;
|
||||
PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);
|
||||
|
||||
walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
|
||||
req->endLsn - req->beginLsn,
|
||||
LSN_FORMAT_ARGS(req->beginLsn),
|
||||
LSN_FORMAT_ARGS(req->endLsn),
|
||||
LSN_FORMAT_ARGS(req->commitLsn),
|
||||
LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
|
||||
|
||||
resetStringInfo(&sk->outbuf);
|
||||
|
||||
/* write AppendRequest header */
|
||||
appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));
|
||||
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
|
||||
sk->active_state = SS_ACTIVE_READ_WAL;
|
||||
endLsn = wp->availableLsn;
|
||||
}
|
||||
|
||||
if (sk->active_state == SS_ACTIVE_READ_WAL)
|
||||
req = &sk->appendRequest;
|
||||
PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);
|
||||
|
||||
walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
|
||||
req->endLsn - req->beginLsn,
|
||||
LSN_FORMAT_ARGS(req->beginLsn),
|
||||
LSN_FORMAT_ARGS(req->endLsn),
|
||||
LSN_FORMAT_ARGS(req->commitLsn),
|
||||
LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
|
||||
|
||||
resetStringInfo(&sk->outbuf);
|
||||
|
||||
/* write AppendRequest header */
|
||||
appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));
|
||||
|
||||
/* write the WAL itself */
|
||||
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
|
||||
/* wal_read will raise error on failure */
|
||||
wp->api.wal_read(sk,
|
||||
&sk->outbuf.data[sk->outbuf.len],
|
||||
req->beginLsn,
|
||||
req->endLsn - req->beginLsn);
|
||||
sk->outbuf.len += req->endLsn - req->beginLsn;
|
||||
|
||||
writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);
|
||||
|
||||
/* Mark current message as sent, whatever the result is */
|
||||
sk->streamingAt = endLsn;
|
||||
|
||||
switch (writeResult)
|
||||
{
|
||||
req = &sk->appendRequest;
|
||||
case PG_ASYNC_WRITE_SUCCESS:
|
||||
/* Continue writing the next message */
|
||||
break;
|
||||
|
||||
switch (wp->api.wal_read(sk,
|
||||
&sk->outbuf.data[sk->outbuf.len],
|
||||
req->beginLsn,
|
||||
req->endLsn - req->beginLsn))
|
||||
{
|
||||
case NEON_WALREAD_SUCCESS:
|
||||
break;
|
||||
case NEON_WALREAD_WOULDBLOCK:
|
||||
return true;
|
||||
case NEON_WALREAD_ERROR:
|
||||
walprop_log(WARNING, "WAL reading for node %s:%s failed: %s",
|
||||
sk->host, sk->port,
|
||||
NeonWALReaderErrMsg(sk->xlogreader));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
Assert(false);
|
||||
}
|
||||
case PG_ASYNC_WRITE_TRY_FLUSH:
|
||||
|
||||
sk->outbuf.len += req->endLsn - req->beginLsn;
|
||||
/*
|
||||
* * We still need to call PQflush some more to finish the
|
||||
* job. Caller function will handle this by setting right
|
||||
* event* set.
|
||||
*/
|
||||
sk->flushWrite = true;
|
||||
return true;
|
||||
|
||||
writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);
|
||||
|
||||
/* Mark current message as sent, whatever the result is */
|
||||
sk->streamingAt = req->endLsn;
|
||||
|
||||
switch (writeResult)
|
||||
{
|
||||
case PG_ASYNC_WRITE_SUCCESS:
|
||||
/* Continue writing the next message */
|
||||
sk->active_state = SS_ACTIVE_SEND;
|
||||
break;
|
||||
|
||||
case PG_ASYNC_WRITE_TRY_FLUSH:
|
||||
|
||||
/*
|
||||
* We still need to call PQflush some more to finish the
|
||||
* job. Caller function will handle this by setting right
|
||||
* event set.
|
||||
*/
|
||||
sk->active_state = SS_ACTIVE_FLUSH;
|
||||
return true;
|
||||
|
||||
case PG_ASYNC_WRITE_FAIL:
|
||||
walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state, sk->active_state),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
Assert(false);
|
||||
return false;
|
||||
}
|
||||
case PG_ASYNC_WRITE_FAIL:
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
Assert(false);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1401,7 +1414,7 @@ SendAppendRequests(Safekeeper *sk)
|
||||
/*
|
||||
* Receive and process all available feedback.
|
||||
*
|
||||
* Resets state and kills the connection if any error on it is encountered.
|
||||
* Can change state if Async* functions encounter errors and reset connection.
|
||||
* Returns false in this case, true otherwise.
|
||||
*
|
||||
* NB: This function can call SendMessageToNode and produce new messages.
|
||||
@@ -1582,53 +1595,6 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
|
||||
return responses[wp->n_safekeepers - wp->quorum];
|
||||
}
|
||||
|
||||
/*
|
||||
* Return safekeeper with active connection from which WAL can be downloaded, or
|
||||
* none if it doesn't exist. donor_lsn is set to end position of the donor to
|
||||
* the best of our knowledge.
|
||||
*/
|
||||
Safekeeper *
|
||||
GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
|
||||
{
|
||||
*donor_lsn = InvalidXLogRecPtr;
|
||||
Safekeeper *donor = NULL;
|
||||
int i;
|
||||
|
||||
if (wp->n_votes < wp->quorum)
|
||||
{
|
||||
walprop_log(WARNING, "GetDonor called before elections are won");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* First, consider node which had determined our term start LSN as we know
|
||||
* about its position immediately after election before any feedbacks are
|
||||
* sent.
|
||||
*/
|
||||
if (wp->safekeeper[wp->donor].state >= SS_IDLE)
|
||||
{
|
||||
donor = &wp->safekeeper[wp->donor];
|
||||
*donor_lsn = wp->propEpochStartLsn;
|
||||
}
|
||||
|
||||
/*
|
||||
* But also check feedbacks from all nodes with live connections and take
|
||||
* the highest one. Note: if node sends feedbacks it already processed
|
||||
* elected message so its term is fine.
|
||||
*/
|
||||
for (i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
Safekeeper *sk = &wp->safekeeper[i];
|
||||
|
||||
if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn)
|
||||
{
|
||||
donor = sk;
|
||||
*donor_lsn = sk->appendResponse.flushLsn;
|
||||
}
|
||||
}
|
||||
return donor;
|
||||
}
|
||||
|
||||
static void
|
||||
HandleSafekeeperResponse(WalProposer *wp)
|
||||
{
|
||||
@@ -1734,7 +1700,7 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
|
||||
|
||||
case PG_ASYNC_READ_FAIL:
|
||||
walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state, sk->active_state),
|
||||
sk->port, FormatSafekeeperState(sk->state),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
@@ -1774,7 +1740,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
|
||||
if (tag != anymsg->tag)
|
||||
{
|
||||
walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state, sk->active_state));
|
||||
sk->port, FormatSafekeeperState(sk->state));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
}
|
||||
@@ -1845,13 +1811,12 @@ static bool
|
||||
BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
uint32 sk_events;
|
||||
uint32 nwr_events;
|
||||
uint32 events;
|
||||
|
||||
if (!wp->api.conn_blocking_write(sk, msg, msg_size))
|
||||
{
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state, sk->active_state),
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
@@ -1863,15 +1828,9 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
|
||||
* If the new state will be waiting for events to happen, update the event
|
||||
* set to wait for those
|
||||
*/
|
||||
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
|
||||
|
||||
/*
|
||||
* nwr_events is relevant only during SS_ACTIVE which doesn't user
|
||||
* BlockingWrite
|
||||
*/
|
||||
Assert(!nwr_events);
|
||||
if (sk_events)
|
||||
wp->api.update_event_set(sk, sk_events);
|
||||
events = SafekeeperStateDesiredEvents(success_state);
|
||||
if (events)
|
||||
wp->api.update_event_set(sk, events);
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -1904,7 +1863,7 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
|
||||
return false;
|
||||
case PG_ASYNC_WRITE_FAIL:
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state, sk->active_state),
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
@@ -1943,7 +1902,7 @@ AsyncFlush(Safekeeper *sk)
|
||||
return false;
|
||||
case -1:
|
||||
walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state, sk->active_state),
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
wp->api.conn_error_message(sk));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
@@ -1973,14 +1932,14 @@ CompareLsn(const void *a, const void *b)
|
||||
*
|
||||
* The strings are intended to be used as a prefix to "state", e.g.:
|
||||
*
|
||||
* walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state, sk->active_state));
|
||||
* walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
|
||||
*
|
||||
* If this sort of phrasing doesn't fit the message, instead use something like:
|
||||
*
|
||||
* walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state, sk->active_state));
|
||||
* walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
|
||||
*/
|
||||
static char *
|
||||
FormatSafekeeperState(SafekeeperState state, SafekeeperActiveState active_state)
|
||||
FormatSafekeeperState(SafekeeperState state)
|
||||
{
|
||||
char *return_val = NULL;
|
||||
|
||||
@@ -2012,18 +1971,7 @@ FormatSafekeeperState(SafekeeperState state, SafekeeperActiveState active_state)
|
||||
return_val = "idle";
|
||||
break;
|
||||
case SS_ACTIVE:
|
||||
switch (active_state)
|
||||
{
|
||||
case SS_ACTIVE_SEND:
|
||||
return_val = "active send";
|
||||
break;
|
||||
case SS_ACTIVE_READ_WAL:
|
||||
return_val = "active read WAL";
|
||||
break;
|
||||
case SS_ACTIVE_FLUSH:
|
||||
return_val = "active flush";
|
||||
break;
|
||||
}
|
||||
return_val = "active";
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -2036,20 +1984,22 @@ FormatSafekeeperState(SafekeeperState state, SafekeeperActiveState active_state)
|
||||
static void
|
||||
AssertEventsOkForState(uint32 events, Safekeeper *sk)
|
||||
{
|
||||
uint32 sk_events;
|
||||
uint32 nwr_events;
|
||||
uint32 expected;
|
||||
WalProposer *wp = sk->wp;
|
||||
uint32 expected = SafekeeperStateDesiredEvents(sk->state);
|
||||
|
||||
/*
|
||||
* The events are in-line with what we're expecting, under two conditions:
|
||||
* (a) if we aren't expecting anything, `events` has no read- or
|
||||
* write-ready component. (b) if we are expecting something, there's
|
||||
* overlap (i.e. `events & expected != 0`)
|
||||
*/
|
||||
bool events_ok_for_state; /* long name so the `Assert` is more
|
||||
* clear later */
|
||||
|
||||
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
|
||||
|
||||
/*
|
||||
* Without one more level of notify target indirection we have no way to
|
||||
* distinguish which socket woke up us, so just union expected events.
|
||||
*/
|
||||
expected = sk_events | nwr_events;
|
||||
events_ok_for_state = ((events & expected) != 0);
|
||||
if (expected == WL_NO_EVENTS)
|
||||
events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0);
|
||||
else
|
||||
events_ok_for_state = ((events & expected) != 0);
|
||||
|
||||
if (!events_ok_for_state)
|
||||
{
|
||||
@@ -2058,37 +2008,36 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
|
||||
* and then an assertion that's guaranteed to fail.
|
||||
*/
|
||||
walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
|
||||
FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state, sk->active_state));
|
||||
FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
|
||||
Assert(events_ok_for_state);
|
||||
}
|
||||
}
|
||||
|
||||
/* Returns the set of events for both safekeeper (sk_events) and neon_walreader
|
||||
* (nwr_events) sockets a safekeeper in this state should be waiting on.
|
||||
/* Returns the set of events a safekeeper in this state should be waiting on
|
||||
*
|
||||
* This will return WL_NO_EVENTS (= 0) for some events. */
|
||||
void
|
||||
SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events)
|
||||
static uint32
|
||||
SafekeeperStateDesiredEvents(SafekeeperState state)
|
||||
{
|
||||
*nwr_events = 0; /* nwr_events is empty for most states */
|
||||
uint32 result = WL_NO_EVENTS;
|
||||
|
||||
/* If the state doesn't have a modifier, we can check the base state */
|
||||
switch (sk->state)
|
||||
switch (state)
|
||||
{
|
||||
/* Connecting states say what they want in the name */
|
||||
case SS_CONNECTING_READ:
|
||||
*sk_events = WL_SOCKET_READABLE;
|
||||
return;
|
||||
result = WL_SOCKET_READABLE;
|
||||
break;
|
||||
case SS_CONNECTING_WRITE:
|
||||
*sk_events = WL_SOCKET_WRITEABLE;
|
||||
return;
|
||||
result = WL_SOCKET_WRITEABLE;
|
||||
break;
|
||||
|
||||
/* Reading states need the socket to be read-ready to continue */
|
||||
case SS_WAIT_EXEC_RESULT:
|
||||
case SS_HANDSHAKE_RECV:
|
||||
case SS_WAIT_VERDICT:
|
||||
*sk_events = WL_SOCKET_READABLE;
|
||||
return;
|
||||
result = WL_SOCKET_READABLE;
|
||||
break;
|
||||
|
||||
/*
|
||||
* Idle states use read-readiness as a sign that the connection
|
||||
@@ -2096,62 +2045,32 @@ SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_even
|
||||
*/
|
||||
case SS_VOTING:
|
||||
case SS_IDLE:
|
||||
*sk_events = WL_SOCKET_READABLE;
|
||||
return;
|
||||
result = WL_SOCKET_READABLE;
|
||||
break;
|
||||
|
||||
/*
|
||||
* Flush states require write-ready for flushing. Active state
|
||||
* does both reading and writing.
|
||||
*
|
||||
* TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We
|
||||
* should check sk->flushWrite here to set WL_SOCKET_WRITEABLE.
|
||||
*/
|
||||
case SS_SEND_ELECTED_FLUSH:
|
||||
*sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
|
||||
return;
|
||||
|
||||
case SS_ACTIVE:
|
||||
switch (sk->active_state)
|
||||
{
|
||||
/*
|
||||
* Everything is sent; we just wait for sk responses and
|
||||
* latch.
|
||||
*
|
||||
* Note: this assumes we send all available WAL to
|
||||
* safekeeper in one wakeup (unless it blocks). Otherwise
|
||||
* we would want WL_SOCKET_WRITEABLE here to finish the
|
||||
* work.
|
||||
*/
|
||||
case SS_ACTIVE_SEND:
|
||||
*sk_events = WL_SOCKET_READABLE;
|
||||
if (NeonWALReaderEvents(sk->xlogreader))
|
||||
*nwr_events = WL_SOCKET_CLOSED; /* c.f.
|
||||
* walprop_pg_active_state_update_event_set */
|
||||
return;
|
||||
|
||||
/*
|
||||
* Waiting for neon_walreader socket, but we still read
|
||||
* responses from sk socket.
|
||||
*/
|
||||
case SS_ACTIVE_READ_WAL:
|
||||
*sk_events = WL_SOCKET_READABLE;
|
||||
*nwr_events = NeonWALReaderEvents(sk->xlogreader);
|
||||
return;
|
||||
|
||||
/*
|
||||
* Need to flush the sk socket, so ignore neon_walreader
|
||||
* one and set write interest on sk.
|
||||
*/
|
||||
case SS_ACTIVE_FLUSH:
|
||||
*sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
|
||||
if (NeonWALReaderEvents(sk->xlogreader))
|
||||
*nwr_events = WL_SOCKET_CLOSED; /* c.f.
|
||||
* walprop_pg_active_state_update_event_set */
|
||||
return;
|
||||
}
|
||||
return;
|
||||
result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
|
||||
break;
|
||||
|
||||
/* The offline state expects no events. */
|
||||
case SS_OFFLINE:
|
||||
*sk_events = 0;
|
||||
return;
|
||||
result = WL_NO_EVENTS;
|
||||
break;
|
||||
|
||||
default:
|
||||
Assert(false);
|
||||
break;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Returns a human-readable string corresponding to the event set
|
||||
@@ -2162,7 +2081,7 @@ SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_even
|
||||
* The string should not be freed. It should also not be expected to remain the same between
|
||||
* function calls. */
|
||||
static char *
|
||||
FormatEvents(uint32 events)
|
||||
FormatEvents(WalProposer *wp, uint32 events)
|
||||
{
|
||||
static char return_str[8];
|
||||
|
||||
|
||||
@@ -10,9 +10,6 @@
|
||||
#include "utils/uuid.h"
|
||||
#include "replication/walreceiver.h"
|
||||
|
||||
#include "libpqwalproposer.h"
|
||||
#include "neon_walreader.h"
|
||||
|
||||
#define SK_MAGIC 0xCafeCeefu
|
||||
#define SK_PROTOCOL_VERSION 2
|
||||
|
||||
@@ -25,9 +22,43 @@
|
||||
*/
|
||||
#define WL_NO_EVENTS 0
|
||||
|
||||
struct WalProposerConn; /* Defined in libpqwalproposer.h */
|
||||
struct WalProposerConn; /* Defined in implementation (walprop_pg.c) */
|
||||
typedef struct WalProposerConn WalProposerConn;
|
||||
|
||||
/* Possible return values from ReadPGAsync */
|
||||
typedef enum
|
||||
{
|
||||
/* The full read was successful. buf now points to the data */
|
||||
PG_ASYNC_READ_SUCCESS,
|
||||
|
||||
/*
|
||||
* The read is ongoing. Wait until the connection is read-ready, then try
|
||||
* again.
|
||||
*/
|
||||
PG_ASYNC_READ_TRY_AGAIN,
|
||||
/* Reading failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_READ_FAIL,
|
||||
} PGAsyncReadResult;
|
||||
|
||||
/* Possible return values from WritePGAsync */
|
||||
typedef enum
|
||||
{
|
||||
/* The write fully completed */
|
||||
PG_ASYNC_WRITE_SUCCESS,
|
||||
|
||||
/*
|
||||
* The write started, but you'll need to call PQflush some more times to
|
||||
* finish it off. We just tried, so it's best to wait until the connection
|
||||
* is read- or write-ready to try again.
|
||||
*
|
||||
* If it becomes read-ready, call PQconsumeInput and flush again. If it
|
||||
* becomes write-ready, just call PQflush.
|
||||
*/
|
||||
PG_ASYNC_WRITE_TRY_FLUSH,
|
||||
/* Writing failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_WRITE_FAIL,
|
||||
} PGAsyncWriteResult;
|
||||
|
||||
/*
|
||||
* WAL safekeeper state, which is used to wait for some event.
|
||||
*
|
||||
@@ -104,40 +135,6 @@ typedef enum
|
||||
SS_ACTIVE,
|
||||
} SafekeeperState;
|
||||
|
||||
/*
|
||||
* Sending WAL substates of SS_ACTIVE.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
/*
|
||||
* We are ready to send more WAL, waiting for latch set to learn about
|
||||
* more WAL becoming available (or just a timeout to send heartbeat).
|
||||
*/
|
||||
SS_ACTIVE_SEND,
|
||||
|
||||
/*
|
||||
* Polling neon_walreader to receive chunk of WAL (probably remotely) to
|
||||
* send to this safekeeper.
|
||||
*
|
||||
* Note: socket management is done completely inside walproposer_pg for
|
||||
* simplicity, and thus simulation doesn't test it. Which is fine as
|
||||
* simulation is mainly aimed at consensus checks, not waiteventset
|
||||
* management.
|
||||
*
|
||||
* Also, while in this state we don't touch safekeeper socket, so in
|
||||
* theory it might close connection as inactive. This can be addressed if
|
||||
* needed; however, while fetching WAL we should regularly send it, so the
|
||||
* problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle
|
||||
* walreader socket), but similarly shouldn't be a problem.
|
||||
*/
|
||||
SS_ACTIVE_READ_WAL,
|
||||
|
||||
/*
|
||||
* Waiting for write readiness to flush the socket.
|
||||
*/
|
||||
SS_ACTIVE_FLUSH,
|
||||
} SafekeeperActiveState;
|
||||
|
||||
/* Consensus logical timestamp. */
|
||||
typedef uint64 term_t;
|
||||
|
||||
@@ -346,11 +343,12 @@ typedef struct Safekeeper
|
||||
*/
|
||||
XLogRecPtr startStreamingAt;
|
||||
|
||||
bool flushWrite; /* set to true if we need to call AsyncFlush,*
|
||||
* to flush pending messages */
|
||||
XLogRecPtr streamingAt; /* current streaming position */
|
||||
AppendRequestHeader appendRequest; /* request for sending to safekeeper */
|
||||
|
||||
SafekeeperState state; /* safekeeper state machine state */
|
||||
SafekeeperActiveState active_state;
|
||||
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
|
||||
AcceptorGreeting greetResponse; /* acceptor greeting */
|
||||
VoteResponse voteResponse; /* the vote */
|
||||
@@ -371,17 +369,12 @@ typedef struct Safekeeper
|
||||
/*
|
||||
* WAL reader, allocated for each safekeeper.
|
||||
*/
|
||||
NeonWALReader *xlogreader;
|
||||
XLogReaderState *xlogreader;
|
||||
|
||||
/*
|
||||
* Position in wait event set. Equal to -1 if no event
|
||||
*/
|
||||
int eventPos;
|
||||
|
||||
/*
|
||||
* Neon WAL reader position in wait event set, or -1 if no socket.
|
||||
*/
|
||||
int nwrEventPos;
|
||||
#endif
|
||||
|
||||
|
||||
@@ -410,6 +403,31 @@ typedef enum
|
||||
*/
|
||||
} WalProposerConnectPollStatusType;
|
||||
|
||||
/* Re-exported and modified ExecStatusType */
|
||||
typedef enum
|
||||
{
|
||||
/* We received a single CopyBoth result */
|
||||
WP_EXEC_SUCCESS_COPYBOTH,
|
||||
|
||||
/*
|
||||
* Any success result other than a single CopyBoth was received. The
|
||||
* specifics of the result were already logged, but it may be useful to
|
||||
* provide an error message indicating which safekeeper messed up.
|
||||
*
|
||||
* Do not expect PQerrorMessage to be appropriately set.
|
||||
*/
|
||||
WP_EXEC_UNEXPECTED_SUCCESS,
|
||||
|
||||
/*
|
||||
* No result available at this time. Wait until read-ready, then call
|
||||
* again. Internally, this is returned when PQisBusy indicates that
|
||||
* PQgetResult would block.
|
||||
*/
|
||||
WP_EXEC_NEEDS_INPUT,
|
||||
/* Catch-all failure. Check PQerrorMessage. */
|
||||
WP_EXEC_FAILED,
|
||||
} WalProposerExecStatusType;
|
||||
|
||||
/* Re-exported ConnStatusType */
|
||||
typedef enum
|
||||
{
|
||||
@@ -491,26 +509,23 @@ typedef struct walproposer_api
|
||||
bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
|
||||
|
||||
/* Read WAL from disk to buf. */
|
||||
NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);
|
||||
void (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);
|
||||
|
||||
/* Allocate WAL reader. */
|
||||
void (*wal_reader_allocate) (Safekeeper *sk);
|
||||
|
||||
/* Deallocate event set. */
|
||||
void (*free_event_set) (WalProposer *wp);
|
||||
|
||||
/* Initialize event set. */
|
||||
void (*init_event_set) (WalProposer *wp);
|
||||
|
||||
/* Update events for an existing safekeeper connection. */
|
||||
void (*update_event_set) (Safekeeper *sk, uint32 events);
|
||||
|
||||
/* Configure wait event set for yield in SS_ACTIVE. */
|
||||
void (*active_state_update_event_set) (Safekeeper *sk);
|
||||
|
||||
/* Add a new safekeeper connection to the event set. */
|
||||
void (*add_safekeeper_event_set) (Safekeeper *sk, uint32 events);
|
||||
|
||||
/* Remove safekeeper connection from event set */
|
||||
void (*rm_safekeeper_event_set) (Safekeeper *sk);
|
||||
|
||||
/*
|
||||
* Wait until some event happens: - timeout is reached - socket event for
|
||||
* safekeeper connection - new WAL is available
|
||||
@@ -696,19 +711,12 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt
|
||||
extern void WalProposerPoll(WalProposer *wp);
|
||||
extern void WalProposerFree(WalProposer *wp);
|
||||
|
||||
/*
|
||||
* WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to
|
||||
* recreate set from scratch, hence the export.
|
||||
*/
|
||||
extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events);
|
||||
extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
|
||||
|
||||
|
||||
#define WPEVENT 1337 /* special log level for walproposer internal
|
||||
* events */
|
||||
|
||||
#ifdef WALPROPOSER_LIB
|
||||
void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
|
||||
extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
|
||||
#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
|
||||
#else
|
||||
#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
|
||||
|
||||
@@ -43,12 +43,9 @@
|
||||
#include "utils/ps_status.h"
|
||||
#include "utils/timestamp.h"
|
||||
|
||||
#include "libpq-fe.h"
|
||||
|
||||
#include "libpqwalproposer.h"
|
||||
#include "neon.h"
|
||||
#include "neon_walreader.h"
|
||||
#include "walproposer.h"
|
||||
#include "libpq-fe.h"
|
||||
|
||||
#define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */
|
||||
#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender*
|
||||
@@ -94,10 +91,6 @@ static void XLogBroadcastWalProposer(WalProposer *wp);
|
||||
static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr);
|
||||
static void XLogWalPropClose(XLogRecPtr recptr);
|
||||
|
||||
static void add_nwr_event_set(Safekeeper *sk, uint32 events);
|
||||
static void update_nwr_event_set(Safekeeper *sk, uint32 events);
|
||||
static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);
|
||||
|
||||
static void
|
||||
init_walprop_config(bool syncSafekeepers)
|
||||
{
|
||||
@@ -548,6 +541,14 @@ walprop_pg_load_libpqwalreceiver(void)
|
||||
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
|
||||
}
|
||||
|
||||
/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
|
||||
struct WalProposerConn
|
||||
{
|
||||
PGconn *pg_conn;
|
||||
bool is_nonblocking; /* whether the connection is non-blocking */
|
||||
char *recvbuf; /* last received data from walprop_async_read */
|
||||
};
|
||||
|
||||
/* Helper function */
|
||||
static bool
|
||||
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
|
||||
@@ -585,17 +586,16 @@ walprop_status(Safekeeper *sk)
|
||||
}
|
||||
}
|
||||
|
||||
WalProposerConn *
|
||||
libpqwp_connect_start(char *conninfo)
|
||||
static void
|
||||
walprop_connect_start(Safekeeper *sk)
|
||||
{
|
||||
|
||||
PGconn *pg_conn;
|
||||
WalProposerConn *conn;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
char *password = neon_auth_token;
|
||||
|
||||
Assert(sk->conn == NULL);
|
||||
|
||||
/*
|
||||
* Connect using the given connection string. If the NEON_AUTH_TOKEN
|
||||
@@ -614,7 +614,7 @@ libpqwp_connect_start(char *conninfo)
|
||||
n++;
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
values[n] = conninfo;
|
||||
values[n] = sk->conninfo;
|
||||
n++;
|
||||
keywords[n] = NULL;
|
||||
values[n] = NULL;
|
||||
@@ -635,20 +635,11 @@ libpqwp_connect_start(char *conninfo)
|
||||
* palloc will exit on failure though, so there's not much we could do if
|
||||
* it *did* fail.
|
||||
*/
|
||||
conn = palloc(sizeof(WalProposerConn));
|
||||
conn->pg_conn = pg_conn;
|
||||
conn->is_nonblocking = false; /* connections always start in blocking
|
||||
* mode */
|
||||
conn->recvbuf = NULL;
|
||||
return conn;
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_connect_start(Safekeeper *sk)
|
||||
{
|
||||
Assert(sk->conn == NULL);
|
||||
sk->conn = libpqwp_connect_start(sk->conninfo);
|
||||
|
||||
sk->conn = palloc(sizeof(WalProposerConn));
|
||||
sk->conn->pg_conn = pg_conn;
|
||||
sk->conn->is_nonblocking = false; /* connections always start in
|
||||
* blocking mode */
|
||||
sk->conn->recvbuf = NULL;
|
||||
}
|
||||
|
||||
static WalProposerConnectPollStatusType
|
||||
@@ -692,33 +683,26 @@ walprop_connect_poll(Safekeeper *sk)
|
||||
return return_val;
|
||||
}
|
||||
|
||||
extern bool
|
||||
libpqwp_send_query(WalProposerConn *conn, char *query)
|
||||
static bool
|
||||
walprop_send_query(Safekeeper *sk, char *query)
|
||||
{
|
||||
/*
|
||||
* We need to be in blocking mode for sending the query to run without
|
||||
* requiring a call to PQflush
|
||||
*/
|
||||
if (!ensure_nonblocking_status(conn, false))
|
||||
if (!ensure_nonblocking_status(sk->conn, false))
|
||||
return false;
|
||||
|
||||
/* PQsendQuery returns 1 on success, 0 on failure */
|
||||
if (!PQsendQuery(conn->pg_conn, query))
|
||||
if (!PQsendQuery(sk->conn->pg_conn, query))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
walprop_send_query(Safekeeper *sk, char *query)
|
||||
static WalProposerExecStatusType
|
||||
walprop_get_query_result(Safekeeper *sk)
|
||||
{
|
||||
return libpqwp_send_query(sk->conn, query);
|
||||
}
|
||||
|
||||
WalProposerExecStatusType
|
||||
libpqwp_get_query_result(WalProposerConn *conn)
|
||||
{
|
||||
|
||||
PGresult *result;
|
||||
WalProposerExecStatusType return_val;
|
||||
|
||||
@@ -726,14 +710,14 @@ libpqwp_get_query_result(WalProposerConn *conn)
|
||||
char *unexpected_success = NULL;
|
||||
|
||||
/* Consume any input that we might be missing */
|
||||
if (!PQconsumeInput(conn->pg_conn))
|
||||
if (!PQconsumeInput(sk->conn->pg_conn))
|
||||
return WP_EXEC_FAILED;
|
||||
|
||||
if (PQisBusy(conn->pg_conn))
|
||||
if (PQisBusy(sk->conn->pg_conn))
|
||||
return WP_EXEC_NEEDS_INPUT;
|
||||
|
||||
|
||||
result = PQgetResult(conn->pg_conn);
|
||||
result = PQgetResult(sk->conn->pg_conn);
|
||||
|
||||
/*
|
||||
* PQgetResult returns NULL only if getting the result was successful &
|
||||
@@ -794,12 +778,6 @@ libpqwp_get_query_result(WalProposerConn *conn)
|
||||
return return_val;
|
||||
}
|
||||
|
||||
static WalProposerExecStatusType
|
||||
walprop_get_query_result(Safekeeper *sk)
|
||||
{
|
||||
return libpqwp_get_query_result(sk->conn);
|
||||
}
|
||||
|
||||
static pgsocket
|
||||
walprop_socket(Safekeeper *sk)
|
||||
{
|
||||
@@ -812,21 +790,38 @@ walprop_flush(Safekeeper *sk)
|
||||
return (PQflush(sk->conn->pg_conn));
|
||||
}
|
||||
|
||||
/* Like libpqrcv_receive. *buf is valid until the next call. */
|
||||
PGAsyncReadResult
|
||||
libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
static void
|
||||
walprop_finish(Safekeeper *sk)
|
||||
{
|
||||
if (!sk->conn)
|
||||
return;
|
||||
|
||||
if (sk->conn->recvbuf != NULL)
|
||||
PQfreemem(sk->conn->recvbuf);
|
||||
PQfinish(sk->conn->pg_conn);
|
||||
pfree(sk->conn);
|
||||
sk->conn = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Receive a message from the safekeeper.
|
||||
*
|
||||
* On success, the data is placed in *buf. It is valid until the next call
|
||||
* to this function.
|
||||
*/
|
||||
static PGAsyncReadResult
|
||||
walprop_async_read(Safekeeper *sk, char **buf, int *amount)
|
||||
{
|
||||
int result;
|
||||
|
||||
if (conn->recvbuf != NULL)
|
||||
if (sk->conn->recvbuf != NULL)
|
||||
{
|
||||
PQfreemem(conn->recvbuf);
|
||||
conn->recvbuf = NULL;
|
||||
PQfreemem(sk->conn->recvbuf);
|
||||
sk->conn->recvbuf = NULL;
|
||||
}
|
||||
|
||||
/* Call PQconsumeInput so that we have the data we need */
|
||||
if (!PQconsumeInput(conn->pg_conn))
|
||||
if (!PQconsumeInput(sk->conn->pg_conn))
|
||||
{
|
||||
*amount = 0;
|
||||
*buf = NULL;
|
||||
@@ -844,7 +839,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
* sometimes be triggered by the server returning an ErrorResponse (which
|
||||
* also happens to have the effect that the copy is done).
|
||||
*/
|
||||
switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
|
||||
switch (result = PQgetCopyData(sk->conn->pg_conn, &sk->conn->recvbuf, true))
|
||||
{
|
||||
case 0:
|
||||
*amount = 0;
|
||||
@@ -859,7 +854,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
* We can check PQgetResult to make sure that the server
|
||||
* failed; it'll always result in PGRES_FATAL_ERROR
|
||||
*/
|
||||
ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
|
||||
ExecStatusType status = PQresultStatus(PQgetResult(sk->conn->pg_conn));
|
||||
|
||||
if (status != PGRES_FATAL_ERROR)
|
||||
elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
|
||||
@@ -880,23 +875,11 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
default:
|
||||
/* Positive values indicate the size of the returned result */
|
||||
*amount = result;
|
||||
*buf = conn->recvbuf;
|
||||
*buf = sk->conn->recvbuf;
|
||||
return PG_ASYNC_READ_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Receive a message from the safekeeper.
|
||||
*
|
||||
* On success, the data is placed in *buf. It is valid until the next call
|
||||
* to this function.
|
||||
*/
|
||||
static PGAsyncReadResult
|
||||
walprop_async_read(Safekeeper *sk, char **buf, int *amount)
|
||||
{
|
||||
return libpqwp_async_read(sk->conn, buf, amount);
|
||||
}
|
||||
|
||||
static PGAsyncWriteResult
|
||||
walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
|
||||
{
|
||||
@@ -979,32 +962,6 @@ walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size)
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
libpqwp_disconnect(WalProposerConn *conn)
|
||||
{
|
||||
if (conn->recvbuf != NULL)
|
||||
PQfreemem(conn->recvbuf);
|
||||
PQfinish(conn->pg_conn);
|
||||
pfree(conn);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_finish(Safekeeper *sk)
|
||||
{
|
||||
if (sk->conn)
|
||||
{
|
||||
libpqwp_disconnect(sk->conn);
|
||||
sk->conn = NULL;
|
||||
}
|
||||
|
||||
/* free xlogreader */
|
||||
if (sk->xlogreader)
|
||||
{
|
||||
NeonWALReaderFree(sk->xlogreader);
|
||||
sk->xlogreader = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Subscribe for new WAL and stream it in the loop to safekeepers.
|
||||
*
|
||||
@@ -1443,41 +1400,26 @@ XLogWalPropClose(XLogRecPtr recptr)
|
||||
walpropFile = -1;
|
||||
}
|
||||
|
||||
static NeonWALReadResult
|
||||
static void
|
||||
walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count)
|
||||
{
|
||||
NeonWALReadResult res;
|
||||
WALReadError errinfo;
|
||||
|
||||
res = NeonWALRead(sk->xlogreader,
|
||||
buf,
|
||||
startptr,
|
||||
count,
|
||||
walprop_pg_get_timeline_id());
|
||||
|
||||
if (res == NEON_WALREAD_SUCCESS)
|
||||
if (!WALRead(sk->xlogreader,
|
||||
buf,
|
||||
startptr,
|
||||
count,
|
||||
walprop_pg_get_timeline_id(),
|
||||
&errinfo))
|
||||
{
|
||||
/*
|
||||
* If we have the socket subscribed, but walreader doesn't need any
|
||||
* events, it must mean that remote connection just closed hoping to
|
||||
* do next read locally. Remove the socket then. It is important to do
|
||||
* as otherwise next read might open another connection and we won't
|
||||
* be able to distinguish whether we have correct socket added in wait
|
||||
* event set.
|
||||
*/
|
||||
if (NeonWALReaderEvents(sk->xlogreader) == 0)
|
||||
rm_safekeeper_event_set(sk, false);
|
||||
WALReadRaiseError(&errinfo);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_wal_reader_allocate(Safekeeper *sk)
|
||||
{
|
||||
char log_prefix[64];
|
||||
|
||||
snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port);
|
||||
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
|
||||
sk->xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
|
||||
if (sk->xlogreader == NULL)
|
||||
elog(FATAL, "Failed to allocate xlog reader");
|
||||
}
|
||||
@@ -1496,7 +1438,6 @@ walprop_pg_free_event_set(WalProposer *wp)
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
wp->safekeeper[i].eventPos = -1;
|
||||
wp->safekeeper[i].nwrEventPos = -1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1506,35 +1447,11 @@ walprop_pg_init_event_set(WalProposer *wp)
|
||||
if (waitEvents)
|
||||
elog(FATAL, "double-initialization of event set");
|
||||
|
||||
/* for each sk, we have socket plus potentially socket for neon walreader */
|
||||
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
|
||||
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + wp->n_safekeepers);
|
||||
AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
wp->safekeeper[i].eventPos = -1;
|
||||
wp->safekeeper[i].nwrEventPos = -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* add safekeeper socket to wait event set */
|
||||
static void
|
||||
walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
Assert(sk->eventPos == -1);
|
||||
sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
|
||||
}
|
||||
|
||||
/* add neon wal reader socket to wait event set */
|
||||
static void
|
||||
add_nwr_event_set(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
Assert(sk->nwrEventPos == -1);
|
||||
sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
|
||||
elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -1546,139 +1463,10 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events)
|
||||
ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Update neon_walreader event.
|
||||
* Can be called when nwr socket doesn't exist, does nothing in this case.
|
||||
*/
|
||||
static void
|
||||
update_nwr_event_set(Safekeeper *sk, uint32 events)
|
||||
walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
/* eventPos = -1 when we don't have an event */
|
||||
if (sk->nwrEventPos != -1)
|
||||
ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
walprop_pg_active_state_update_event_set(Safekeeper *sk)
|
||||
{
|
||||
uint32 sk_events;
|
||||
uint32 nwr_events;
|
||||
|
||||
Assert(sk->state == SS_ACTIVE);
|
||||
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
|
||||
|
||||
/*
|
||||
* If we need to wait for neon_walreader, ensure we have up to date socket
|
||||
* in the wait event set.
|
||||
*/
|
||||
if (sk->active_state == SS_ACTIVE_READ_WAL)
|
||||
{
|
||||
/*
|
||||
* TODO: instead of reattaching socket (and thus recreating WES) each
|
||||
* time we should keep it if possible, i.e. if connection is already
|
||||
* established. Note that single neon_walreader object can switch
|
||||
* between local and remote reads multiple times during its lifetime,
|
||||
* so careful bookkeeping is needed here.
|
||||
*/
|
||||
rm_safekeeper_event_set(sk, false);
|
||||
add_nwr_event_set(sk, nwr_events);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Hack: we should always set 0 here, but for random reasons
|
||||
* WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least
|
||||
* some event. Since there is also no way to remove socket except
|
||||
* reconstructing the whole set, SafekeeperStateDesiredEvents instead
|
||||
* gives WL_SOCKET_CLOSED if socket exists.
|
||||
*/
|
||||
Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0);
|
||||
update_nwr_event_set(sk, WL_SOCKET_CLOSED);
|
||||
}
|
||||
walprop_pg_update_event_set(sk, sk_events);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove)
|
||||
{
|
||||
rm_safekeeper_event_set(to_remove, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* A hacky way to remove single event from the event set. Can be called if event
|
||||
* doesn't exist, does nothing in this case.
|
||||
*
|
||||
* Note: Internally, this completely reconstructs the event set. It should be
|
||||
* avoided if possible.
|
||||
*
|
||||
* If is_sk is true, socket of connection to safekeeper is removed; otherwise
|
||||
* socket of neon_walreader.
|
||||
*/
|
||||
static void
|
||||
rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
|
||||
{
|
||||
WalProposer *wp = to_remove->wp;
|
||||
|
||||
elog(DEBUG5, "sk %s:%s: removing event, is_sk %d",
|
||||
to_remove->host, to_remove->port, is_sk);
|
||||
|
||||
/*
|
||||
* Shortpath for exiting if have nothing to do. We never call this
|
||||
* function with safekeeper socket not existing, but do that with neon
|
||||
* walreader socket.
|
||||
*/
|
||||
if ((is_sk && to_remove->eventPos == -1) ||
|
||||
(!is_sk && to_remove->nwrEventPos == -1))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
/* Remove the existing event set, assign sk->eventPos = -1 */
|
||||
walprop_pg_free_event_set(wp);
|
||||
|
||||
/* Re-initialize it without adding any safekeeper events */
|
||||
wp->api.init_event_set(wp);
|
||||
|
||||
/*
|
||||
* loop through the existing safekeepers. If they aren't the one we're
|
||||
* removing, and if they have a socket we can use, re-add the applicable
|
||||
* events.
|
||||
*/
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
Safekeeper *sk = &wp->safekeeper[i];
|
||||
|
||||
if (sk == to_remove)
|
||||
{
|
||||
if (is_sk)
|
||||
sk->eventPos = -1;
|
||||
else
|
||||
sk->nwrEventPos = -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this safekeeper isn't offline, add events for it, except for the
|
||||
* event requested to remove.
|
||||
*/
|
||||
if (sk->state != SS_OFFLINE)
|
||||
{
|
||||
uint32 sk_events;
|
||||
uint32 nwr_events;
|
||||
|
||||
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
|
||||
|
||||
if (sk != to_remove || !is_sk)
|
||||
{
|
||||
/* will set sk->eventPos */
|
||||
wp->api.add_safekeeper_event_set(sk, sk_events);
|
||||
}
|
||||
else if ((sk != to_remove || is_sk) && nwr_events)
|
||||
{
|
||||
add_nwr_event_set(sk, nwr_events);
|
||||
}
|
||||
}
|
||||
}
|
||||
sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
|
||||
}
|
||||
|
||||
static int
|
||||
@@ -1944,11 +1732,10 @@ static const walproposer_api walprop_pg = {
|
||||
.recovery_download = WalProposerRecovery,
|
||||
.wal_read = walprop_pg_wal_read,
|
||||
.wal_reader_allocate = walprop_pg_wal_reader_allocate,
|
||||
.free_event_set = walprop_pg_free_event_set,
|
||||
.init_event_set = walprop_pg_init_event_set,
|
||||
.update_event_set = walprop_pg_update_event_set,
|
||||
.active_state_update_event_set = walprop_pg_active_state_update_event_set,
|
||||
.add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set,
|
||||
.rm_safekeeper_event_set = walprop_pg_rm_safekeeper_event_set,
|
||||
.wait_event_set = walprop_pg_wait_event_set,
|
||||
.strong_random = walprop_pg_strong_random,
|
||||
.get_redo_start_lsn = walprop_pg_get_redo_start_lsn,
|
||||
|
||||
121
poetry.lock
generated
121
poetry.lock
generated
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiohttp"
|
||||
@@ -98,18 +98,18 @@ speedups = ["Brotli", "aiodns", "brotlicffi"]
|
||||
|
||||
[[package]]
|
||||
name = "aiopg"
|
||||
version = "1.3.4"
|
||||
version = "1.4.0"
|
||||
description = "Postgres integration with asyncio."
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"},
|
||||
{file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"},
|
||||
{file = "aiopg-1.4.0-py3-none-any.whl", hash = "sha256:aea46e8aff30b039cfa818e6db4752c97656e893fc75e5a5dc57355a9e9dedbd"},
|
||||
{file = "aiopg-1.4.0.tar.gz", hash = "sha256:116253bef86b4d954116716d181e9a0294037f266718b2e1c9766af995639d71"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
async-timeout = ">=3.0,<5.0"
|
||||
psycopg2-binary = ">=2.8.4"
|
||||
psycopg2-binary = ">=2.9.5"
|
||||
|
||||
[package.extras]
|
||||
sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
|
||||
@@ -160,64 +160,71 @@ pluggy = ">=0.4.0"
|
||||
|
||||
[[package]]
|
||||
name = "async-timeout"
|
||||
version = "4.0.2"
|
||||
version = "4.0.3"
|
||||
description = "Timeout context manager for asyncio programs"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"},
|
||||
{file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"},
|
||||
{file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"},
|
||||
{file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "asyncpg"
|
||||
version = "0.27.0"
|
||||
version = "0.29.0"
|
||||
description = "An asyncio PostgreSQL driver"
|
||||
optional = false
|
||||
python-versions = ">=3.7.0"
|
||||
python-versions = ">=3.8.0"
|
||||
files = [
|
||||
{file = "asyncpg-0.27.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fca608d199ffed4903dce1bcd97ad0fe8260f405c1c225bdf0002709132171c2"},
|
||||
{file = "asyncpg-0.27.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:20b596d8d074f6f695c13ffb8646d0b6bb1ab570ba7b0cfd349b921ff03cfc1e"},
|
||||
{file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a6206210c869ebd3f4eb9e89bea132aefb56ff3d1b7dd7e26b102b17e27bbb1"},
|
||||
{file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a7a94c03386bb95456b12c66026b3a87d1b965f0f1e5733c36e7229f8f137747"},
|
||||
{file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bfc3980b4ba6f97138b04f0d32e8af21d6c9fa1f8e6e140c07d15690a0a99279"},
|
||||
{file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9654085f2b22f66952124de13a8071b54453ff972c25c59b5ce1173a4283ffd9"},
|
||||
{file = "asyncpg-0.27.0-cp310-cp310-win32.whl", hash = "sha256:879c29a75969eb2722f94443752f4720d560d1e748474de54ae8dd230bc4956b"},
|
||||
{file = "asyncpg-0.27.0-cp310-cp310-win_amd64.whl", hash = "sha256:ab0f21c4818d46a60ca789ebc92327d6d874d3b7ccff3963f7af0a21dc6cff52"},
|
||||
{file = "asyncpg-0.27.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:18f77e8e71e826ba2d0c3ba6764930776719ae2b225ca07e014590545928b576"},
|
||||
{file = "asyncpg-0.27.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c2232d4625c558f2aa001942cac1d7952aa9f0dbfc212f63bc754277769e1ef2"},
|
||||
{file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9a3a4ff43702d39e3c97a8786314123d314e0f0e4dabc8367db5b665c93914de"},
|
||||
{file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccddb9419ab4e1c48742457d0c0362dbdaeb9b28e6875115abfe319b29ee225d"},
|
||||
{file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:768e0e7c2898d40b16d4ef7a0b44e8150db3dd8995b4652aa1fe2902e92c7df8"},
|
||||
{file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:609054a1f47292a905582a1cfcca51a6f3f30ab9d822448693e66fdddde27920"},
|
||||
{file = "asyncpg-0.27.0-cp311-cp311-win32.whl", hash = "sha256:8113e17cfe236dc2277ec844ba9b3d5312f61bd2fdae6d3ed1c1cdd75f6cf2d8"},
|
||||
{file = "asyncpg-0.27.0-cp311-cp311-win_amd64.whl", hash = "sha256:bb71211414dd1eeb8d31ec529fe77cff04bf53efc783a5f6f0a32d84923f45cf"},
|
||||
{file = "asyncpg-0.27.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4750f5cf49ed48a6e49c6e5aed390eee367694636c2dcfaf4a273ca832c5c43c"},
|
||||
{file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:eca01eb112a39d31cc4abb93a5aef2a81514c23f70956729f42fb83b11b3483f"},
|
||||
{file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5710cb0937f696ce303f5eed6d272e3f057339bb4139378ccecafa9ee923a71c"},
|
||||
{file = "asyncpg-0.27.0-cp37-cp37m-win_amd64.whl", hash = "sha256:71cca80a056ebe19ec74b7117b09e650990c3ca535ac1c35234a96f65604192f"},
|
||||
{file = "asyncpg-0.27.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4bb366ae34af5b5cabc3ac6a5347dfb6013af38c68af8452f27968d49085ecc0"},
|
||||
{file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16ba8ec2e85d586b4a12bcd03e8d29e3d99e832764d6a1d0b8c27dbbe4a2569d"},
|
||||
{file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d20dea7b83651d93b1eb2f353511fe7fd554752844523f17ad30115d8b9c8cd6"},
|
||||
{file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e56ac8a8237ad4adec97c0cd4728596885f908053ab725e22900b5902e7f8e69"},
|
||||
{file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bf21ebf023ec67335258e0f3d3ad7b91bb9507985ba2b2206346de488267cad0"},
|
||||
{file = "asyncpg-0.27.0-cp38-cp38-win32.whl", hash = "sha256:69aa1b443a182b13a17ff926ed6627af2d98f62f2fe5890583270cc4073f63bf"},
|
||||
{file = "asyncpg-0.27.0-cp38-cp38-win_amd64.whl", hash = "sha256:62932f29cf2433988fcd799770ec64b374a3691e7902ecf85da14d5e0854d1ea"},
|
||||
{file = "asyncpg-0.27.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fddcacf695581a8d856654bc4c8cfb73d5c9df26d5f55201722d3e6a699e9629"},
|
||||
{file = "asyncpg-0.27.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7d8585707ecc6661d07367d444bbaa846b4e095d84451340da8df55a3757e152"},
|
||||
{file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:975a320baf7020339a67315284a4d3bf7460e664e484672bd3e71dbd881bc692"},
|
||||
{file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2232ebae9796d4600a7819fc383da78ab51b32a092795f4555575fc934c1c89d"},
|
||||
{file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:88b62164738239f62f4af92567b846a8ef7cf8abf53eddd83650603de4d52163"},
|
||||
{file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:eb4b2fdf88af4fb1cc569781a8f933d2a73ee82cd720e0cb4edabbaecf2a905b"},
|
||||
{file = "asyncpg-0.27.0-cp39-cp39-win32.whl", hash = "sha256:8934577e1ed13f7d2d9cea3cc016cc6f95c19faedea2c2b56a6f94f257cea672"},
|
||||
{file = "asyncpg-0.27.0-cp39-cp39-win_amd64.whl", hash = "sha256:1b6499de06fe035cf2fa932ec5617ed3f37d4ebbf663b655922e105a484a6af9"},
|
||||
{file = "asyncpg-0.27.0.tar.gz", hash = "sha256:720986d9a4705dd8a40fdf172036f5ae787225036a7eb46e704c45aa8f62c054"},
|
||||
{file = "asyncpg-0.29.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72fd0ef9f00aeed37179c62282a3d14262dbbafb74ec0ba16e1b1864d8a12169"},
|
||||
{file = "asyncpg-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52e8f8f9ff6e21f9b39ca9f8e3e33a5fcdceaf5667a8c5c32bee158e313be385"},
|
||||
{file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e6823a7012be8b68301342ba33b4740e5a166f6bbda0aee32bc01638491a22"},
|
||||
{file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:746e80d83ad5d5464cfbf94315eb6744222ab00aa4e522b704322fb182b83610"},
|
||||
{file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ff8e8109cd6a46ff852a5e6bab8b0a047d7ea42fcb7ca5ae6eaae97d8eacf397"},
|
||||
{file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:97eb024685b1d7e72b1972863de527c11ff87960837919dac6e34754768098eb"},
|
||||
{file = "asyncpg-0.29.0-cp310-cp310-win32.whl", hash = "sha256:5bbb7f2cafd8d1fa3e65431833de2642f4b2124be61a449fa064e1a08d27e449"},
|
||||
{file = "asyncpg-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:76c3ac6530904838a4b650b2880f8e7af938ee049e769ec2fba7cd66469d7772"},
|
||||
{file = "asyncpg-0.29.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4900ee08e85af01adb207519bb4e14b1cae8fd21e0ccf80fac6aa60b6da37b4"},
|
||||
{file = "asyncpg-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a65c1dcd820d5aea7c7d82a3fdcb70e096f8f70d1a8bf93eb458e49bfad036ac"},
|
||||
{file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b52e46f165585fd6af4863f268566668407c76b2c72d366bb8b522fa66f1870"},
|
||||
{file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc600ee8ef3dd38b8d67421359779f8ccec30b463e7aec7ed481c8346decf99f"},
|
||||
{file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:039a261af4f38f949095e1e780bae84a25ffe3e370175193174eb08d3cecab23"},
|
||||
{file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6feaf2d8f9138d190e5ec4390c1715c3e87b37715cd69b2c3dfca616134efd2b"},
|
||||
{file = "asyncpg-0.29.0-cp311-cp311-win32.whl", hash = "sha256:1e186427c88225ef730555f5fdda6c1812daa884064bfe6bc462fd3a71c4b675"},
|
||||
{file = "asyncpg-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfe73ffae35f518cfd6e4e5f5abb2618ceb5ef02a2365ce64f132601000587d3"},
|
||||
{file = "asyncpg-0.29.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6011b0dc29886ab424dc042bf9eeb507670a3b40aece3439944006aafe023178"},
|
||||
{file = "asyncpg-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b544ffc66b039d5ec5a7454667f855f7fec08e0dfaf5a5490dfafbb7abbd2cfb"},
|
||||
{file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d84156d5fb530b06c493f9e7635aa18f518fa1d1395ef240d211cb563c4e2364"},
|
||||
{file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54858bc25b49d1114178d65a88e48ad50cb2b6f3e475caa0f0c092d5f527c106"},
|
||||
{file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bde17a1861cf10d5afce80a36fca736a86769ab3579532c03e45f83ba8a09c59"},
|
||||
{file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:37a2ec1b9ff88d8773d3eb6d3784dc7e3fee7756a5317b67f923172a4748a175"},
|
||||
{file = "asyncpg-0.29.0-cp312-cp312-win32.whl", hash = "sha256:bb1292d9fad43112a85e98ecdc2e051602bce97c199920586be83254d9dafc02"},
|
||||
{file = "asyncpg-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:2245be8ec5047a605e0b454c894e54bf2ec787ac04b1cb7e0d3c67aa1e32f0fe"},
|
||||
{file = "asyncpg-0.29.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0009a300cae37b8c525e5b449233d59cd9868fd35431abc470a3e364d2b85cb9"},
|
||||
{file = "asyncpg-0.29.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cad1324dbb33f3ca0cd2074d5114354ed3be2b94d48ddfd88af75ebda7c43cc"},
|
||||
{file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:012d01df61e009015944ac7543d6ee30c2dc1eb2f6b10b62a3f598beb6531548"},
|
||||
{file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000c996c53c04770798053e1730d34e30cb645ad95a63265aec82da9093d88e7"},
|
||||
{file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e0bfe9c4d3429706cf70d3249089de14d6a01192d617e9093a8e941fea8ee775"},
|
||||
{file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:642a36eb41b6313ffa328e8a5c5c2b5bea6ee138546c9c3cf1bffaad8ee36dd9"},
|
||||
{file = "asyncpg-0.29.0-cp38-cp38-win32.whl", hash = "sha256:a921372bbd0aa3a5822dd0409da61b4cd50df89ae85150149f8c119f23e8c408"},
|
||||
{file = "asyncpg-0.29.0-cp38-cp38-win_amd64.whl", hash = "sha256:103aad2b92d1506700cbf51cd8bb5441e7e72e87a7b3a2ca4e32c840f051a6a3"},
|
||||
{file = "asyncpg-0.29.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5340dd515d7e52f4c11ada32171d87c05570479dc01dc66d03ee3e150fb695da"},
|
||||
{file = "asyncpg-0.29.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e17b52c6cf83e170d3d865571ba574577ab8e533e7361a2b8ce6157d02c665d3"},
|
||||
{file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f100d23f273555f4b19b74a96840aa27b85e99ba4b1f18d4ebff0734e78dc090"},
|
||||
{file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48e7c58b516057126b363cec8ca02b804644fd012ef8e6c7e23386b7d5e6ce83"},
|
||||
{file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f9ea3f24eb4c49a615573724d88a48bd1b7821c890c2effe04f05382ed9e8810"},
|
||||
{file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d36c7f14a22ec9e928f15f92a48207546ffe68bc412f3be718eedccdf10dc5c"},
|
||||
{file = "asyncpg-0.29.0-cp39-cp39-win32.whl", hash = "sha256:797ab8123ebaed304a1fad4d7576d5376c3a006a4100380fb9d517f0b59c1ab2"},
|
||||
{file = "asyncpg-0.29.0-cp39-cp39-win_amd64.whl", hash = "sha256:cce08a178858b426ae1aa8409b5cc171def45d4293626e7aa6510696d46decd8"},
|
||||
{file = "asyncpg-0.29.0.tar.gz", hash = "sha256:d1c49e1f44fffafd9a55e1a9b101590859d881d639ea2922516f5d9c512d354e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
async-timeout = {version = ">=4.0.3", markers = "python_version < \"3.12.0\""}
|
||||
|
||||
[package.extras]
|
||||
dev = ["Cython (>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "flake8 (>=5.0.4,<5.1.0)", "pytest (>=6.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "uvloop (>=0.15.3)"]
|
||||
docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
|
||||
test = ["flake8 (>=5.0.4,<5.1.0)", "uvloop (>=0.15.3)"]
|
||||
docs = ["Sphinx (>=5.3.0,<5.4.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
|
||||
test = ["flake8 (>=6.1,<7.0)", "uvloop (>=0.15.3)"]
|
||||
|
||||
[[package]]
|
||||
name = "attrs"
|
||||
@@ -2476,6 +2483,16 @@ files = [
|
||||
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
|
||||
@@ -2697,4 +2714,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "9f33b4404dbb9803ede5785469241dde1d09132427b87db8928bdbc37ccd6b7a"
|
||||
content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b"
|
||||
|
||||
@@ -4,6 +4,10 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
@@ -69,6 +73,7 @@ webpki-roots.workspace = true
|
||||
x509-parser.workspace = true
|
||||
native-tls.workspace = true
|
||||
postgres-native-tls.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
smol_str.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
@@ -78,4 +83,3 @@ tokio-util.workspace = true
|
||||
rcgen.workspace = true
|
||||
rstest.workspace = true
|
||||
tokio-postgres-rustls.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
|
||||
@@ -3,9 +3,11 @@ mod hacks;
|
||||
mod link;
|
||||
|
||||
pub use link::LinkAuthError;
|
||||
use smol_str::SmolStr;
|
||||
use tokio_postgres::config::AuthKeys;
|
||||
|
||||
use crate::auth::credentials::check_peer_addr_is_in_list;
|
||||
use crate::auth::validate_password_and_exchange;
|
||||
use crate::console::errors::GetAuthInfoError;
|
||||
use crate::console::provider::AuthInfo;
|
||||
use crate::console::AuthSecret;
|
||||
@@ -24,31 +26,12 @@ use crate::{
|
||||
};
|
||||
use futures::TryFutureExt;
|
||||
use std::borrow::Cow;
|
||||
use std::net::IpAddr;
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
/// A product of successful authentication.
|
||||
pub struct AuthSuccess<T> {
|
||||
/// Did we send [`pq_proto::BeMessage::AuthenticationOk`] to client?
|
||||
pub reported_auth_ok: bool,
|
||||
/// Something to be considered a positive result.
|
||||
pub value: T,
|
||||
}
|
||||
|
||||
impl<T> AuthSuccess<T> {
|
||||
/// Very similar to [`std::option::Option::map`].
|
||||
/// Maps [`AuthSuccess<T>`] to [`AuthSuccess<R>`] by applying
|
||||
/// a function to a contained value.
|
||||
pub fn map<R>(self, f: impl FnOnce(T) -> R) -> AuthSuccess<R> {
|
||||
AuthSuccess {
|
||||
reported_auth_ok: self.reported_auth_ok,
|
||||
value: f(self.value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This type serves two purposes:
|
||||
///
|
||||
/// * When `T` is `()`, it's just a regular auth backend selector
|
||||
@@ -61,9 +44,11 @@ pub enum BackendType<'a, T> {
|
||||
/// Current Cloud API (V2).
|
||||
Console(Cow<'a, console::provider::neon::Api>, T),
|
||||
/// Local mock of Cloud API (V2).
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(Cow<'a, console::provider::mock::Api>, T),
|
||||
/// Authentication via a web browser.
|
||||
Link(Cow<'a, url::ApiUrl>),
|
||||
#[cfg(test)]
|
||||
/// Test backend.
|
||||
Test(&'a dyn TestBackend),
|
||||
}
|
||||
@@ -78,8 +63,10 @@ impl std::fmt::Display for BackendType<'_, ()> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
|
||||
Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
|
||||
#[cfg(test)]
|
||||
Test(_) => fmt.debug_tuple("Test").finish(),
|
||||
}
|
||||
}
|
||||
@@ -92,8 +79,10 @@ impl<T> BackendType<'_, T> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(c, x) => Console(Cow::Borrowed(c), x),
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
|
||||
Link(c) => Link(Cow::Borrowed(c)),
|
||||
#[cfg(test)]
|
||||
Test(x) => Test(*x),
|
||||
}
|
||||
}
|
||||
@@ -107,8 +96,10 @@ impl<'a, T> BackendType<'a, T> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(c, x) => Console(c, f(x)),
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(c, x) => Postgres(c, f(x)),
|
||||
Link(c) => Link(c),
|
||||
#[cfg(test)]
|
||||
Test(x) => Test(x),
|
||||
}
|
||||
}
|
||||
@@ -121,51 +112,87 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(c, x) => x.map(|x| Console(c, x)),
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(c, x) => x.map(|x| Postgres(c, x)),
|
||||
Link(c) => Ok(Link(c)),
|
||||
#[cfg(test)]
|
||||
Test(x) => Ok(Test(x)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum ComputeCredentials {
|
||||
pub struct ComputeCredentials<T> {
|
||||
pub info: ComputeUserInfo,
|
||||
pub keys: T,
|
||||
}
|
||||
|
||||
pub struct ComputeUserInfoNoEndpoint {
|
||||
pub user: SmolStr,
|
||||
pub peer_addr: IpAddr,
|
||||
pub cache_key: SmolStr,
|
||||
}
|
||||
|
||||
pub struct ComputeUserInfo {
|
||||
pub endpoint: SmolStr,
|
||||
pub inner: ComputeUserInfoNoEndpoint,
|
||||
}
|
||||
|
||||
pub enum ComputeCredentialKeys {
|
||||
#[cfg(feature = "testing")]
|
||||
Password(Vec<u8>),
|
||||
AuthKeys(AuthKeys),
|
||||
}
|
||||
|
||||
impl TryFrom<ClientCredentials> for ComputeUserInfo {
|
||||
// user name
|
||||
type Error = ComputeUserInfoNoEndpoint;
|
||||
|
||||
fn try_from(creds: ClientCredentials) -> Result<Self, Self::Error> {
|
||||
let inner = ComputeUserInfoNoEndpoint {
|
||||
user: creds.user,
|
||||
peer_addr: creds.peer_addr,
|
||||
cache_key: creds.cache_key,
|
||||
};
|
||||
match creds.project {
|
||||
None => Err(inner),
|
||||
Some(endpoint) => Ok(ComputeUserInfo { endpoint, inner }),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// True to its name, this function encapsulates our current auth trade-offs.
|
||||
/// Here, we choose the appropriate auth flow based on circumstances.
|
||||
async fn auth_quirks_creds(
|
||||
///
|
||||
/// All authentication flows will emit an AuthenticationOk message if successful.
|
||||
async fn auth_quirks(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
creds: ClientCredentials,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
|
||||
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
|
||||
// If there's no project so far, that entails that client doesn't
|
||||
// support SNI or other means of passing the endpoint (project) name.
|
||||
// We now expect to see a very specific payload in the place of password.
|
||||
let maybe_success = if creds.project.is_none() {
|
||||
// Password will be checked by the compute node later.
|
||||
Some(hacks::password_hack(creds, client, latency_timer).await?)
|
||||
} else {
|
||||
None
|
||||
let (info, unauthenticated_password) = match creds.try_into() {
|
||||
Err(info) => {
|
||||
let res = hacks::password_hack_no_authentication(info, client, latency_timer).await?;
|
||||
(res.info, Some(res.keys))
|
||||
}
|
||||
Ok(info) => (info, None),
|
||||
};
|
||||
|
||||
// Password hack should set the project name.
|
||||
// TODO: make `creds.project` more type-safe.
|
||||
assert!(creds.project.is_some());
|
||||
info!("fetching user's authentication info");
|
||||
// TODO(anna): this will slow down both "hacks" below; we probably need a cache.
|
||||
let AuthInfo {
|
||||
secret,
|
||||
allowed_ips,
|
||||
} = api.get_auth_info(extra, creds).await?;
|
||||
} = api.get_auth_info(extra, &info, latency_timer).await?;
|
||||
|
||||
// check allowed list
|
||||
if !check_peer_addr_is_in_list(&creds.peer_addr.ip(), &allowed_ips) {
|
||||
if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) {
|
||||
return Err(auth::AuthError::ip_address_not_allowed());
|
||||
}
|
||||
let secret = secret.unwrap_or_else(|| {
|
||||
@@ -173,36 +200,49 @@ async fn auth_quirks_creds(
|
||||
// prevent malicious probing (possible due to missing protocol steps).
|
||||
// This mocked secret will never lead to successful authentication.
|
||||
info!("authentication info not found, mocking it");
|
||||
AuthSecret::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
|
||||
AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random()))
|
||||
});
|
||||
|
||||
if let Some(success) = maybe_success {
|
||||
return Ok(success);
|
||||
if let Some(password) = unauthenticated_password {
|
||||
let auth_outcome = validate_password_and_exchange(&password, secret)?;
|
||||
let keys = match auth_outcome {
|
||||
crate::sasl::Outcome::Success(key) => key,
|
||||
crate::sasl::Outcome::Failure(reason) => {
|
||||
info!("auth backend failed with an error: {reason}");
|
||||
return Err(auth::AuthError::auth_failed(&*info.inner.user));
|
||||
}
|
||||
};
|
||||
|
||||
// we have authenticated the password
|
||||
client.write_message_noflush(&pq_proto::BeMessage::AuthenticationOk)?;
|
||||
|
||||
return Ok(ComputeCredentials { info, keys });
|
||||
}
|
||||
|
||||
// -- the remaining flows are self-authenticating --
|
||||
|
||||
// Perform cleartext auth if we're allowed to do that.
|
||||
// Currently, we use it for websocket connections (latency).
|
||||
if allow_cleartext {
|
||||
// Password will be checked by the compute node later.
|
||||
return hacks::cleartext_hack(client, latency_timer).await;
|
||||
return hacks::authenticate_cleartext(info, client, latency_timer, secret).await;
|
||||
}
|
||||
|
||||
// Finally, proceed with the main auth flow (SCRAM-based).
|
||||
classic::authenticate(creds, client, config, latency_timer, secret).await
|
||||
classic::authenticate(info, client, config, latency_timer, secret).await
|
||||
}
|
||||
|
||||
/// True to its name, this function encapsulates our current auth trade-offs.
|
||||
/// Here, we choose the appropriate auth flow based on circumstances.
|
||||
async fn auth_quirks(
|
||||
/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
|
||||
/// only if authentication was successfuly.
|
||||
async fn auth_and_wake_compute(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
creds: ClientCredentials,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
let auth_stuff = auth_quirks_creds(
|
||||
) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
|
||||
let compute_credentials = auth_quirks(
|
||||
api,
|
||||
extra,
|
||||
creds,
|
||||
@@ -215,7 +255,9 @@ async fn auth_quirks(
|
||||
|
||||
let mut num_retries = 0;
|
||||
let mut node = loop {
|
||||
let wake_res = api.wake_compute(extra, creds).await;
|
||||
let wake_res = api
|
||||
.wake_compute(extra, &compute_credentials.info, latency_timer)
|
||||
.await;
|
||||
match handle_try_wake(wake_res, num_retries) {
|
||||
Err(e) => {
|
||||
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
|
||||
@@ -232,27 +274,27 @@ async fn auth_quirks(
|
||||
tokio::time::sleep(wait_duration).await;
|
||||
};
|
||||
|
||||
match auth_stuff.value {
|
||||
ComputeCredentials::Password(password) => node.config.password(password),
|
||||
ComputeCredentials::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
|
||||
match compute_credentials.keys {
|
||||
#[cfg(feature = "testing")]
|
||||
ComputeCredentialKeys::Password(password) => node.config.password(password),
|
||||
ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
|
||||
};
|
||||
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: auth_stuff.reported_auth_ok,
|
||||
value: node,
|
||||
})
|
||||
Ok((node, compute_credentials.info))
|
||||
}
|
||||
|
||||
impl BackendType<'_, ClientCredentials<'_>> {
|
||||
impl<'a> BackendType<'a, ClientCredentials> {
|
||||
/// Get compute endpoint name from the credentials.
|
||||
pub fn get_endpoint(&self) -> Option<String> {
|
||||
pub fn get_endpoint(&self) -> Option<SmolStr> {
|
||||
use BackendType::*;
|
||||
|
||||
match self {
|
||||
Console(_, creds) => creds.project.clone(),
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(_, creds) => creds.project.clone(),
|
||||
Link(_) => Some("link".to_owned()),
|
||||
Test(_) => Some("test".to_owned()),
|
||||
Link(_) => Some("link".into()),
|
||||
#[cfg(test)]
|
||||
Test(_) => Some("test".into()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -261,9 +303,11 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
use BackendType::*;
|
||||
|
||||
match self {
|
||||
Console(_, creds) => creds.user,
|
||||
Postgres(_, creds) => creds.user,
|
||||
Console(_, creds) => &creds.user,
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(_, creds) => &creds.user,
|
||||
Link(_) => "link",
|
||||
#[cfg(test)]
|
||||
Test(_) => "test",
|
||||
}
|
||||
}
|
||||
@@ -271,26 +315,25 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
/// Authenticate the client via the requested backend, possibly using credentials.
|
||||
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
|
||||
pub async fn authenticate(
|
||||
&mut self,
|
||||
self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> {
|
||||
use BackendType::*;
|
||||
|
||||
let res = match self {
|
||||
Console(api, creds) => {
|
||||
info!(
|
||||
user = creds.user,
|
||||
user = &*creds.user,
|
||||
project = creds.project(),
|
||||
"performing authentication using the console"
|
||||
);
|
||||
|
||||
let api = api.as_ref();
|
||||
auth_quirks(
|
||||
api,
|
||||
let (cache_info, user_info) = auth_and_wake_compute(
|
||||
&*api,
|
||||
extra,
|
||||
creds,
|
||||
client,
|
||||
@@ -298,18 +341,19 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
config,
|
||||
latency_timer,
|
||||
)
|
||||
.await?
|
||||
.await?;
|
||||
(cache_info, BackendType::Console(api, user_info))
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(api, creds) => {
|
||||
info!(
|
||||
user = creds.user,
|
||||
user = &*creds.user,
|
||||
project = creds.project(),
|
||||
"performing authentication using a local postgres instance"
|
||||
);
|
||||
|
||||
let api = api.as_ref();
|
||||
auth_quirks(
|
||||
api,
|
||||
let (cache_info, user_info) = auth_and_wake_compute(
|
||||
&*api,
|
||||
extra,
|
||||
creds,
|
||||
client,
|
||||
@@ -317,16 +361,21 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
config,
|
||||
latency_timer,
|
||||
)
|
||||
.await?
|
||||
.await?;
|
||||
(cache_info, BackendType::Postgres(api, user_info))
|
||||
}
|
||||
// NOTE: this auth backend doesn't use client credentials.
|
||||
Link(url) => {
|
||||
info!("performing link authentication");
|
||||
|
||||
link::authenticate(url, client)
|
||||
.await?
|
||||
.map(CachedNodeInfo::new_uncached)
|
||||
let node_info = link::authenticate(&url, client).await?;
|
||||
|
||||
(
|
||||
CachedNodeInfo::new_uncached(node_info),
|
||||
BackendType::Link(url),
|
||||
)
|
||||
}
|
||||
#[cfg(test)]
|
||||
Test(_) => {
|
||||
unreachable!("this function should never be called in the test backend")
|
||||
}
|
||||
@@ -335,16 +384,21 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
info!("user successfully authenticated");
|
||||
Ok(res)
|
||||
}
|
||||
}
|
||||
|
||||
impl BackendType<'_, ComputeUserInfo> {
|
||||
pub async fn get_allowed_ips(
|
||||
&self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(api, creds) => api.get_allowed_ips(extra, creds).await,
|
||||
Postgres(api, creds) => api.get_allowed_ips(extra, creds).await,
|
||||
Console(api, creds) => api.get_allowed_ips(extra, creds, latency_timer).await,
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(api, creds) => api.get_allowed_ips(extra, creds, latency_timer).await,
|
||||
Link(_) => Ok(Arc::new(vec![])),
|
||||
#[cfg(test)]
|
||||
Test(x) => x.get_allowed_ips(),
|
||||
}
|
||||
}
|
||||
@@ -354,13 +408,24 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
pub async fn wake_compute(
|
||||
&self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
|
||||
use BackendType::*;
|
||||
|
||||
match self {
|
||||
Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
|
||||
Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
|
||||
Console(api, creds) => {
|
||||
api.wake_compute(extra, creds, latency_timer)
|
||||
.map_ok(Some)
|
||||
.await
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres(api, creds) => {
|
||||
api.wake_compute(extra, creds, latency_timer)
|
||||
.map_ok(Some)
|
||||
.await
|
||||
}
|
||||
Link(_) => Ok(None),
|
||||
#[cfg(test)]
|
||||
Test(x) => x.wake_compute().map(Some),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use super::{AuthSuccess, ComputeCredentials};
|
||||
use super::{ComputeCredentials, ComputeUserInfo};
|
||||
use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
auth::{self, backend::ComputeCredentialKeys, AuthFlow},
|
||||
compute,
|
||||
config::AuthenticationConfig,
|
||||
console::AuthSecret,
|
||||
@@ -12,14 +12,15 @@ use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
|
||||
pub(super) async fn authenticate(
|
||||
creds: &ClientCredentials<'_>,
|
||||
creds: ComputeUserInfo,
|
||||
client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
config: &'static AuthenticationConfig,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
secret: AuthSecret,
|
||||
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
|
||||
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
|
||||
let flow = AuthFlow::new(client);
|
||||
let scram_keys = match secret {
|
||||
#[cfg(feature = "testing")]
|
||||
AuthSecret::Md5(_) => {
|
||||
info!("auth endpoint chooses MD5");
|
||||
return Err(auth::AuthError::bad_auth_method("MD5"));
|
||||
@@ -32,7 +33,7 @@ pub(super) async fn authenticate(
|
||||
config.scram_protocol_timeout,
|
||||
async {
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = latency_timer.pause();
|
||||
let _paused = latency_timer.wait_for_user();
|
||||
|
||||
flow.begin(scram).await.map_err(|error| {
|
||||
warn!(?error, "error sending scram acknowledgement");
|
||||
@@ -53,7 +54,7 @@ pub(super) async fn authenticate(
|
||||
sasl::Outcome::Success(key) => key,
|
||||
sasl::Outcome::Failure(reason) => {
|
||||
info!("auth backend failed with an error: {reason}");
|
||||
return Err(auth::AuthError::auth_failed(creds.user));
|
||||
return Err(auth::AuthError::auth_failed(&*creds.inner.user));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -64,9 +65,9 @@ pub(super) async fn authenticate(
|
||||
}
|
||||
};
|
||||
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: ComputeCredentials::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
|
||||
Ok(ComputeCredentials {
|
||||
info: creds,
|
||||
keys: ComputeCredentialKeys::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
|
||||
scram_keys,
|
||||
)),
|
||||
})
|
||||
|
||||
@@ -1,7 +1,11 @@
|
||||
use super::{AuthSuccess, ComputeCredentials};
|
||||
use super::{
|
||||
ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint,
|
||||
};
|
||||
use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
auth::{self, AuthFlow},
|
||||
console::AuthSecret,
|
||||
proxy::LatencyTimer,
|
||||
sasl,
|
||||
stream::{self, Stream},
|
||||
};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
@@ -11,52 +15,61 @@ use tracing::{info, warn};
|
||||
/// one round trip and *expensive* computations (>= 4096 HMAC iterations).
|
||||
/// These properties are benefical for serverless JS workers, so we
|
||||
/// use this mechanism for websocket connections.
|
||||
pub async fn cleartext_hack(
|
||||
pub async fn authenticate_cleartext(
|
||||
info: ComputeUserInfo,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
|
||||
secret: AuthSecret,
|
||||
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
|
||||
warn!("cleartext auth flow override is enabled, proceeding");
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = latency_timer.pause();
|
||||
let _paused = latency_timer.wait_for_user();
|
||||
|
||||
let password = AuthFlow::new(client)
|
||||
.begin(auth::CleartextPassword)
|
||||
let auth_outcome = AuthFlow::new(client)
|
||||
.begin(auth::CleartextPassword(secret))
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
|
||||
// Report tentative success; compute node will check the password anyway.
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: ComputeCredentials::Password(password),
|
||||
})
|
||||
let keys = match auth_outcome {
|
||||
sasl::Outcome::Success(key) => key,
|
||||
sasl::Outcome::Failure(reason) => {
|
||||
info!("auth backend failed with an error: {reason}");
|
||||
return Err(auth::AuthError::auth_failed(&*info.inner.user));
|
||||
}
|
||||
};
|
||||
|
||||
Ok(ComputeCredentials { info, keys })
|
||||
}
|
||||
|
||||
/// Workaround for clients which don't provide an endpoint (project) name.
|
||||
/// Very similar to [`cleartext_hack`], but there's a specific password format.
|
||||
pub async fn password_hack(
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
/// Similar to [`authenticate_cleartext`], but there's a specific password format,
|
||||
/// and passwords are not yet validated (we don't know how to validate them!)
|
||||
pub async fn password_hack_no_authentication(
|
||||
info: ComputeUserInfoNoEndpoint,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
|
||||
) -> auth::Result<ComputeCredentials<Vec<u8>>> {
|
||||
warn!("project not specified, resorting to the password hack auth flow");
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = latency_timer.pause();
|
||||
let _paused = latency_timer.wait_for_user();
|
||||
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::PasswordHack)
|
||||
.await?
|
||||
.authenticate()
|
||||
.get_password()
|
||||
.await?;
|
||||
|
||||
info!(project = &payload.endpoint, "received missing parameter");
|
||||
creds.project = Some(payload.endpoint);
|
||||
info!(project = &*payload.endpoint, "received missing parameter");
|
||||
|
||||
// Report tentative success; compute node will check the password anyway.
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: ComputeCredentials::Password(payload.password),
|
||||
Ok(ComputeCredentials {
|
||||
info: ComputeUserInfo {
|
||||
inner: info,
|
||||
endpoint: payload.endpoint,
|
||||
},
|
||||
keys: payload.password,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use super::AuthSuccess;
|
||||
use crate::{
|
||||
auth, compute,
|
||||
console::{self, provider::NodeInfo},
|
||||
@@ -57,7 +56,7 @@ pub fn new_psql_session_id() -> String {
|
||||
pub(super) async fn authenticate(
|
||||
link_uri: &reqwest::Url,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<AuthSuccess<NodeInfo>> {
|
||||
) -> auth::Result<NodeInfo> {
|
||||
let psql_session_id = new_psql_session_id();
|
||||
let span = info_span!("link", psql_session_id = &psql_session_id);
|
||||
let greeting = hello_message(link_uri, &psql_session_id);
|
||||
@@ -102,12 +101,9 @@ pub(super) async fn authenticate(
|
||||
config.password(password.as_ref());
|
||||
}
|
||||
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: true,
|
||||
value: NodeInfo {
|
||||
config,
|
||||
aux: db_info.aux,
|
||||
allow_self_signed_compute: false, // caller may override
|
||||
},
|
||||
Ok(NodeInfo {
|
||||
config,
|
||||
aux: db_info.aux,
|
||||
allow_self_signed_compute: false, // caller may override
|
||||
})
|
||||
}
|
||||
|
||||
@@ -7,10 +7,8 @@ use crate::{
|
||||
};
|
||||
use itertools::Itertools;
|
||||
use pq_proto::StartupMessageParams;
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
net::{IpAddr, SocketAddr},
|
||||
};
|
||||
use smol_str::SmolStr;
|
||||
use std::{collections::HashSet, net::IpAddr};
|
||||
use thiserror::Error;
|
||||
use tracing::{info, warn};
|
||||
|
||||
@@ -24,7 +22,7 @@ pub enum ClientCredsParseError {
|
||||
SNI ('{}') and project option ('{}').",
|
||||
.domain, .option,
|
||||
)]
|
||||
InconsistentProjectNames { domain: String, option: String },
|
||||
InconsistentProjectNames { domain: SmolStr, option: SmolStr },
|
||||
|
||||
#[error(
|
||||
"Common name inferred from SNI ('{}') is not known",
|
||||
@@ -33,7 +31,7 @@ pub enum ClientCredsParseError {
|
||||
UnknownCommonName { cn: String },
|
||||
|
||||
#[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
|
||||
MalformedProjectName(String),
|
||||
MalformedProjectName(SmolStr),
|
||||
}
|
||||
|
||||
impl UserFacingError for ClientCredsParseError {}
|
||||
@@ -41,34 +39,34 @@ impl UserFacingError for ClientCredsParseError {}
|
||||
/// Various client credentials which we use for authentication.
|
||||
/// Note that we don't store any kind of client key or password here.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ClientCredentials<'a> {
|
||||
pub user: &'a str,
|
||||
pub struct ClientCredentials {
|
||||
pub user: SmolStr,
|
||||
// TODO: this is a severe misnomer! We should think of a new name ASAP.
|
||||
pub project: Option<String>,
|
||||
pub project: Option<SmolStr>,
|
||||
|
||||
pub cache_key: String,
|
||||
pub peer_addr: SocketAddr,
|
||||
pub cache_key: SmolStr,
|
||||
pub peer_addr: IpAddr,
|
||||
}
|
||||
|
||||
impl ClientCredentials<'_> {
|
||||
impl ClientCredentials {
|
||||
#[inline]
|
||||
pub fn project(&self) -> Option<&str> {
|
||||
self.project.as_deref()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> ClientCredentials<'a> {
|
||||
impl ClientCredentials {
|
||||
pub fn parse(
|
||||
params: &'a StartupMessageParams,
|
||||
params: &StartupMessageParams,
|
||||
sni: Option<&str>,
|
||||
common_names: Option<HashSet<String>>,
|
||||
peer_addr: SocketAddr,
|
||||
peer_addr: IpAddr,
|
||||
) -> Result<Self, ClientCredsParseError> {
|
||||
use ClientCredsParseError::*;
|
||||
|
||||
// Some parameters are stored in the startup message.
|
||||
let get_param = |key| params.get(key).ok_or(MissingKey(key));
|
||||
let user = get_param("user")?;
|
||||
let user = get_param("user")?.into();
|
||||
|
||||
// Project name might be passed via PG's command-line options.
|
||||
let project_option = params
|
||||
@@ -82,7 +80,7 @@ impl<'a> ClientCredentials<'a> {
|
||||
.at_most_one()
|
||||
.ok()?
|
||||
})
|
||||
.map(|name| name.to_string());
|
||||
.map(|name| name.into());
|
||||
|
||||
let project_from_domain = if let Some(sni_str) = sni {
|
||||
if let Some(cn) = common_names {
|
||||
@@ -121,7 +119,7 @@ impl<'a> ClientCredentials<'a> {
|
||||
}
|
||||
.transpose()?;
|
||||
|
||||
info!(user, project = project.as_deref(), "credentials");
|
||||
info!(%user, project = project.as_deref(), "credentials");
|
||||
if sni.is_some() {
|
||||
info!("Connection with sni");
|
||||
NUM_CONNECTION_ACCEPTED_BY_SNI
|
||||
@@ -143,7 +141,8 @@ impl<'a> ClientCredentials<'a> {
|
||||
"{}{}",
|
||||
project.as_deref().unwrap_or(""),
|
||||
neon_options(params).unwrap_or("".to_string())
|
||||
);
|
||||
)
|
||||
.into();
|
||||
|
||||
Ok(Self {
|
||||
user,
|
||||
@@ -206,10 +205,10 @@ fn project_name_valid(name: &str) -> bool {
|
||||
name.chars().all(|c| c.is_alphanumeric() || c == '-')
|
||||
}
|
||||
|
||||
fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<String> {
|
||||
fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<SmolStr> {
|
||||
sni.strip_suffix(common_name)?
|
||||
.strip_suffix('.')
|
||||
.map(str::to_owned)
|
||||
.map(SmolStr::from)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -221,7 +220,7 @@ mod tests {
|
||||
fn parse_bare_minimum() -> anyhow::Result<()> {
|
||||
// According to postgresql, only `user` should be required.
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project, None);
|
||||
@@ -236,7 +235,7 @@ mod tests {
|
||||
("database", "world"), // should be ignored
|
||||
("foo", "bar"), // should be ignored
|
||||
]);
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project, None);
|
||||
@@ -251,7 +250,7 @@ mod tests {
|
||||
let sni = Some("foo.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("foo"));
|
||||
@@ -267,7 +266,7 @@ mod tests {
|
||||
("options", "-ckey=1 project=bar -c geqo=off"),
|
||||
]);
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("bar"));
|
||||
@@ -282,7 +281,7 @@ mod tests {
|
||||
("options", "-ckey=1 endpoint=bar -c geqo=off"),
|
||||
]);
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("bar"));
|
||||
@@ -300,7 +299,7 @@ mod tests {
|
||||
),
|
||||
]);
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert!(creds.project.is_none());
|
||||
@@ -315,7 +314,7 @@ mod tests {
|
||||
("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
|
||||
]);
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert!(creds.project.is_none());
|
||||
@@ -330,7 +329,7 @@ mod tests {
|
||||
let sni = Some("baz.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("baz"));
|
||||
@@ -344,13 +343,13 @@ mod tests {
|
||||
|
||||
let common_names = Some(["a.com".into(), "b.com".into()].into());
|
||||
let sni = Some("p1.a.com");
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
|
||||
assert_eq!(creds.project.as_deref(), Some("p1"));
|
||||
|
||||
let common_names = Some(["a.com".into(), "b.com".into()].into());
|
||||
let sni = Some("p1.b.com");
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
|
||||
assert_eq!(creds.project.as_deref(), Some("p1"));
|
||||
|
||||
@@ -365,7 +364,7 @@ mod tests {
|
||||
let sni = Some("second.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
|
||||
.expect_err("should fail");
|
||||
match err {
|
||||
@@ -384,7 +383,7 @@ mod tests {
|
||||
let sni = Some("project.localhost");
|
||||
let common_names = Some(["example.com".into()].into());
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
|
||||
.expect_err("should fail");
|
||||
match err {
|
||||
@@ -404,7 +403,7 @@ mod tests {
|
||||
|
||||
let sni = Some("project.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
|
||||
assert_eq!(creds.project.as_deref(), Some("project"));
|
||||
assert_eq!(
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
//! Main authentication flow.
|
||||
|
||||
use super::{AuthErrorImpl, PasswordHackPayload};
|
||||
use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload};
|
||||
use crate::{
|
||||
config::TlsServerEndPoint,
|
||||
console::AuthSecret,
|
||||
sasl, scram,
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
@@ -50,7 +51,7 @@ impl AuthMethod for PasswordHack {
|
||||
|
||||
/// Use clear-text password auth called `password` in docs
|
||||
/// <https://www.postgresql.org/docs/current/auth-password.html>
|
||||
pub struct CleartextPassword;
|
||||
pub struct CleartextPassword(pub AuthSecret);
|
||||
|
||||
impl AuthMethod for CleartextPassword {
|
||||
#[inline(always)]
|
||||
@@ -98,7 +99,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
|
||||
/// Perform user authentication. Raise an error in case authentication failed.
|
||||
pub async fn authenticate(self) -> super::Result<PasswordHackPayload> {
|
||||
pub async fn get_password(self) -> super::Result<PasswordHackPayload> {
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
let password = msg
|
||||
.strip_suffix(&[0])
|
||||
@@ -117,13 +118,19 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
|
||||
/// Perform user authentication. Raise an error in case authentication failed.
|
||||
pub async fn authenticate(self) -> super::Result<Vec<u8>> {
|
||||
pub async fn authenticate(self) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
let password = msg
|
||||
.strip_suffix(&[0])
|
||||
.ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
|
||||
|
||||
Ok(password.to_vec())
|
||||
let outcome = validate_password_and_exchange(password, self.state.0)?;
|
||||
|
||||
if let sasl::Outcome::Success(_) = &outcome {
|
||||
self.stream.write_message_noflush(&Be::AuthenticationOk)?;
|
||||
}
|
||||
|
||||
Ok(outcome)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,6 +159,49 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
))
|
||||
.await?;
|
||||
|
||||
if let sasl::Outcome::Success(_) = &outcome {
|
||||
self.stream.write_message_noflush(&Be::AuthenticationOk)?;
|
||||
}
|
||||
|
||||
Ok(outcome)
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn validate_password_and_exchange(
|
||||
password: &[u8],
|
||||
secret: AuthSecret,
|
||||
) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
|
||||
match secret {
|
||||
#[cfg(feature = "testing")]
|
||||
AuthSecret::Md5(_) => {
|
||||
// test only
|
||||
Ok(sasl::Outcome::Success(ComputeCredentialKeys::Password(
|
||||
password.to_owned(),
|
||||
)))
|
||||
}
|
||||
// perform scram authentication as both client and server to validate the keys
|
||||
AuthSecret::Scram(scram_secret) => {
|
||||
use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
|
||||
let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported());
|
||||
let outcome = crate::scram::exchange(
|
||||
&scram_secret,
|
||||
sasl_client,
|
||||
crate::config::TlsServerEndPoint::Undefined,
|
||||
)?;
|
||||
|
||||
let client_key = match outcome {
|
||||
sasl::Outcome::Success(client_key) => client_key,
|
||||
sasl::Outcome::Failure(reason) => return Ok(sasl::Outcome::Failure(reason)),
|
||||
};
|
||||
|
||||
let keys = crate::compute::ScramKeys {
|
||||
client_key: client_key.as_bytes(),
|
||||
server_key: scram_secret.server_key.as_bytes(),
|
||||
};
|
||||
|
||||
Ok(sasl::Outcome::Success(ComputeCredentialKeys::AuthKeys(
|
||||
tokio_postgres::config::AuthKeys::ScramSha256(keys),
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,9 +4,10 @@
|
||||
//! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified.
|
||||
|
||||
use bstr::ByteSlice;
|
||||
use smol_str::SmolStr;
|
||||
|
||||
pub struct PasswordHackPayload {
|
||||
pub endpoint: String,
|
||||
pub endpoint: SmolStr,
|
||||
pub password: Vec<u8>,
|
||||
}
|
||||
|
||||
@@ -18,7 +19,7 @@ impl PasswordHackPayload {
|
||||
if let Some((endpoint, password)) = bytes.split_once_str(sep) {
|
||||
let endpoint = endpoint.to_str().ok()?;
|
||||
return Some(Self {
|
||||
endpoint: parse_endpoint_param(endpoint)?.to_owned(),
|
||||
endpoint: parse_endpoint_param(endpoint)?.into(),
|
||||
password: password.to_owned(),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -30,6 +30,7 @@ use clap::{Parser, ValueEnum};
|
||||
#[derive(Clone, Debug, ValueEnum)]
|
||||
enum AuthBackend {
|
||||
Console,
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres,
|
||||
Link,
|
||||
}
|
||||
@@ -289,6 +290,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
let api = console::provider::neon::Api::new(endpoint, caches, locks);
|
||||
auth::BackendType::Console(Cow::Owned(api), ())
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
AuthBackend::Postgres => {
|
||||
let url = args.auth_endpoint.parse()?;
|
||||
let api = console::provider::mock::Api::new(url);
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
#[cfg(feature = "testing")]
|
||||
pub mod mock;
|
||||
pub mod neon;
|
||||
|
||||
use super::messages::MetricsAuxInfo;
|
||||
use crate::{
|
||||
auth::ClientCredentials,
|
||||
auth::backend::ComputeUserInfo,
|
||||
cache::{timed_lru, TimedLru},
|
||||
compute, scram,
|
||||
compute,
|
||||
proxy::LatencyTimer,
|
||||
scram,
|
||||
};
|
||||
use async_trait::async_trait;
|
||||
use dashmap::DashMap;
|
||||
@@ -205,6 +208,7 @@ pub struct ConsoleReqExtra<'a> {
|
||||
|
||||
/// Auth secret which is managed by the cloud.
|
||||
pub enum AuthSecret {
|
||||
#[cfg(feature = "testing")]
|
||||
/// Md5 hash of user's password.
|
||||
Md5([u8; 16]),
|
||||
|
||||
@@ -247,20 +251,23 @@ pub trait Api {
|
||||
async fn get_auth_info(
|
||||
&self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials,
|
||||
creds: &ComputeUserInfo,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> Result<AuthInfo, errors::GetAuthInfoError>;
|
||||
|
||||
async fn get_allowed_ips(
|
||||
&self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials,
|
||||
creds: &ComputeUserInfo,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> Result<Arc<Vec<String>>, errors::GetAuthInfoError>;
|
||||
|
||||
/// Wake up the compute node and return the corresponding connection info.
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials,
|
||||
creds: &ComputeUserInfo,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,8 @@ use super::{
|
||||
errors::{ApiError, GetAuthInfoError, WakeComputeError},
|
||||
AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
|
||||
};
|
||||
use crate::{auth::ClientCredentials, compute, error::io_error, scram, url::ApiUrl};
|
||||
use crate::proxy::LatencyTimer;
|
||||
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use thiserror::Error;
|
||||
@@ -47,7 +48,7 @@ impl Api {
|
||||
|
||||
async fn do_get_auth_info(
|
||||
&self,
|
||||
creds: &ClientCredentials<'_>,
|
||||
creds: &ComputeUserInfo,
|
||||
) -> Result<AuthInfo, GetAuthInfoError> {
|
||||
let (secret, allowed_ips) = async {
|
||||
// Perhaps we could persist this connection, but then we'd have to
|
||||
@@ -60,7 +61,7 @@ impl Api {
|
||||
let secret = match get_execute_postgres_query(
|
||||
&client,
|
||||
"select rolpassword from pg_catalog.pg_authid where rolname = $1",
|
||||
&[&creds.user],
|
||||
&[&&*creds.inner.user],
|
||||
"rolpassword",
|
||||
)
|
||||
.await?
|
||||
@@ -71,14 +72,14 @@ impl Api {
|
||||
secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5))
|
||||
}
|
||||
None => {
|
||||
warn!("user '{}' does not exist", creds.user);
|
||||
warn!("user '{}' does not exist", creds.inner.user);
|
||||
None
|
||||
}
|
||||
};
|
||||
let allowed_ips = match get_execute_postgres_query(
|
||||
&client,
|
||||
"select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
|
||||
&[&creds.project.clone().unwrap_or_default().as_str()],
|
||||
&[&creds.endpoint.as_str()],
|
||||
"allowed_ips",
|
||||
)
|
||||
.await?
|
||||
@@ -145,7 +146,8 @@ impl super::Api for Api {
|
||||
async fn get_auth_info(
|
||||
&self,
|
||||
_extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials,
|
||||
creds: &ComputeUserInfo,
|
||||
_latency_timer: &mut LatencyTimer,
|
||||
) -> Result<AuthInfo, GetAuthInfoError> {
|
||||
self.do_get_auth_info(creds).await
|
||||
}
|
||||
@@ -153,7 +155,8 @@ impl super::Api for Api {
|
||||
async fn get_allowed_ips(
|
||||
&self,
|
||||
_extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials,
|
||||
creds: &ComputeUserInfo,
|
||||
_latency_timer: &mut LatencyTimer,
|
||||
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
|
||||
Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips))
|
||||
}
|
||||
@@ -162,7 +165,8 @@ impl super::Api for Api {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
_extra: &ConsoleReqExtra<'_>,
|
||||
_creds: &ClientCredentials,
|
||||
_creds: &ComputeUserInfo,
|
||||
_latency_timer: &mut LatencyTimer,
|
||||
) -> Result<CachedNodeInfo, WakeComputeError> {
|
||||
self.do_wake_compute()
|
||||
.map_ok(CachedNodeInfo::new_uncached)
|
||||
|
||||
@@ -5,12 +5,8 @@ use super::{
|
||||
errors::{ApiError, GetAuthInfoError, WakeComputeError},
|
||||
ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
|
||||
};
|
||||
use crate::{
|
||||
auth::ClientCredentials,
|
||||
compute, http,
|
||||
proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
|
||||
scram,
|
||||
};
|
||||
use crate::proxy::{LatencyTimer, ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
|
||||
use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use itertools::Itertools;
|
||||
@@ -53,7 +49,7 @@ impl Api {
|
||||
async fn do_get_auth_info(
|
||||
&self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials<'_>,
|
||||
creds: &ComputeUserInfo,
|
||||
) -> Result<AuthInfo, GetAuthInfoError> {
|
||||
let request_id = uuid::Uuid::new_v4().to_string();
|
||||
async {
|
||||
@@ -65,8 +61,8 @@ impl Api {
|
||||
.query(&[("session_id", extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", extra.application_name),
|
||||
("project", Some(creds.project().expect("impossible"))),
|
||||
("role", Some(creds.user)),
|
||||
("project", Some(&creds.endpoint)),
|
||||
("role", Some(&creds.inner.user)),
|
||||
])
|
||||
.build()?;
|
||||
|
||||
@@ -106,9 +102,8 @@ impl Api {
|
||||
async fn do_wake_compute(
|
||||
&self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials<'_>,
|
||||
creds: &ComputeUserInfo,
|
||||
) -> Result<NodeInfo, WakeComputeError> {
|
||||
let project = creds.project().expect("impossible");
|
||||
let request_id = uuid::Uuid::new_v4().to_string();
|
||||
async {
|
||||
let request = self
|
||||
@@ -119,7 +114,7 @@ impl Api {
|
||||
.query(&[("session_id", extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", extra.application_name),
|
||||
("project", Some(project)),
|
||||
("project", Some(&creds.endpoint)),
|
||||
("options", extra.options),
|
||||
])
|
||||
.build()?;
|
||||
@@ -162,17 +157,20 @@ impl super::Api for Api {
|
||||
async fn get_auth_info(
|
||||
&self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials,
|
||||
creds: &ComputeUserInfo,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> Result<AuthInfo, GetAuthInfoError> {
|
||||
let _timer = latency_timer.control_plane();
|
||||
self.do_get_auth_info(extra, creds).await
|
||||
}
|
||||
|
||||
async fn get_allowed_ips(
|
||||
&self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials,
|
||||
creds: &ComputeUserInfo,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
|
||||
let key: &str = creds.project().expect("impossible");
|
||||
let key: &str = &creds.endpoint;
|
||||
if let Some(allowed_ips) = self.caches.allowed_ips.get(key) {
|
||||
ALLOWED_IPS_BY_CACHE_OUTCOME
|
||||
.with_label_values(&["hit"])
|
||||
@@ -182,7 +180,11 @@ impl super::Api for Api {
|
||||
ALLOWED_IPS_BY_CACHE_OUTCOME
|
||||
.with_label_values(&["miss"])
|
||||
.inc();
|
||||
|
||||
let timer = latency_timer.control_plane();
|
||||
let allowed_ips = Arc::new(self.do_get_auth_info(extra, creds).await?.allowed_ips);
|
||||
drop(timer);
|
||||
|
||||
self.caches
|
||||
.allowed_ips
|
||||
.insert(key.into(), allowed_ips.clone());
|
||||
@@ -193,9 +195,10 @@ impl super::Api for Api {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials,
|
||||
creds: &ComputeUserInfo,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> Result<CachedNodeInfo, WakeComputeError> {
|
||||
let key: &str = &creds.cache_key;
|
||||
let key: &str = &creds.inner.cache_key;
|
||||
|
||||
// Every time we do a wakeup http request, the compute node will stay up
|
||||
// for some time (highly depends on the console's scale-to-zero policy);
|
||||
@@ -219,7 +222,10 @@ impl super::Api for Api {
|
||||
}
|
||||
}
|
||||
|
||||
let timer = latency_timer.control_plane();
|
||||
let node = self.do_wake_compute(extra, creds).await?;
|
||||
drop(timer);
|
||||
|
||||
let (_, cached) = self.caches.node_info.insert(key.clone(), node);
|
||||
info!(key = &*key, "created a cache entry for compute node info");
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
mod tests;
|
||||
|
||||
use crate::{
|
||||
auth::{self, backend::AuthSuccess},
|
||||
auth,
|
||||
cancellation::{self, CancelMap},
|
||||
compute::{self, PostgresConnection},
|
||||
config::{AuthenticationConfig, ProxyConfig, TlsConfig},
|
||||
@@ -24,7 +24,7 @@ use prometheus::{
|
||||
IntGaugeVec,
|
||||
};
|
||||
use regex::Regex;
|
||||
use std::{error::Error, io, net::SocketAddr, ops::ControlFlow, sync::Arc, time::Instant};
|
||||
use std::{error::Error, io, net::IpAddr, ops::ControlFlow, sync::Arc, time::Instant};
|
||||
use tokio::{
|
||||
io::{AsyncRead, AsyncWrite, AsyncWriteExt},
|
||||
time,
|
||||
@@ -110,6 +110,19 @@ static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
static CONTROL_PLANE_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"proxy_compute_connection_control_plane_latency_seconds",
|
||||
"Time proxy spent talking to control-plane/console while trying to establish a connection to the compute endpoint",
|
||||
// http/ws/tcp, true/false, true/false, success/failure
|
||||
// 3 * 2 * 2 * 2 = 24 counters
|
||||
&["protocol", "cache_miss", "pool_miss", "outcome"],
|
||||
// largest bucket = 2^16 * 0.5ms = 32s
|
||||
exponential_buckets(0.0005, 2.0, 16).unwrap(),
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"proxy_console_request_latency",
|
||||
@@ -174,6 +187,10 @@ pub struct LatencyTimer {
|
||||
start: Option<Instant>,
|
||||
// accumulated time on the stopwatch
|
||||
accumulated: std::time::Duration,
|
||||
// time since the stopwatch was started while talking to control-plane
|
||||
start_cp: Option<Instant>,
|
||||
// accumulated time on the stopwatch while talking to control-plane
|
||||
accumulated_cp: std::time::Duration,
|
||||
// label data
|
||||
protocol: &'static str,
|
||||
cache_miss: bool,
|
||||
@@ -181,7 +198,11 @@ pub struct LatencyTimer {
|
||||
outcome: &'static str,
|
||||
}
|
||||
|
||||
pub struct LatencyTimerPause<'a> {
|
||||
pub struct LatencyTimerUserIO<'a> {
|
||||
timer: &'a mut LatencyTimer,
|
||||
}
|
||||
|
||||
pub struct LatencyTimerControlPlane<'a> {
|
||||
timer: &'a mut LatencyTimer,
|
||||
}
|
||||
|
||||
@@ -190,6 +211,8 @@ impl LatencyTimer {
|
||||
Self {
|
||||
start: Some(Instant::now()),
|
||||
accumulated: std::time::Duration::ZERO,
|
||||
start_cp: None,
|
||||
accumulated_cp: std::time::Duration::ZERO,
|
||||
protocol,
|
||||
cache_miss: false,
|
||||
// by default we don't do pooling
|
||||
@@ -199,11 +222,17 @@ impl LatencyTimer {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pause(&mut self) -> LatencyTimerPause<'_> {
|
||||
pub fn control_plane(&mut self) -> LatencyTimerControlPlane<'_> {
|
||||
// start the stopwatch again
|
||||
self.start = Some(Instant::now());
|
||||
LatencyTimerControlPlane { timer: self }
|
||||
}
|
||||
|
||||
pub fn wait_for_user(&mut self) -> LatencyTimerUserIO<'_> {
|
||||
// stop the stopwatch and record the time that we have accumulated
|
||||
let start = self.start.take().expect("latency timer should be started");
|
||||
self.accumulated += start.elapsed();
|
||||
LatencyTimerPause { timer: self }
|
||||
LatencyTimerUserIO { timer: self }
|
||||
}
|
||||
|
||||
pub fn cache_miss(&mut self) {
|
||||
@@ -219,13 +248,25 @@ impl LatencyTimer {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for LatencyTimerPause<'_> {
|
||||
impl Drop for LatencyTimerUserIO<'_> {
|
||||
fn drop(&mut self) {
|
||||
// start the stopwatch again
|
||||
self.timer.start = Some(Instant::now());
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for LatencyTimerControlPlane<'_> {
|
||||
fn drop(&mut self) {
|
||||
// stop the control-plane stopwatch and record the time that we have accumulated
|
||||
let start = self
|
||||
.timer
|
||||
.start_cp
|
||||
.take()
|
||||
.expect("latency timer should be started");
|
||||
self.timer.accumulated_cp += start.elapsed();
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for LatencyTimer {
|
||||
fn drop(&mut self) {
|
||||
let duration =
|
||||
@@ -237,7 +278,21 @@ impl Drop for LatencyTimer {
|
||||
bool_to_str(self.pool_miss),
|
||||
self.outcome,
|
||||
])
|
||||
.observe(duration.as_secs_f64())
|
||||
.observe(duration.as_secs_f64());
|
||||
|
||||
let duration_cp = self
|
||||
.start_cp
|
||||
.map(|start| start.elapsed())
|
||||
.unwrap_or_default()
|
||||
+ self.accumulated_cp;
|
||||
CONTROL_PLANE_LATENCY
|
||||
.with_label_values(&[
|
||||
self.protocol,
|
||||
bool_to_str(self.cache_miss),
|
||||
bool_to_str(self.pool_miss),
|
||||
self.outcome,
|
||||
])
|
||||
.observe(duration_cp.as_secs_f64());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -318,7 +373,7 @@ pub async fn task_main(
|
||||
.set_nodelay(true)
|
||||
.context("failed to set socket option")?;
|
||||
|
||||
handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp, peer_addr).await
|
||||
handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp, peer_addr.ip()).await
|
||||
}
|
||||
.instrument(info_span!("handle_client", ?session_id, peer_addr = tracing::field::Empty))
|
||||
.unwrap_or_else(move |e| {
|
||||
@@ -408,7 +463,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
session_id: uuid::Uuid,
|
||||
stream: S,
|
||||
mode: ClientMode,
|
||||
peer_addr: SocketAddr,
|
||||
peer_addr: IpAddr,
|
||||
) -> anyhow::Result<()> {
|
||||
info!(
|
||||
protocol = mode.protocol_label(),
|
||||
@@ -666,7 +721,7 @@ pub async fn connect_to_compute<M: ConnectMechanism>(
|
||||
mechanism: &M,
|
||||
mut node_info: console::CachedNodeInfo,
|
||||
extra: &console::ConsoleReqExtra<'_>,
|
||||
creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
|
||||
creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
|
||||
mut latency_timer: LatencyTimer,
|
||||
) -> Result<M::Connection, M::Error>
|
||||
where
|
||||
@@ -695,11 +750,17 @@ where
|
||||
info!("compute node's state has likely changed; requesting a wake-up");
|
||||
let node_info = loop {
|
||||
let wake_res = match creds {
|
||||
auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
|
||||
auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
|
||||
auth::BackendType::Console(api, creds) => {
|
||||
api.wake_compute(extra, creds, &mut latency_timer).await
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
auth::BackendType::Postgres(api, creds) => {
|
||||
api.wake_compute(extra, creds, &mut latency_timer).await
|
||||
}
|
||||
// nothing to do?
|
||||
auth::BackendType::Link(_) => return Err(err.into()),
|
||||
// test backend
|
||||
#[cfg(test)]
|
||||
auth::BackendType::Test(x) => x.wake_compute(),
|
||||
};
|
||||
|
||||
@@ -838,7 +899,6 @@ pub fn retry_after(num_retries: u32) -> time::Duration {
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn prepare_client_connection(
|
||||
node: &compute::PostgresConnection,
|
||||
reported_auth_ok: bool,
|
||||
session: cancellation::Session<'_>,
|
||||
stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -846,13 +906,6 @@ async fn prepare_client_connection(
|
||||
// The new token (cancel_key_data) will be sent to the client.
|
||||
let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone());
|
||||
|
||||
// Report authentication success if we haven't done this already.
|
||||
// Note that we do this only (for the most part) after we've connected
|
||||
// to a compute (see above) which performs its own authentication.
|
||||
if !reported_auth_ok {
|
||||
stream.write_message_noflush(&Be::AuthenticationOk)?;
|
||||
}
|
||||
|
||||
// Forward all postgres connection params to the client.
|
||||
// Right now the implementation is very hacky and inefficent (ideally,
|
||||
// we don't need an intermediate hashmap), but at least it should be correct.
|
||||
@@ -921,7 +974,7 @@ struct Client<'a, S> {
|
||||
/// The underlying libpq protocol stream.
|
||||
stream: PqStream<Stream<S>>,
|
||||
/// Client credentials that we care about.
|
||||
creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
|
||||
creds: auth::BackendType<'a, auth::ClientCredentials>,
|
||||
/// KV-dictionary with PostgreSQL connection params.
|
||||
params: &'a StartupMessageParams,
|
||||
/// Unique connection ID.
|
||||
@@ -934,7 +987,7 @@ impl<'a, S> Client<'a, S> {
|
||||
/// Construct a new connection context.
|
||||
fn new(
|
||||
stream: PqStream<Stream<S>>,
|
||||
creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
|
||||
creds: auth::BackendType<'a, auth::ClientCredentials>,
|
||||
params: &'a StartupMessageParams,
|
||||
session_id: uuid::Uuid,
|
||||
allow_self_signed_compute: bool,
|
||||
@@ -953,7 +1006,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
/// Let the client authenticate and connect to the designated compute node.
|
||||
// Instrumentation logs endpoint name everywhere. Doesn't work for link
|
||||
// auth; strictly speaking we don't know endpoint name in its case.
|
||||
#[tracing::instrument(name = "", fields(ep = self.creds.get_endpoint().unwrap_or("".to_owned())), skip_all)]
|
||||
#[tracing::instrument(name = "", fields(ep = %self.creds.get_endpoint().unwrap_or_default()), skip_all)]
|
||||
async fn connect_to_db(
|
||||
self,
|
||||
session: cancellation::Session<'_>,
|
||||
@@ -962,7 +1015,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
) -> anyhow::Result<()> {
|
||||
let Self {
|
||||
mut stream,
|
||||
mut creds,
|
||||
creds,
|
||||
params,
|
||||
session_id,
|
||||
allow_self_signed_compute,
|
||||
@@ -978,6 +1031,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
|
||||
let mut latency_timer = LatencyTimer::new(mode.protocol_label());
|
||||
|
||||
let user = creds.get_user().to_owned();
|
||||
let auth_result = match creds
|
||||
.authenticate(
|
||||
&extra,
|
||||
@@ -990,7 +1044,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
{
|
||||
Ok(auth_result) => auth_result,
|
||||
Err(e) => {
|
||||
let user = creds.get_user();
|
||||
let db = params.get("database");
|
||||
let app = params.get("application_name");
|
||||
let params_span = tracing::info_span!("", ?user, ?db, ?app);
|
||||
@@ -999,10 +1052,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
}
|
||||
};
|
||||
|
||||
let AuthSuccess {
|
||||
reported_auth_ok,
|
||||
value: mut node_info,
|
||||
} = auth_result;
|
||||
let (mut node_info, creds) = auth_result;
|
||||
|
||||
node_info.allow_self_signed_compute = allow_self_signed_compute;
|
||||
|
||||
@@ -1025,7 +1075,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
|
||||
}
|
||||
|
||||
prepare_client_connection(&node, reported_auth_ok, session, &mut stream).await?;
|
||||
prepare_client_connection(&node, session, &mut stream).await?;
|
||||
// Before proxy passing, forward to compute whatever data is left in the
|
||||
// PqStream input buffer. Normally there is none, but our serverless npm
|
||||
// driver in pipeline mode sends startup, password and first query
|
||||
|
||||
@@ -3,8 +3,7 @@
|
||||
mod mitm;
|
||||
|
||||
use super::*;
|
||||
use crate::auth::backend::TestBackend;
|
||||
use crate::auth::ClientCredentials;
|
||||
use crate::auth::backend::{ComputeUserInfo, TestBackend};
|
||||
use crate::config::CertResolver;
|
||||
use crate::console::{CachedNodeInfo, NodeInfo};
|
||||
use crate::{auth, http, sasl, scram};
|
||||
@@ -109,8 +108,9 @@ fn generate_tls_config<'a>(
|
||||
trait TestAuth: Sized {
|
||||
async fn authenticate<S: AsyncRead + AsyncWrite + Unpin + Send>(
|
||||
self,
|
||||
_stream: &mut PqStream<Stream<S>>,
|
||||
stream: &mut PqStream<Stream<S>>,
|
||||
) -> anyhow::Result<()> {
|
||||
stream.write_message_noflush(&Be::AuthenticationOk)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -168,7 +168,6 @@ async fn dummy_proxy(
|
||||
auth.authenticate(&mut stream).await?;
|
||||
|
||||
stream
|
||||
.write_message_noflush(&Be::AuthenticationOk)?
|
||||
.write_message_noflush(&Be::CLIENT_ENCODING)?
|
||||
.write_message(&Be::ReadyForQuery)
|
||||
.await?;
|
||||
@@ -486,7 +485,7 @@ fn helper_create_connect_info(
|
||||
) -> (
|
||||
CachedNodeInfo,
|
||||
console::ConsoleReqExtra<'static>,
|
||||
auth::BackendType<'_, ClientCredentials<'static>>,
|
||||
auth::BackendType<'_, ComputeUserInfo>,
|
||||
) {
|
||||
let cache = helper_create_cached_node_info();
|
||||
let extra = console::ConsoleReqExtra {
|
||||
|
||||
@@ -15,7 +15,7 @@ mod signature;
|
||||
#[cfg(any(test, doc))]
|
||||
mod password;
|
||||
|
||||
pub use exchange::Exchange;
|
||||
pub use exchange::{exchange, Exchange};
|
||||
pub use key::ScramKey;
|
||||
pub use secret::ServerSecret;
|
||||
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
//! Implementation of the SCRAM authentication algorithm.
|
||||
|
||||
use std::convert::Infallible;
|
||||
|
||||
use postgres_protocol::authentication::sasl::ScramSha256;
|
||||
|
||||
use super::messages::{
|
||||
ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
|
||||
};
|
||||
@@ -29,22 +33,27 @@ impl std::str::FromStr for TlsServerEndPoint {
|
||||
}
|
||||
}
|
||||
|
||||
struct SaslSentInner {
|
||||
cbind_flag: ChannelBinding<TlsServerEndPoint>,
|
||||
client_first_message_bare: String,
|
||||
server_first_message: OwnedServerFirstMessage,
|
||||
}
|
||||
|
||||
struct SaslInitial {
|
||||
nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN],
|
||||
}
|
||||
|
||||
enum ExchangeState {
|
||||
/// Waiting for [`ClientFirstMessage`].
|
||||
Initial,
|
||||
Initial(SaslInitial),
|
||||
/// Waiting for [`ClientFinalMessage`].
|
||||
SaltSent {
|
||||
cbind_flag: ChannelBinding<TlsServerEndPoint>,
|
||||
client_first_message_bare: String,
|
||||
server_first_message: OwnedServerFirstMessage,
|
||||
},
|
||||
SaltSent(SaslSentInner),
|
||||
}
|
||||
|
||||
/// Server's side of SCRAM auth algorithm.
|
||||
pub struct Exchange<'a> {
|
||||
state: ExchangeState,
|
||||
secret: &'a ServerSecret,
|
||||
nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN],
|
||||
tls_server_end_point: config::TlsServerEndPoint,
|
||||
}
|
||||
|
||||
@@ -55,90 +64,160 @@ impl<'a> Exchange<'a> {
|
||||
tls_server_end_point: config::TlsServerEndPoint,
|
||||
) -> Self {
|
||||
Self {
|
||||
state: ExchangeState::Initial,
|
||||
state: ExchangeState::Initial(SaslInitial { nonce }),
|
||||
secret,
|
||||
nonce,
|
||||
tls_server_end_point,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn exchange(
|
||||
secret: &ServerSecret,
|
||||
mut client: ScramSha256,
|
||||
tls_server_end_point: config::TlsServerEndPoint,
|
||||
) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
|
||||
use sasl::Step::*;
|
||||
|
||||
let init = SaslInitial {
|
||||
nonce: rand::random,
|
||||
};
|
||||
|
||||
let client_first = std::str::from_utf8(client.message())
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
|
||||
let sent = match init.transition(secret, &tls_server_end_point, client_first)? {
|
||||
Continue(sent, server_first) => {
|
||||
client.update(server_first.as_bytes())?;
|
||||
sent
|
||||
}
|
||||
Success(x, _) => match x {},
|
||||
Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
|
||||
};
|
||||
|
||||
let client_final = std::str::from_utf8(client.message())
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
|
||||
let keys = match sent.transition(secret, &tls_server_end_point, client_final)? {
|
||||
Success(keys, server_final) => {
|
||||
client.finish(server_final.as_bytes())?;
|
||||
keys
|
||||
}
|
||||
Continue(x, _) => match x {},
|
||||
Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
|
||||
};
|
||||
|
||||
Ok(sasl::Outcome::Success(keys))
|
||||
}
|
||||
|
||||
impl SaslInitial {
|
||||
fn transition(
|
||||
&self,
|
||||
secret: &ServerSecret,
|
||||
tls_server_end_point: &config::TlsServerEndPoint,
|
||||
input: &str,
|
||||
) -> sasl::Result<sasl::Step<SaslSentInner, Infallible>> {
|
||||
let client_first_message = ClientFirstMessage::parse(input)
|
||||
.ok_or(SaslError::BadClientMessage("invalid client-first-message"))?;
|
||||
|
||||
// If the flag is set to "y" and the server supports channel
|
||||
// binding, the server MUST fail authentication
|
||||
if client_first_message.cbind_flag == ChannelBinding::NotSupportedServer
|
||||
&& tls_server_end_point.supported()
|
||||
{
|
||||
return Err(SaslError::ChannelBindingFailed("SCRAM-PLUS not used"));
|
||||
}
|
||||
|
||||
let server_first_message = client_first_message.build_server_first_message(
|
||||
&(self.nonce)(),
|
||||
&secret.salt_base64,
|
||||
secret.iterations,
|
||||
);
|
||||
let msg = server_first_message.as_str().to_owned();
|
||||
|
||||
let next = SaslSentInner {
|
||||
cbind_flag: client_first_message.cbind_flag.and_then(str::parse)?,
|
||||
client_first_message_bare: client_first_message.bare.to_owned(),
|
||||
server_first_message,
|
||||
};
|
||||
|
||||
Ok(sasl::Step::Continue(next, msg))
|
||||
}
|
||||
}
|
||||
|
||||
impl SaslSentInner {
|
||||
fn transition(
|
||||
&self,
|
||||
secret: &ServerSecret,
|
||||
tls_server_end_point: &config::TlsServerEndPoint,
|
||||
input: &str,
|
||||
) -> sasl::Result<sasl::Step<Infallible, super::ScramKey>> {
|
||||
let Self {
|
||||
cbind_flag,
|
||||
client_first_message_bare,
|
||||
server_first_message,
|
||||
} = self;
|
||||
|
||||
let client_final_message = ClientFinalMessage::parse(input)
|
||||
.ok_or(SaslError::BadClientMessage("invalid client-final-message"))?;
|
||||
|
||||
let channel_binding = cbind_flag.encode(|_| match tls_server_end_point {
|
||||
config::TlsServerEndPoint::Sha256(x) => Ok(x),
|
||||
config::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding),
|
||||
})?;
|
||||
|
||||
// This might've been caused by a MITM attack
|
||||
if client_final_message.channel_binding != channel_binding {
|
||||
return Err(SaslError::ChannelBindingFailed(
|
||||
"insecure connection: secure channel data mismatch",
|
||||
));
|
||||
}
|
||||
|
||||
if client_final_message.nonce != server_first_message.nonce() {
|
||||
return Err(SaslError::BadClientMessage("combined nonce doesn't match"));
|
||||
}
|
||||
|
||||
let signature_builder = SignatureBuilder {
|
||||
client_first_message_bare,
|
||||
server_first_message: server_first_message.as_str(),
|
||||
client_final_message_without_proof: client_final_message.without_proof,
|
||||
};
|
||||
|
||||
let client_key = signature_builder
|
||||
.build(&secret.stored_key)
|
||||
.derive_client_key(&client_final_message.proof);
|
||||
|
||||
// Auth fails either if keys don't match or it's pre-determined to fail.
|
||||
if client_key.sha256() != secret.stored_key || secret.doomed {
|
||||
return Ok(sasl::Step::Failure("password doesn't match"));
|
||||
}
|
||||
|
||||
let msg =
|
||||
client_final_message.build_server_final_message(signature_builder, &secret.server_key);
|
||||
|
||||
Ok(sasl::Step::Success(client_key, msg))
|
||||
}
|
||||
}
|
||||
|
||||
impl sasl::Mechanism for Exchange<'_> {
|
||||
type Output = super::ScramKey;
|
||||
|
||||
fn exchange(mut self, input: &str) -> sasl::Result<sasl::Step<Self, Self::Output>> {
|
||||
use {sasl::Step::*, ExchangeState::*};
|
||||
match &self.state {
|
||||
Initial => {
|
||||
let client_first_message = ClientFirstMessage::parse(input)
|
||||
.ok_or(SaslError::BadClientMessage("invalid client-first-message"))?;
|
||||
|
||||
// If the flag is set to "y" and the server supports channel
|
||||
// binding, the server MUST fail authentication
|
||||
if client_first_message.cbind_flag == ChannelBinding::NotSupportedServer
|
||||
&& self.tls_server_end_point.supported()
|
||||
{
|
||||
return Err(SaslError::ChannelBindingFailed("SCRAM-PLUS not used"));
|
||||
Initial(init) => {
|
||||
match init.transition(self.secret, &self.tls_server_end_point, input)? {
|
||||
Continue(sent, msg) => {
|
||||
self.state = SaltSent(sent);
|
||||
Ok(Continue(self, msg))
|
||||
}
|
||||
Success(x, _) => match x {},
|
||||
Failure(msg) => Ok(Failure(msg)),
|
||||
}
|
||||
|
||||
let server_first_message = client_first_message.build_server_first_message(
|
||||
&(self.nonce)(),
|
||||
&self.secret.salt_base64,
|
||||
self.secret.iterations,
|
||||
);
|
||||
let msg = server_first_message.as_str().to_owned();
|
||||
|
||||
self.state = SaltSent {
|
||||
cbind_flag: client_first_message.cbind_flag.and_then(str::parse)?,
|
||||
client_first_message_bare: client_first_message.bare.to_owned(),
|
||||
server_first_message,
|
||||
};
|
||||
|
||||
Ok(Continue(self, msg))
|
||||
}
|
||||
SaltSent {
|
||||
cbind_flag,
|
||||
client_first_message_bare,
|
||||
server_first_message,
|
||||
} => {
|
||||
let client_final_message = ClientFinalMessage::parse(input)
|
||||
.ok_or(SaslError::BadClientMessage("invalid client-final-message"))?;
|
||||
|
||||
let channel_binding = cbind_flag.encode(|_| match &self.tls_server_end_point {
|
||||
config::TlsServerEndPoint::Sha256(x) => Ok(x),
|
||||
config::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding),
|
||||
})?;
|
||||
|
||||
// This might've been caused by a MITM attack
|
||||
if client_final_message.channel_binding != channel_binding {
|
||||
return Err(SaslError::ChannelBindingFailed(
|
||||
"insecure connection: secure channel data mismatch",
|
||||
));
|
||||
SaltSent(sent) => {
|
||||
match sent.transition(self.secret, &self.tls_server_end_point, input)? {
|
||||
Success(keys, msg) => Ok(Success(keys, msg)),
|
||||
Continue(x, _) => match x {},
|
||||
Failure(msg) => Ok(Failure(msg)),
|
||||
}
|
||||
|
||||
if client_final_message.nonce != server_first_message.nonce() {
|
||||
return Err(SaslError::BadClientMessage("combined nonce doesn't match"));
|
||||
}
|
||||
|
||||
let signature_builder = SignatureBuilder {
|
||||
client_first_message_bare,
|
||||
server_first_message: server_first_message.as_str(),
|
||||
client_final_message_without_proof: client_final_message.without_proof,
|
||||
};
|
||||
|
||||
let client_key = signature_builder
|
||||
.build(&self.secret.stored_key)
|
||||
.derive_client_key(&client_final_message.proof);
|
||||
|
||||
// Auth fails either if keys don't match or it's pre-determined to fail.
|
||||
if client_key.sha256() != self.secret.stored_key || self.secret.doomed {
|
||||
return Ok(Failure("password doesn't match"));
|
||||
}
|
||||
|
||||
let msg = client_final_message
|
||||
.build_server_final_message(signature_builder, &self.secret.server_key);
|
||||
|
||||
Ok(Success(client_key, msg))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ use hyper::{
|
||||
Body, Method, Request, Response,
|
||||
};
|
||||
|
||||
use std::net::SocketAddr;
|
||||
use std::net::IpAddr;
|
||||
use std::task::Poll;
|
||||
use std::{future::ready, sync::Arc};
|
||||
use tls_listener::TlsListener;
|
||||
@@ -103,7 +103,13 @@ pub async fn task_main(
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
|
||||
request_handler(
|
||||
req, config, conn_pool, cancel_map, session_id, sni_name, peer_addr,
|
||||
req,
|
||||
config,
|
||||
conn_pool,
|
||||
cancel_map,
|
||||
session_id,
|
||||
sni_name,
|
||||
peer_addr.ip(),
|
||||
)
|
||||
.instrument(info_span!(
|
||||
"serverless",
|
||||
@@ -171,7 +177,7 @@ async fn request_handler(
|
||||
cancel_map: Arc<CancelMap>,
|
||||
session_id: uuid::Uuid,
|
||||
sni_hostname: Option<String>,
|
||||
peer_addr: SocketAddr,
|
||||
peer_addr: IpAddr,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let host = request
|
||||
.headers()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use anyhow::Context;
|
||||
use anyhow::{anyhow, Context};
|
||||
use async_trait::async_trait;
|
||||
use dashmap::DashMap;
|
||||
use futures::future::poll_fn;
|
||||
@@ -9,7 +9,7 @@ use pbkdf2::{
|
||||
};
|
||||
use pq_proto::StartupMessageParams;
|
||||
use smol_str::SmolStr;
|
||||
use std::{collections::HashMap, net::SocketAddr, sync::Arc};
|
||||
use std::{collections::HashMap, net::IpAddr, sync::Arc};
|
||||
use std::{
|
||||
fmt,
|
||||
task::{ready, Poll},
|
||||
@@ -22,7 +22,7 @@ use tokio::time;
|
||||
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
|
||||
|
||||
use crate::{
|
||||
auth::{self, check_peer_addr_is_in_list},
|
||||
auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
|
||||
console,
|
||||
proxy::{
|
||||
neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,
|
||||
@@ -146,7 +146,7 @@ impl GlobalConnPool {
|
||||
conn_info: &ConnInfo,
|
||||
force_new: bool,
|
||||
session_id: uuid::Uuid,
|
||||
peer_addr: SocketAddr,
|
||||
peer_addr: IpAddr,
|
||||
) -> anyhow::Result<Client> {
|
||||
let mut client: Option<ClientInner> = None;
|
||||
let mut latency_timer = LatencyTimer::new("http");
|
||||
@@ -405,8 +405,8 @@ async fn connect_to_compute(
|
||||
conn_info: &ConnInfo,
|
||||
conn_id: uuid::Uuid,
|
||||
session_id: uuid::Uuid,
|
||||
latency_timer: LatencyTimer,
|
||||
peer_addr: SocketAddr,
|
||||
mut latency_timer: LatencyTimer,
|
||||
peer_addr: IpAddr,
|
||||
) -> anyhow::Result<ClientInner> {
|
||||
let tls = config.tls_config.as_ref();
|
||||
let common_names = tls.and_then(|tls| tls.common_names.clone());
|
||||
@@ -423,6 +423,9 @@ async fn connect_to_compute(
|
||||
common_names,
|
||||
peer_addr,
|
||||
)?;
|
||||
|
||||
let creds =
|
||||
ComputeUserInfo::try_from(creds).map_err(|_| anyhow!("missing endpoint identifier"))?;
|
||||
let backend = config.auth_backend.as_ref().map(|_| creds);
|
||||
|
||||
let console_options = neon_options(¶ms);
|
||||
@@ -434,13 +437,13 @@ async fn connect_to_compute(
|
||||
};
|
||||
// TODO(anna): this is a bit hacky way, consider using console notification listener.
|
||||
if !config.disable_ip_check_for_http {
|
||||
let allowed_ips = backend.get_allowed_ips(&extra).await?;
|
||||
if !check_peer_addr_is_in_list(&peer_addr.ip(), &allowed_ips) {
|
||||
let allowed_ips = backend.get_allowed_ips(&extra, &mut latency_timer).await?;
|
||||
if !check_peer_addr_is_in_list(&peer_addr, &allowed_ips) {
|
||||
return Err(auth::AuthError::ip_address_not_allowed().into());
|
||||
}
|
||||
}
|
||||
let node_info = backend
|
||||
.wake_compute(&extra)
|
||||
.wake_compute(&extra, &mut latency_timer)
|
||||
.await?
|
||||
.context("missing cache entry from wake_compute")?;
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::net::SocketAddr;
|
||||
use std::net::IpAddr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::bail;
|
||||
@@ -202,7 +202,7 @@ pub async fn handle(
|
||||
sni_hostname: Option<String>,
|
||||
conn_pool: Arc<GlobalConnPool>,
|
||||
session_id: uuid::Uuid,
|
||||
peer_addr: SocketAddr,
|
||||
peer_addr: IpAddr,
|
||||
config: &'static HttpConfig,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let result = tokio::time::timeout(
|
||||
@@ -301,7 +301,7 @@ async fn handle_inner(
|
||||
sni_hostname: Option<String>,
|
||||
conn_pool: Arc<GlobalConnPool>,
|
||||
session_id: uuid::Uuid,
|
||||
peer_addr: SocketAddr,
|
||||
peer_addr: IpAddr,
|
||||
) -> anyhow::Result<Response<Body>> {
|
||||
NUM_CONNECTIONS_ACCEPTED_COUNTER
|
||||
.with_label_values(&["http"])
|
||||
|
||||
@@ -11,7 +11,7 @@ use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
|
||||
use pin_project_lite::pin_project;
|
||||
|
||||
use std::{
|
||||
net::SocketAddr,
|
||||
net::IpAddr,
|
||||
pin::Pin,
|
||||
task::{ready, Context, Poll},
|
||||
};
|
||||
@@ -133,7 +133,7 @@ pub async fn serve_websocket(
|
||||
cancel_map: &CancelMap,
|
||||
session_id: uuid::Uuid,
|
||||
hostname: Option<String>,
|
||||
peer_addr: SocketAddr,
|
||||
peer_addr: IpAddr,
|
||||
) -> anyhow::Result<()> {
|
||||
let websocket = websocket.await?;
|
||||
handle_client(
|
||||
|
||||
@@ -12,8 +12,8 @@ typing-extensions = "^4.6.1"
|
||||
PyJWT = {version = "^2.1.0", extras = ["crypto"]}
|
||||
requests = "^2.31.0"
|
||||
pytest-xdist = "^3.3.1"
|
||||
asyncpg = "^0.27.0"
|
||||
aiopg = "^1.3.1"
|
||||
asyncpg = "^0.29.0"
|
||||
aiopg = "^1.4.0"
|
||||
Jinja2 = "^3.0.2"
|
||||
types-requests = "^2.31.0.0"
|
||||
types-psycopg2 = "^2.9.21.10"
|
||||
|
||||
@@ -35,6 +35,7 @@ serde_with.workspace = true
|
||||
signal-hook.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["fs"] }
|
||||
tokio-util = { workspace = true }
|
||||
tokio-io-timeout.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
toml_edit.workspace = true
|
||||
|
||||
@@ -35,6 +35,9 @@ use once_cell::sync::OnceCell;
|
||||
const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
|
||||
const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
|
||||
|
||||
/// Default buffer size when interfacing with [`tokio::fs::File`].
|
||||
const BUFFER_SIZE: usize = 32 * 1024;
|
||||
|
||||
/// Check whether wal backup is required for timeline. If yes, mark that launcher is
|
||||
/// aware of current status and return the timeline.
|
||||
async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
|
||||
@@ -494,15 +497,13 @@ async fn backup_object(
|
||||
.as_ref()
|
||||
.unwrap();
|
||||
|
||||
let file = tokio::io::BufReader::new(
|
||||
File::open(&source_file)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file {} for wal backup", source_file))?,
|
||||
);
|
||||
|
||||
storage
|
||||
.upload_storage_object(Box::new(file), size, target_file)
|
||||
let file = File::open(&source_file)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
|
||||
|
||||
let file = tokio_util::io::ReaderStream::with_capacity(file, BUFFER_SIZE);
|
||||
|
||||
storage.upload_storage_object(file, size, target_file).await
|
||||
}
|
||||
|
||||
pub async fn read_object(
|
||||
@@ -524,5 +525,9 @@ pub async fn read_object(
|
||||
format!("Failed to open WAL segment download stream for remote path {file_path:?}")
|
||||
})?;
|
||||
|
||||
Ok(download.download_stream)
|
||||
let reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
let reader = tokio::io::BufReader::with_capacity(BUFFER_SIZE, reader);
|
||||
|
||||
Ok(Box::pin(reader))
|
||||
}
|
||||
|
||||
@@ -363,12 +363,6 @@ class PgProtocol:
|
||||
result.append(cur.fetchall())
|
||||
return result
|
||||
|
||||
def safe_psql_scalar(self, query) -> Any:
|
||||
"""
|
||||
Execute query returning single row with single column.
|
||||
"""
|
||||
return self.safe_psql(query)[0][0]
|
||||
|
||||
|
||||
@dataclass
|
||||
class AuthKeys:
|
||||
@@ -2687,13 +2681,6 @@ class Endpoint(PgProtocol):
|
||||
):
|
||||
self.stop()
|
||||
|
||||
# Checkpoints running endpoint and returns pg_wal size in MB.
|
||||
def get_pg_wal_size(self):
|
||||
log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
|
||||
self.safe_psql("checkpoint")
|
||||
assert self.pgdata_dir is not None # please mypy
|
||||
return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024
|
||||
|
||||
|
||||
class EndpointFactory:
|
||||
"""An object representing multiple compute endpoints."""
|
||||
@@ -2890,13 +2877,6 @@ class Safekeeper:
|
||||
return segments
|
||||
|
||||
|
||||
# Walreceiver as returned by sk's timeline status endpoint.
|
||||
@dataclass
|
||||
class Walreceiver:
|
||||
conn_id: int
|
||||
state: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class SafekeeperTimelineStatus:
|
||||
acceptor_epoch: int
|
||||
@@ -2907,7 +2887,6 @@ class SafekeeperTimelineStatus:
|
||||
backup_lsn: Lsn
|
||||
peer_horizon_lsn: Lsn
|
||||
remote_consistent_lsn: Lsn
|
||||
walreceivers: List[Walreceiver]
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -2969,7 +2948,6 @@ class SafekeeperHttpClient(requests.Session):
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
|
||||
res.raise_for_status()
|
||||
resj = res.json()
|
||||
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
|
||||
return SafekeeperTimelineStatus(
|
||||
acceptor_epoch=resj["acceptor_state"]["epoch"],
|
||||
pg_version=resj["pg_info"]["pg_version"],
|
||||
@@ -2979,7 +2957,6 @@ class SafekeeperHttpClient(requests.Session):
|
||||
backup_lsn=Lsn(resj["backup_lsn"]),
|
||||
peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
|
||||
remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
|
||||
walreceivers=walreceivers,
|
||||
)
|
||||
|
||||
def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
|
||||
|
||||
@@ -55,7 +55,7 @@ def do_gc_target(
|
||||
class ReattachMode(str, enum.Enum):
|
||||
REATTACH_EXPLICIT = "explicit"
|
||||
REATTACH_RESET = "reset"
|
||||
REATTACH_RESET_DROP = "reset"
|
||||
REATTACH_RESET_DROP = "reset_drop"
|
||||
|
||||
|
||||
# Basic detach and re-attach test
|
||||
|
||||
@@ -422,8 +422,7 @@ def wait(f, desc, timeout=30, wait_f=None):
|
||||
try:
|
||||
if f():
|
||||
break
|
||||
except Exception as e:
|
||||
log.info(f"got exception while waiting for {desc}: {e}")
|
||||
except Exception:
|
||||
pass
|
||||
elapsed = time.time() - started_at
|
||||
if elapsed > timeout:
|
||||
@@ -1007,40 +1006,8 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
|
||||
endpoint.start()
|
||||
|
||||
|
||||
# Context manager which logs passed time on exit.
|
||||
class DurationLogger:
|
||||
def __init__(self, desc):
|
||||
self.desc = desc
|
||||
|
||||
def __enter__(self):
|
||||
self.ts_before = time.time()
|
||||
|
||||
def __exit__(self, *exc):
|
||||
log.info(f"{self.desc} finished in {time.time() - self.ts_before}s")
|
||||
|
||||
|
||||
# Context manager which logs WAL position change on exit.
|
||||
class WalChangeLogger:
|
||||
def __init__(self, ep, desc_before):
|
||||
self.ep = ep
|
||||
self.desc_before = desc_before
|
||||
|
||||
def __enter__(self):
|
||||
self.ts_before = time.time()
|
||||
self.lsn_before = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()"))
|
||||
log.info(f"{self.desc_before}, lsn_before={self.lsn_before}")
|
||||
|
||||
def __exit__(self, *exc):
|
||||
lsn_after = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()"))
|
||||
log.info(
|
||||
f"inserted {((lsn_after - self.lsn_before) / 1024 / 1024):.3f} MB of WAL in {(time.time() - self.ts_before):.3f}s"
|
||||
)
|
||||
|
||||
|
||||
# Test that we can create timeline with one safekeeper down and initialize it
|
||||
# later when some data already had been written. It is strictly weaker than
|
||||
# test_lagging_sk, but also is the simplest test to trigger WAL sk -> compute
|
||||
# download (recovery) and as such useful for development/testing.
|
||||
# later when some data already had been written.
|
||||
def test_late_init(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -1048,13 +1015,12 @@ def test_late_init(neon_env_builder: NeonEnvBuilder):
|
||||
sk1 = env.safekeepers[0]
|
||||
sk1.stop()
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.neon_cli.create_branch("test_late_init")
|
||||
endpoint = env.endpoints.create_start("test_late_init")
|
||||
# create and insert smth while safekeeper is down...
|
||||
env.neon_cli.create_branch("test_late_init")
|
||||
endpoint = env.endpoints.create_start("test_late_init")
|
||||
endpoint.safe_psql("create table t(key int, value text)")
|
||||
with WalChangeLogger(endpoint, "doing insert with sk1 down"):
|
||||
endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
|
||||
endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
|
||||
log.info("insert with safekeeper down done")
|
||||
endpoint.stop() # stop compute
|
||||
|
||||
# stop another safekeeper, and start one which missed timeline creation
|
||||
@@ -1063,213 +1029,28 @@ def test_late_init(neon_env_builder: NeonEnvBuilder):
|
||||
sk1.start()
|
||||
|
||||
# insert some more
|
||||
with DurationLogger("recovery"):
|
||||
endpoint = env.endpoints.create_start("test_late_init")
|
||||
endpoint = env.endpoints.create_start("test_late_init")
|
||||
endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'")
|
||||
|
||||
wait_flush_lsn_align_by_ep(
|
||||
env, "test_late_init", tenant_id, timeline_id, endpoint, [sk1, env.safekeepers[2]]
|
||||
)
|
||||
# Check that WALs are the same.
|
||||
cmp_sk_wal([sk1, env.safekeepers[2]], tenant_id, timeline_id)
|
||||
|
||||
|
||||
# is timeline flush_lsn equal on provided safekeepers?
|
||||
def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id):
|
||||
flush_lsns = [
|
||||
sk_http_cli.timeline_status(tenant_id, timeline_id).flush_lsn
|
||||
for sk_http_cli in sk_http_clis
|
||||
]
|
||||
log.info(f"waiting for flush_lsn alignment, flush_lsns={flush_lsns}")
|
||||
return all([flush_lsns[0] == flsn for flsn in flush_lsns])
|
||||
|
||||
|
||||
def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
status = sk_http_cli.timeline_status(tenant_id, timeline_id)
|
||||
log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
|
||||
return len(status.walreceivers) == 0
|
||||
|
||||
|
||||
# Assert by xxd that WAL on given safekeepers is identical. No compute must be
|
||||
# running for this to be reliable.
|
||||
def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId):
|
||||
assert len(sks) >= 2, "cmp_sk_wal makes sense with >= 2 safekeepers passed"
|
||||
sk_http_clis = [sk.http_client() for sk in sks]
|
||||
|
||||
# First check that term / flush_lsn are the same: it is easier to
|
||||
# report/understand if WALs are different due to that.
|
||||
statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis]
|
||||
term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses]
|
||||
for tfl, sk in zip(term_flush_lsns[1:], sks[1:]):
|
||||
assert (
|
||||
term_flush_lsns[0] == tfl
|
||||
), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}"
|
||||
|
||||
# check that WALs are identic.
|
||||
segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks]
|
||||
for cmp_segs, sk in zip(segs[1:], sks[1:]):
|
||||
assert (
|
||||
segs[0] == cmp_segs
|
||||
), f"lists of segments on sks {sks[0].id} and {sk.id} are not identic: {segs[0]} and {cmp_segs}"
|
||||
log.info(f"comparing segs {segs[0]}")
|
||||
|
||||
sk0 = sks[0]
|
||||
for sk in sks[1:]:
|
||||
(_, mismatch, not_regular) = filecmp.cmpfiles(
|
||||
sk0.timeline_dir(tenant_id, timeline_id),
|
||||
sk.timeline_dir(tenant_id, timeline_id),
|
||||
segs[0],
|
||||
shallow=False,
|
||||
)
|
||||
log.info(
|
||||
f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
|
||||
)
|
||||
|
||||
for f in mismatch:
|
||||
f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f)
|
||||
f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f)
|
||||
stdout_filename = "{}.filediff".format(f2)
|
||||
|
||||
with open(stdout_filename, "w") as stdout_f:
|
||||
subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
|
||||
subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)
|
||||
|
||||
cmd = "diff {}.hex {}.hex".format(f1, f2)
|
||||
subprocess.run([cmd], stdout=stdout_f, shell=True)
|
||||
|
||||
assert (mismatch, not_regular) == (
|
||||
[],
|
||||
[],
|
||||
), f"WAL segs {f1} and {f2} on sks {sks[0].id} and {sk.id} are not identic"
|
||||
|
||||
|
||||
# Wait until flush_lsn on given sks becomes equal, assuming endpoint ep is
|
||||
# running. ep is stopped by this function. This is used in tests which check
|
||||
# binary equality of WAL segments on safekeepers; which is inherently racy as
|
||||
# shutting down endpoint might always write some WAL which can get to only one
|
||||
# safekeeper. So here we recheck flush_lsn again after ep shutdown and retry if
|
||||
# it has changed.
|
||||
def wait_flush_lsn_align_by_ep(env, branch, tenant_id, timeline_id, ep, sks):
|
||||
sk_http_clis = [sk.http_client() for sk in sks]
|
||||
# First wait for the alignment.
|
||||
wait(
|
||||
partial(is_flush_lsn_aligned, sk_http_clis, tenant_id, timeline_id),
|
||||
"flush_lsn to get aligned",
|
||||
def is_flush_lsn_aligned(sk1_http_cli, sk2_http_cli, tenant_id, timeline_id):
|
||||
status1 = sk1_http_cli.timeline_status(tenant_id, timeline_id)
|
||||
status2 = sk2_http_cli.timeline_status(tenant_id, timeline_id)
|
||||
log.info(
|
||||
f"waiting for flush_lsn alignment, sk1.flush_lsn={status1.flush_lsn}, sk2.flush_lsn={status2.flush_lsn}"
|
||||
)
|
||||
ep.stop() # then stop endpoint
|
||||
# Even if there is no compute, there might be some in flight data; ensure
|
||||
# all walreceivers die before rechecking.
|
||||
for sk_http_cli in sk_http_clis:
|
||||
wait(
|
||||
partial(are_walreceivers_absent, sk_http_cli, tenant_id, timeline_id),
|
||||
"walreceivers to be gone",
|
||||
)
|
||||
# Now recheck again flush_lsn and exit if it is good
|
||||
if is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id):
|
||||
return
|
||||
# Otherwise repeat.
|
||||
log.info("flush_lsn changed during endpoint shutdown; retrying alignment")
|
||||
ep = env.endpoints.create_start(branch)
|
||||
return status1.flush_lsn == status2.flush_lsn
|
||||
|
||||
|
||||
# Test behaviour with one safekeeper down and missing a lot of WAL, exercising
|
||||
# neon_walreader and checking that pg_wal never bloats. Namely, ensures that
|
||||
# compute doesn't keep many WAL for lagging sk, but still can recover it with
|
||||
# neon_walreader, in two scenarious: a) WAL never existed on compute (it started
|
||||
# on basebackup LSN later than lagging sk position) though segment file exists
|
||||
# b) WAL had been recycled on it and segment file doesn't exist.
|
||||
#
|
||||
# Also checks along the way that whenever there are two sks alive, compute
|
||||
# should be able to commit.
|
||||
def test_lagging_sk(neon_env_builder: NeonEnvBuilder):
|
||||
# inserts ~20MB of WAL, a bit more than a segment.
|
||||
def fill_segment(ep):
|
||||
ep.safe_psql("insert into t select generate_series(1, 180000), 'payload'")
|
||||
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
(sk1, sk2, sk3) = env.safekeepers
|
||||
|
||||
# create and insert smth while safekeeper is down...
|
||||
sk1.stop()
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.neon_cli.create_branch("test_lagging_sk")
|
||||
ep = env.endpoints.create_start("test_lagging_sk")
|
||||
ep.safe_psql("create table t(key int, value text)")
|
||||
# make small insert to be on the same segment
|
||||
ep.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
|
||||
log.info("insert with safekeeper down done")
|
||||
ep.stop() # stop compute
|
||||
|
||||
# Stop another safekeeper, and start one which missed timeline creation.
|
||||
sk2.stop()
|
||||
sk1.start()
|
||||
|
||||
# Start new ep and insert some more. neon_walreader should download WAL for
|
||||
# sk1 because it should be filled since the horizon (initial LSN) which is
|
||||
# earlier than basebackup LSN.
|
||||
ep = env.endpoints.create_start("test_lagging_sk")
|
||||
ep.safe_psql("insert into t select generate_series(1,100), 'payload'")
|
||||
# stop ep and ensure WAL is identical after recovery.
|
||||
wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3])
|
||||
# Check that WALs are the same.
|
||||
cmp_sk_wal([sk1, sk3], tenant_id, timeline_id)
|
||||
|
||||
# Now repeat insertion with sk1 down, but with inserting more data to check
|
||||
# that WAL on compute is removed.
|
||||
sk1.stop()
|
||||
sk2.start()
|
||||
|
||||
# min_wal_size must be at least 2x segment size.
|
||||
min_wal_config = [
|
||||
"min_wal_size=32MB",
|
||||
"max_wal_size=32MB",
|
||||
"wal_keep_size=0",
|
||||
"log_checkpoints=on",
|
||||
]
|
||||
ep = env.endpoints.create_start(
|
||||
"test_lagging_sk",
|
||||
config_lines=min_wal_config,
|
||||
)
|
||||
with WalChangeLogger(ep, "doing large insert with sk1 down"):
|
||||
for _ in range(0, 5):
|
||||
fill_segment(ep)
|
||||
# there shouldn't be more than 2 WAL segments (but dir may have archive_status files)
|
||||
assert ep.get_pg_wal_size() < 16 * 2.5
|
||||
|
||||
sk2.stop() # stop another sk to ensure sk1 and sk3 can work
|
||||
sk1.start()
|
||||
with DurationLogger("recovery"):
|
||||
ep.safe_psql("insert into t select generate_series(1,100), 'payload'") # forces recovery
|
||||
# stop ep and ensure WAL is identical after recovery.
|
||||
wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3])
|
||||
# Check that WALs are the same.
|
||||
cmp_sk_wal([sk1, sk3], tenant_id, timeline_id)
|
||||
|
||||
# Now do the same with different safekeeper sk2 down, and restarting ep
|
||||
# before recovery (again scenario when recovery starts below basebackup_lsn,
|
||||
# but multi segment now).
|
||||
ep = env.endpoints.create_start(
|
||||
"test_lagging_sk",
|
||||
config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"],
|
||||
)
|
||||
with WalChangeLogger(ep, "doing large insert with sk2 down"):
|
||||
for _ in range(0, 5):
|
||||
fill_segment(ep)
|
||||
# there shouldn't be more than 2 WAL segments (but dir may have archive_status files)
|
||||
assert ep.get_pg_wal_size() < 16 * 2.5
|
||||
|
||||
ep.stop()
|
||||
ep = env.endpoints.create_start(
|
||||
"test_lagging_sk",
|
||||
config_lines=min_wal_config,
|
||||
)
|
||||
sk2.start()
|
||||
with DurationLogger("recovery"):
|
||||
wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk2, sk3])
|
||||
# Check that WALs are the same.
|
||||
cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id)
|
||||
# Test behaviour with one safekeeper down and missing a lot of WAL. Namely, that
|
||||
# 1) walproposer can't recover node if it misses WAL written by previous computes, but
|
||||
# still starts up and functions normally if two other sks are ok.
|
||||
# 2) walproposer doesn't keep WAL after some threshold (pg_wal bloat is limited), but functions
|
||||
# normally if two other sks are ok.
|
||||
# 3) Lagged safekeeper can still recover by peer recovery.
|
||||
def test_one_sk_down(neon_env_builder: NeonEnvBuilder):
|
||||
pass
|
||||
|
||||
|
||||
# Smaller version of test_one_sk_down testing peer recovery in isolation: that
|
||||
@@ -1311,7 +1092,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
|
||||
assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= 16 * 1024 * 1024
|
||||
|
||||
# wait a bit, lsns shouldn't change
|
||||
time.sleep(2)
|
||||
# time.sleep(5)
|
||||
sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id)
|
||||
sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id)
|
||||
log.info(
|
||||
@@ -1322,11 +1103,37 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
|
||||
# now restart safekeeper with peer recovery enabled and wait for recovery
|
||||
sk1.stop().start(extra_opts=["--peer-recovery=true"])
|
||||
wait(
|
||||
partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id),
|
||||
partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id),
|
||||
"flush_lsn to get aligned",
|
||||
)
|
||||
|
||||
cmp_sk_wal([sk1, sk2], tenant_id, timeline_id)
|
||||
# check that WALs are identic after recovery
|
||||
segs = sk1.list_segments(tenant_id, timeline_id)
|
||||
log.info(f"segs are {segs}")
|
||||
|
||||
(_, mismatch, not_regular) = filecmp.cmpfiles(
|
||||
sk1.timeline_dir(tenant_id, timeline_id),
|
||||
sk2.timeline_dir(tenant_id, timeline_id),
|
||||
segs,
|
||||
shallow=False,
|
||||
)
|
||||
log.info(
|
||||
f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
|
||||
)
|
||||
|
||||
for f in mismatch:
|
||||
f1 = os.path.join(sk1.timeline_dir(tenant_id, timeline_id), f)
|
||||
f2 = os.path.join(sk2.timeline_dir(tenant_id, timeline_id), f)
|
||||
stdout_filename = "{}.filediff".format(f2)
|
||||
|
||||
with open(stdout_filename, "w") as stdout_f:
|
||||
subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
|
||||
subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)
|
||||
|
||||
cmd = "diff {}.hex {}.hex".format(f1, f2)
|
||||
subprocess.run([cmd], stdout=stdout_f, shell=True)
|
||||
|
||||
assert (mismatch, not_regular) == ([], [])
|
||||
|
||||
# stop one of safekeepers which weren't recovering and insert a bit more to check we can commit
|
||||
env.safekeepers[2].stop()
|
||||
|
||||
@@ -65,7 +65,7 @@ subtle = { version = "2" }
|
||||
time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }
|
||||
tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
|
||||
tokio-rustls = { version = "0.24" }
|
||||
tokio-util = { version = "0.7", features = ["codec", "io"] }
|
||||
tokio-util = { version = "0.7", features = ["codec", "compat", "io"] }
|
||||
toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
|
||||
toml_edit = { version = "0.19", features = ["serde"] }
|
||||
tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
|
||||
|
||||
Reference in New Issue
Block a user