mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-22 15:41:15 +00:00
Compare commits
1 Commits
problame/r
...
alek/ololo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
443d9b30d0 |
2
.github/workflows/build_and_test.yml
vendored
2
.github/workflows/build_and_test.yml
vendored
@@ -814,7 +814,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.15.4
|
||||
VM_BUILDER_VERSION: v0.15.0-alpha1
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
@@ -1111,14 +1111,14 @@ LIMIT 100",
|
||||
.as_millis() as u64;
|
||||
info!("Prepare extensions took {prep_ext_time_delta}ms");
|
||||
|
||||
// Don't try to download libraries that are not in the index.
|
||||
// Assume that they are already present locally.
|
||||
libs_vec.retain(|lib| {
|
||||
self.library_index
|
||||
.get()
|
||||
.expect("error accessing ext_remote_paths")
|
||||
.contains_key(lib)
|
||||
});
|
||||
// // Don't try to download libraries that are not in the index.
|
||||
// // Assume that they are already present locally.
|
||||
// libs_vec.retain(|lib| {
|
||||
// self.library_index
|
||||
// .get()
|
||||
// .expect("error accessing ext_remote_paths")
|
||||
// .contains_key(lib)
|
||||
// });
|
||||
|
||||
info!("Downloading to shared preload libraries: {:?}", &libs_vec);
|
||||
|
||||
|
||||
@@ -156,7 +156,7 @@ pub async fn get_available_extensions(
|
||||
let ext_index_full = serde_json::from_slice::<Index>(&ext_idx_buffer)?;
|
||||
let mut enabled_extensions = ext_index_full.public_extensions;
|
||||
enabled_extensions.extend_from_slice(custom_extensions);
|
||||
let mut library_index = ext_index_full.library_index;
|
||||
let library_index = ext_index_full.library_index;
|
||||
let all_extension_data = ext_index_full.extension_data;
|
||||
info!("library_index: {:?}", library_index);
|
||||
|
||||
@@ -179,8 +179,6 @@ pub async fn get_available_extensions(
|
||||
file_create_tasks.push(tokio::fs::write(control_path, control_contents));
|
||||
} else {
|
||||
warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_file);
|
||||
// also delete this from library index
|
||||
library_index.retain(|_, value| value != extension_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -270,7 +270,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
}
|
||||
RoleAction::Create => {
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
|
||||
"CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
info!("role create query: '{}'", &query);
|
||||
|
||||
@@ -24,20 +24,6 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
|
||||
Ok(dir.next_entry().await?.is_none())
|
||||
}
|
||||
|
||||
pub async fn list_dir(path: impl AsRef<Path>) -> anyhow::Result<Vec<String>> {
|
||||
let mut dir = tokio::fs::read_dir(&path)
|
||||
.await
|
||||
.context(format!("read_dir({})", path.as_ref().display()))?;
|
||||
|
||||
let mut content = vec![];
|
||||
while let Some(next) = dir.next_entry().await? {
|
||||
let file_name = next.file_name();
|
||||
content.push(file_name.to_string_lossy().to_string());
|
||||
}
|
||||
|
||||
Ok(content)
|
||||
}
|
||||
|
||||
pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Ok(())
|
||||
@@ -57,7 +43,7 @@ where
|
||||
mod test {
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::fs_ext::{is_directory_empty, list_dir};
|
||||
use crate::fs_ext::is_directory_empty;
|
||||
|
||||
use super::ignore_absent_files;
|
||||
|
||||
@@ -123,25 +109,4 @@ mod test {
|
||||
|
||||
assert!(!file_path.exists());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn list_dir_works() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let dir_path = dir.path();
|
||||
|
||||
assert!(list_dir(dir_path).await.unwrap().is_empty());
|
||||
|
||||
let file_path: PathBuf = dir_path.join("testfile");
|
||||
let _ = std::fs::File::create(&file_path).unwrap();
|
||||
|
||||
assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]);
|
||||
|
||||
let another_dir_path: PathBuf = dir_path.join("testdir");
|
||||
std::fs::create_dir(another_dir_path).unwrap();
|
||||
|
||||
let expected = &["testdir", "testfile"];
|
||||
let mut actual = list_dir(dir_path).await.unwrap();
|
||||
actual.sort();
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -373,7 +373,7 @@ fn start_pageserver(
|
||||
let order = pageserver::InitializationOrder {
|
||||
initial_tenant_load: Some(init_done_tx),
|
||||
initial_logical_size_can_start: init_done_rx.clone(),
|
||||
initial_logical_size_attempt: Some(init_logical_size_done_tx),
|
||||
initial_logical_size_attempt: init_logical_size_done_tx,
|
||||
background_jobs_can_start: background_jobs_barrier.clone(),
|
||||
};
|
||||
|
||||
|
||||
@@ -31,9 +31,7 @@ use utils::{
|
||||
use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{
|
||||
TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||
TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
@@ -615,11 +613,6 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
self.tenant_path(tenant_id)
|
||||
.join(TENANT_DELETED_MARKER_FILE_NAME)
|
||||
}
|
||||
|
||||
pub fn traces_path(&self) -> PathBuf {
|
||||
self.workdir.join("traces")
|
||||
}
|
||||
|
||||
@@ -93,47 +93,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
delete:
|
||||
description: |
|
||||
Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved.
|
||||
404 means that deletion successfully finished"
|
||||
responses:
|
||||
"400":
|
||||
description: Error when no tenant id found in path
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Tenant not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"409":
|
||||
description: Deletion is already in progress, continue polling
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
@@ -861,7 +820,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/config:
|
||||
put:
|
||||
description: |
|
||||
|
||||
@@ -187,7 +187,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
format!("Cannot delete timeline which has child timelines: {children:?}")
|
||||
.into_boxed_str(),
|
||||
),
|
||||
a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()),
|
||||
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
|
||||
Other(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
@@ -208,19 +208,6 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
|
||||
fn from(value: crate::tenant::delete::DeleteTenantError) -> Self {
|
||||
use crate::tenant::delete::DeleteTenantError::*;
|
||||
match value {
|
||||
Get(g) => ApiError::from(g),
|
||||
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
|
||||
Timeline(t) => ApiError::from(t),
|
||||
Other(o) => ApiError::InternalServerError(o),
|
||||
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to construct a TimelineInfo struct for a timeline
|
||||
async fn build_timeline_info(
|
||||
timeline: &Arc<Timeline>,
|
||||
@@ -630,23 +617,6 @@ async fn tenant_status(
|
||||
json_response(StatusCode::OK, tenant_info)
|
||||
}
|
||||
|
||||
async fn tenant_delete_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
// TODO openapi spec
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
|
||||
.instrument(info_span!("tenant_delete_handler", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
/// HTTP endpoint to query the current tenant_size of a tenant.
|
||||
///
|
||||
/// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used
|
||||
@@ -1375,9 +1345,6 @@ pub fn make_router(
|
||||
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
||||
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
||||
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
api_handler(r, tenant_delete_handler)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
|
||||
api_handler(r, tenant_size_handler)
|
||||
})
|
||||
|
||||
@@ -190,7 +190,7 @@ pub struct InitializationOrder {
|
||||
|
||||
/// Each timeline owns a clone of this to be consumed on the initial logical size calculation
|
||||
/// attempt. It is important to drop this once the attempt has completed.
|
||||
pub initial_logical_size_attempt: Option<utils::completion::Completion>,
|
||||
pub initial_logical_size_attempt: utils::completion::Completion,
|
||||
|
||||
/// Barrier for when we can start any background jobs.
|
||||
///
|
||||
@@ -226,7 +226,6 @@ async fn timed<Fut: std::future::Future>(
|
||||
|
||||
let ret = fut.await;
|
||||
|
||||
// this has a global allowed_errors
|
||||
tracing::warn!(
|
||||
task = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
|
||||
@@ -47,11 +47,13 @@ use std::{
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::OnceCell;
|
||||
use tracing::error;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::tenant::writeback_ephemeral_file;
|
||||
use crate::{metrics::PageCacheSizeMetrics, repository::Key};
|
||||
|
||||
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
|
||||
@@ -95,6 +97,10 @@ enum CacheKey {
|
||||
hash_key: MaterializedPageHashKey,
|
||||
lsn: Lsn,
|
||||
},
|
||||
EphemeralPage {
|
||||
file_id: u64,
|
||||
blkno: u32,
|
||||
},
|
||||
ImmutableFilePage {
|
||||
file_id: u64,
|
||||
blkno: u32,
|
||||
@@ -122,6 +128,7 @@ struct Slot {
|
||||
struct SlotInner {
|
||||
key: Option<CacheKey>,
|
||||
buf: &'static mut [u8; PAGE_SZ],
|
||||
dirty: bool,
|
||||
}
|
||||
|
||||
impl Slot {
|
||||
@@ -170,6 +177,8 @@ pub struct PageCache {
|
||||
/// can have a separate mapping map, next to this field.
|
||||
materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
|
||||
|
||||
ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
|
||||
|
||||
immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,
|
||||
|
||||
/// The actual buffers with their metadata.
|
||||
@@ -249,6 +258,14 @@ impl PageWriteGuard<'_> {
|
||||
);
|
||||
self.valid = true;
|
||||
}
|
||||
pub fn mark_dirty(&mut self) {
|
||||
// only ephemeral pages can be dirty ATM.
|
||||
assert!(matches!(
|
||||
self.inner.key,
|
||||
Some(CacheKey::EphemeralPage { .. })
|
||||
));
|
||||
self.inner.dirty = true;
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageWriteGuard<'_> {
|
||||
@@ -263,6 +280,7 @@ impl Drop for PageWriteGuard<'_> {
|
||||
let self_key = self.inner.key.as_ref().unwrap();
|
||||
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
|
||||
self.inner.key = None;
|
||||
self.inner.dirty = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -370,7 +388,41 @@ impl PageCache {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Section 1.2: Public interface functions for working with immutable file pages.
|
||||
// Section 1.2: Public interface functions for working with Ephemeral pages.
|
||||
|
||||
pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
|
||||
let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
|
||||
|
||||
self.lock_for_read(&mut cache_key)
|
||||
}
|
||||
|
||||
pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<WriteBufResult> {
|
||||
let cache_key = CacheKey::EphemeralPage { file_id, blkno };
|
||||
|
||||
self.lock_for_write(&cache_key)
|
||||
}
|
||||
|
||||
/// Immediately drop all buffers belonging to given file, without writeback
|
||||
pub fn drop_buffers_for_ephemeral(&self, drop_file_id: u64) {
|
||||
for slot_idx in 0..self.slots.len() {
|
||||
let slot = &self.slots[slot_idx];
|
||||
|
||||
let mut inner = slot.inner.write().unwrap();
|
||||
if let Some(key) = &inner.key {
|
||||
match key {
|
||||
CacheKey::EphemeralPage { file_id, blkno: _ } if *file_id == drop_file_id => {
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(key);
|
||||
inner.key = None;
|
||||
inner.dirty = false;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Section 1.3: Public interface functions for working with immutable file pages.
|
||||
|
||||
pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
|
||||
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
|
||||
@@ -392,6 +444,7 @@ impl PageCache {
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(key);
|
||||
inner.key = None;
|
||||
inner.dirty = false;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
@@ -469,6 +522,10 @@ impl PageCache {
|
||||
CacheKey::MaterializedPage { .. } => {
|
||||
unreachable!("Materialized pages use lookup_materialized_page")
|
||||
}
|
||||
CacheKey::EphemeralPage { .. } => (
|
||||
&crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
|
||||
&crate::metrics::PAGE_CACHE.read_hits_ephemeral,
|
||||
),
|
||||
CacheKey::ImmutableFilePage { .. } => (
|
||||
&crate::metrics::PAGE_CACHE.read_accesses_immutable,
|
||||
&crate::metrics::PAGE_CACHE.read_hits_immutable,
|
||||
@@ -509,6 +566,7 @@ impl PageCache {
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
inner.dirty = false;
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return Ok(ReadBufResult::NotFound(PageWriteGuard {
|
||||
@@ -570,6 +628,7 @@ impl PageCache {
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
inner.dirty = false;
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return Ok(WriteBufResult::NotFound(PageWriteGuard {
|
||||
@@ -608,6 +667,10 @@ impl PageCache {
|
||||
*lsn = version.lsn;
|
||||
Some(version.slot_idx)
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let map = self.ephemeral_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let map = self.immutable_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
@@ -631,6 +694,10 @@ impl PageCache {
|
||||
None
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let map = self.ephemeral_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let map = self.immutable_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
@@ -664,6 +731,12 @@ impl PageCache {
|
||||
panic!("could not find old key in mapping")
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let mut map = self.ephemeral_page_map.write().unwrap();
|
||||
map.remove(&(*file_id, *blkno))
|
||||
.expect("could not find old key in mapping");
|
||||
self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let mut map = self.immutable_page_map.write().unwrap();
|
||||
map.remove(&(*file_id, *blkno))
|
||||
@@ -703,7 +776,17 @@ impl PageCache {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let mut map = self.ephemeral_page_map.write().unwrap();
|
||||
match map.entry((*file_id, *blkno)) {
|
||||
Entry::Occupied(entry) => Some(*entry.get()),
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(slot_idx);
|
||||
self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let mut map = self.immutable_page_map.write().unwrap();
|
||||
match map.entry((*file_id, *blkno)) {
|
||||
@@ -754,8 +837,25 @@ impl PageCache {
|
||||
}
|
||||
};
|
||||
if let Some(old_key) = &inner.key {
|
||||
if inner.dirty {
|
||||
if let Err(err) = Self::writeback(old_key, inner.buf) {
|
||||
// Writing the page to disk failed.
|
||||
//
|
||||
// FIXME: What to do here, when? We could propagate the error to the
|
||||
// caller, but victim buffer is generally unrelated to the original
|
||||
// call. It can even belong to a different tenant. Currently, we
|
||||
// report the error to the log and continue the clock sweep to find
|
||||
// a different victim. But if the problem persists, the page cache
|
||||
// could fill up with dirty pages that we cannot evict, and we will
|
||||
// loop retrying the writebacks indefinitely.
|
||||
error!("writeback of buffer {:?} failed: {}", old_key, err);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(old_key);
|
||||
inner.dirty = false;
|
||||
inner.key = None;
|
||||
}
|
||||
return Ok((slot_idx, inner));
|
||||
@@ -763,6 +863,28 @@ impl PageCache {
|
||||
}
|
||||
}
|
||||
|
||||
fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
|
||||
match cache_key {
|
||||
CacheKey::MaterializedPage {
|
||||
hash_key: _,
|
||||
lsn: _,
|
||||
} => Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"unexpected dirty materialized page",
|
||||
)),
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
writeback_ephemeral_file(*file_id, *blkno, buf)
|
||||
}
|
||||
CacheKey::ImmutableFilePage {
|
||||
file_id: _,
|
||||
blkno: _,
|
||||
} => Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"unexpected dirty immutable page",
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize a new page cache
|
||||
///
|
||||
/// This should be called only once at page server startup.
|
||||
@@ -773,6 +895,7 @@ impl PageCache {
|
||||
|
||||
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
|
||||
size_metrics.max_bytes.set_page_sz(num_pages);
|
||||
size_metrics.current_bytes_ephemeral.set_page_sz(0);
|
||||
size_metrics.current_bytes_immutable.set_page_sz(0);
|
||||
size_metrics.current_bytes_materialized_page.set_page_sz(0);
|
||||
|
||||
@@ -782,7 +905,11 @@ impl PageCache {
|
||||
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
|
||||
|
||||
Slot {
|
||||
inner: RwLock::new(SlotInner { key: None, buf }),
|
||||
inner: RwLock::new(SlotInner {
|
||||
key: None,
|
||||
buf,
|
||||
dirty: false,
|
||||
}),
|
||||
usage_count: AtomicU8::new(0),
|
||||
}
|
||||
})
|
||||
@@ -790,6 +917,7 @@ impl PageCache {
|
||||
|
||||
Self {
|
||||
materialized_page_map: Default::default(),
|
||||
ephemeral_page_map: Default::default(),
|
||||
immutable_page_map: Default::default(),
|
||||
slots,
|
||||
next_evict_slot: AtomicUsize::new(0),
|
||||
|
||||
@@ -28,7 +28,6 @@ use std::cmp::min;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Debug;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::fs::OpenOptions;
|
||||
@@ -47,10 +46,8 @@ use std::sync::{Mutex, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use self::config::TenantConf;
|
||||
use self::delete::DeleteTenantFlow;
|
||||
use self::metadata::LoadMetadataError;
|
||||
use self::metadata::TimelineMetadata;
|
||||
use self::mgr::TenantsMap;
|
||||
use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::uninit::TimelineUninitMark;
|
||||
use self::timeline::uninit::UninitializedTimeline;
|
||||
@@ -109,7 +106,6 @@ macro_rules! pausable_failpoint {
|
||||
|
||||
pub mod blob_io;
|
||||
pub mod block_io;
|
||||
|
||||
pub mod disk_btree;
|
||||
pub(crate) mod ephemeral_file;
|
||||
pub mod layer_map;
|
||||
@@ -122,7 +118,6 @@ mod remote_timeline_client;
|
||||
pub mod storage_layer;
|
||||
|
||||
pub mod config;
|
||||
pub mod delete;
|
||||
pub mod mgr;
|
||||
pub mod tasks;
|
||||
pub mod upload_queue;
|
||||
@@ -136,6 +131,9 @@ pub use timeline::{
|
||||
LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
|
||||
};
|
||||
|
||||
// re-export this function so that page_cache.rs can use it.
|
||||
pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
|
||||
|
||||
// re-export for use in remote_timeline_client.rs
|
||||
pub use crate::tenant::metadata::save_metadata;
|
||||
|
||||
@@ -147,8 +145,6 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
|
||||
pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching";
|
||||
|
||||
pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
|
||||
|
||||
///
|
||||
/// Tenant consists of multiple timelines. Keep them in a hash table.
|
||||
///
|
||||
@@ -187,8 +183,6 @@ pub struct Tenant {
|
||||
cached_synthetic_tenant_size: Arc<AtomicU64>,
|
||||
|
||||
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
||||
|
||||
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
|
||||
}
|
||||
|
||||
// We should not blindly overwrite local metadata with remote one.
|
||||
@@ -280,7 +274,7 @@ pub enum LoadLocalTimelineError {
|
||||
ResumeDeletion(#[source] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error)]
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTimelineError {
|
||||
#[error("NotFound")]
|
||||
NotFound,
|
||||
@@ -289,37 +283,17 @@ pub enum DeleteTimelineError {
|
||||
HasChildren(Vec<TimelineId>),
|
||||
|
||||
#[error("Timeline deletion is already in progress")]
|
||||
AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>),
|
||||
AlreadyInProgress,
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl Debug for DeleteTimelineError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::NotFound => write!(f, "NotFound"),
|
||||
Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(),
|
||||
Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(),
|
||||
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum SetStoppingError {
|
||||
AlreadyStopping(completion::Barrier),
|
||||
Broken,
|
||||
}
|
||||
|
||||
impl Debug for SetStoppingError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::AlreadyStopping(_) => f.debug_tuple("AlreadyStopping").finish(),
|
||||
Self::Broken => write!(f, "Broken"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct RemoteStartupData {
|
||||
index_part: IndexPart,
|
||||
remote_metadata: TimelineMetadata,
|
||||
@@ -642,7 +616,7 @@ impl Tenant {
|
||||
// For every timeline, download the metadata file, scan the local directory,
|
||||
// and build a layer map that contains an entry for each remote and local
|
||||
// layer file.
|
||||
let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
|
||||
let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
|
||||
for (timeline_id, remote_metadata) in sorted_timelines {
|
||||
let (index_part, remote_client) = remote_index_and_client
|
||||
.remove(&timeline_id)
|
||||
@@ -766,13 +740,12 @@ impl Tenant {
|
||||
/// If the loading fails for some reason, the Tenant will go into Broken
|
||||
/// state.
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_id))]
|
||||
pub(crate) fn spawn_load(
|
||||
pub fn spawn_load(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> Arc<Tenant> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
@@ -792,7 +765,7 @@ impl Tenant {
|
||||
tenant_conf,
|
||||
wal_redo_manager,
|
||||
tenant_id,
|
||||
remote_storage.clone(),
|
||||
remote_storage,
|
||||
);
|
||||
let tenant = Arc::new(tenant);
|
||||
|
||||
@@ -808,83 +781,27 @@ impl Tenant {
|
||||
"initial tenant load",
|
||||
false,
|
||||
async move {
|
||||
let make_broken = |t: &Tenant, err: anyhow::Error| {
|
||||
error!("load failed, setting tenant state to Broken: {err:?}");
|
||||
t.state.send_modify(|state| {
|
||||
assert!(
|
||||
matches!(*state, TenantState::Loading | TenantState::Stopping { .. }),
|
||||
"the loading task owns the tenant state until activation is complete"
|
||||
);
|
||||
*state = TenantState::broken_from_reason(err.to_string());
|
||||
});
|
||||
};
|
||||
|
||||
let mut init_order = init_order;
|
||||
|
||||
// take the completion because initial tenant loading will complete when all of
|
||||
// these tasks complete.
|
||||
let _completion = init_order
|
||||
.as_mut()
|
||||
.and_then(|x| x.initial_tenant_load.take());
|
||||
|
||||
// Dont block pageserver startup on figuring out deletion status
|
||||
let pending_deletion = {
|
||||
match DeleteTenantFlow::should_resume_deletion(
|
||||
conf,
|
||||
remote_storage.as_ref(),
|
||||
&tenant_clone,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(should_resume_deletion) => should_resume_deletion,
|
||||
Err(err) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(err));
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
info!("pending deletion {}", pending_deletion.is_some());
|
||||
|
||||
if let Some(deletion) = pending_deletion {
|
||||
// as we are no longer loading, signal completion by dropping
|
||||
// the completion while we resume deletion
|
||||
drop(_completion);
|
||||
// do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout
|
||||
let _ = init_order
|
||||
.as_mut()
|
||||
.and_then(|x| x.initial_logical_size_attempt.take());
|
||||
|
||||
match DeleteTenantFlow::resume(
|
||||
deletion,
|
||||
&tenant_clone,
|
||||
init_order.as_ref(),
|
||||
tenants,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(err) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(err));
|
||||
return Ok(());
|
||||
}
|
||||
Ok(()) => return Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
let background_jobs_can_start =
|
||||
init_order.as_ref().map(|x| &x.background_jobs_can_start);
|
||||
let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take());
|
||||
|
||||
match tenant_clone.load(init_order.as_ref(), &ctx).await {
|
||||
Ok(()) => {
|
||||
debug!("load finished",);
|
||||
|
||||
debug!("load finished, activating");
|
||||
let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
|
||||
tenant_clone.activate(broker_client, background_jobs_can_start, &ctx);
|
||||
}
|
||||
Err(err) => make_broken(&tenant_clone, err),
|
||||
Err(err) => {
|
||||
error!("load failed, setting tenant state to Broken: {err:?}");
|
||||
tenant_clone.state.send_modify(|state| {
|
||||
assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete");
|
||||
*state = TenantState::broken_from_reason(err.to_string());
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(())
|
||||
}
|
||||
.instrument({
|
||||
let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id);
|
||||
@@ -960,8 +877,6 @@ impl Tenant {
|
||||
)
|
||||
})?;
|
||||
|
||||
info!("Found deletion mark for timeline {}", timeline_id);
|
||||
|
||||
match load_metadata(self.conf, &self.tenant_id, &timeline_id) {
|
||||
Ok(metadata) => {
|
||||
timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
|
||||
@@ -1051,11 +966,9 @@ impl Tenant {
|
||||
|
||||
// Sort the array of timeline IDs into tree-order, so that parent comes before
|
||||
// all its children.
|
||||
tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| {
|
||||
TenantDirectoryScan {
|
||||
sorted_timelines_to_load: sorted_timelines,
|
||||
timelines_to_resume_deletion,
|
||||
}
|
||||
tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan {
|
||||
sorted_timelines_to_load: sorted_timelines,
|
||||
timelines_to_resume_deletion,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1769,7 +1682,7 @@ impl Tenant {
|
||||
// It's mesed up.
|
||||
// we just ignore the failure to stop
|
||||
|
||||
match self.set_stopping(shutdown_progress, false).await {
|
||||
match self.set_stopping(shutdown_progress).await {
|
||||
Ok(()) => {}
|
||||
Err(SetStoppingError::Broken) => {
|
||||
// assume that this is acceptable
|
||||
@@ -1809,25 +1722,18 @@ impl Tenant {
|
||||
/// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
|
||||
///
|
||||
/// This function is not cancel-safe!
|
||||
///
|
||||
/// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant.
|
||||
async fn set_stopping(
|
||||
&self,
|
||||
progress: completion::Barrier,
|
||||
allow_transition_from_loading: bool,
|
||||
) -> Result<(), SetStoppingError> {
|
||||
async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
|
||||
let mut rx = self.state.subscribe();
|
||||
|
||||
// cannot stop before we're done activating, so wait out until we're done activating
|
||||
rx.wait_for(|state| match state {
|
||||
TenantState::Activating(_) | TenantState::Attaching => {
|
||||
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
|
||||
info!(
|
||||
"waiting for {} to turn Active|Broken|Stopping",
|
||||
<&'static str>::from(state)
|
||||
);
|
||||
false
|
||||
}
|
||||
TenantState::Loading => allow_transition_from_loading,
|
||||
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
|
||||
})
|
||||
.await
|
||||
@@ -1836,16 +1742,9 @@ impl Tenant {
|
||||
// we now know we're done activating, let's see whether this task is the winner to transition into Stopping
|
||||
let mut err = None;
|
||||
let stopping = self.state.send_if_modified(|current_state| match current_state {
|
||||
TenantState::Activating(_) | TenantState::Attaching => {
|
||||
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
|
||||
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
|
||||
}
|
||||
TenantState::Loading => {
|
||||
if !allow_transition_from_loading {
|
||||
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
|
||||
};
|
||||
*current_state = TenantState::Stopping { progress };
|
||||
true
|
||||
}
|
||||
TenantState::Active => {
|
||||
// FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
|
||||
// are created after the transition to Stopping. That's harmless, as the Timelines
|
||||
@@ -1914,10 +1813,6 @@ impl Tenant {
|
||||
.expect("cannot drop self.state while on a &self method");
|
||||
|
||||
// we now know we're done activating, let's see whether this task is the winner to transition into Broken
|
||||
self.set_broken_no_wait(reason)
|
||||
}
|
||||
|
||||
pub(crate) fn set_broken_no_wait(&self, reason: String) {
|
||||
self.state.send_modify(|current_state| {
|
||||
match *current_state {
|
||||
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
|
||||
@@ -1983,28 +1878,22 @@ impl Tenant {
|
||||
/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
|
||||
/// perform a topological sort, so that the parent of each timeline comes
|
||||
/// before the children.
|
||||
/// E extracts the ancestor from T
|
||||
/// This allows for T to be different. It can be TimelineMetadata, can be Timeline itself, etc.
|
||||
fn tree_sort_timelines<T, E>(
|
||||
timelines: HashMap<TimelineId, T>,
|
||||
extractor: E,
|
||||
) -> anyhow::Result<Vec<(TimelineId, T)>>
|
||||
where
|
||||
E: Fn(&T) -> Option<TimelineId>,
|
||||
{
|
||||
fn tree_sort_timelines(
|
||||
timelines: HashMap<TimelineId, TimelineMetadata>,
|
||||
) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> {
|
||||
let mut result = Vec::with_capacity(timelines.len());
|
||||
|
||||
let mut now = Vec::with_capacity(timelines.len());
|
||||
// (ancestor, children)
|
||||
let mut later: HashMap<TimelineId, Vec<(TimelineId, T)>> =
|
||||
let mut later: HashMap<TimelineId, Vec<(TimelineId, TimelineMetadata)>> =
|
||||
HashMap::with_capacity(timelines.len());
|
||||
|
||||
for (timeline_id, value) in timelines {
|
||||
if let Some(ancestor_id) = extractor(&value) {
|
||||
for (timeline_id, metadata) in timelines {
|
||||
if let Some(ancestor_id) = metadata.ancestor_timeline() {
|
||||
let children = later.entry(ancestor_id).or_default();
|
||||
children.push((timeline_id, value));
|
||||
children.push((timeline_id, metadata));
|
||||
} else {
|
||||
now.push((timeline_id, value));
|
||||
now.push((timeline_id, metadata));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2173,7 +2062,7 @@ impl Tenant {
|
||||
remote_client,
|
||||
pg_version,
|
||||
initial_logical_size_can_start.cloned(),
|
||||
initial_logical_size_attempt.cloned().flatten(),
|
||||
initial_logical_size_attempt.cloned(),
|
||||
state,
|
||||
);
|
||||
|
||||
@@ -2257,7 +2146,6 @@ impl Tenant {
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2274,7 +2162,6 @@ impl Tenant {
|
||||
// FIXME If the config file is not found, assume that we're attaching
|
||||
// a detached tenant and config is passed via attach command.
|
||||
// https://github.com/neondatabase/neon/issues/1555
|
||||
// OR: we're loading after incomplete deletion that managed to remove config.
|
||||
if !target_config_path.exists() {
|
||||
info!("tenant config not found in {target_config_display}");
|
||||
return Ok(TenantConfOpt::default());
|
||||
|
||||
@@ -1,546 +0,0 @@
|
||||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tracing::{error, info, instrument, warn, Instrument, Span};
|
||||
|
||||
use utils::{
|
||||
completion, crashsafe, fs_ext,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
context::RequestContext,
|
||||
task_mgr::{self, TaskKind},
|
||||
InitializationOrder,
|
||||
};
|
||||
|
||||
use super::{
|
||||
mgr::{GetTenantError, TenantsMap},
|
||||
span,
|
||||
timeline::delete::DeleteTimelineFlow,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant,
|
||||
};
|
||||
|
||||
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u8 = 3;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTenantError {
|
||||
#[error("GetTenant {0}")]
|
||||
Get(#[from] GetTenantError),
|
||||
|
||||
#[error("Invalid state {0}. Expected Active or Broken")]
|
||||
InvalidState(TenantState),
|
||||
|
||||
#[error("Tenant deletion is already in progress")]
|
||||
AlreadyInProgress,
|
||||
|
||||
#[error("Timeline {0}")]
|
||||
Timeline(#[from] DeleteTimelineError),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
|
||||
|
||||
fn remote_tenant_delete_mark_path(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let tenant_remote_path = conf
|
||||
.tenant_path(tenant_id)
|
||||
.strip_prefix(&conf.workdir)
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.context("tenant path")?;
|
||||
Ok(tenant_remote_path.join(Path::new("deleted")))
|
||||
}
|
||||
|
||||
async fn create_remote_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
|
||||
|
||||
let data: &[u8] = &[];
|
||||
remote_storage
|
||||
.upload(data, 0, &remote_mark_path, None)
|
||||
.await
|
||||
.context("mark upload")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_local_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);
|
||||
|
||||
// Note: we're ok to replace existing file.
|
||||
let _ = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.open(&marker_path)
|
||||
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
|
||||
|
||||
crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn schedule_ordered_timeline_deletions(
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
|
||||
// Tenant is stopping at this point. We know it will be deleted.
|
||||
// No new timelines should be created.
|
||||
// Tree sort timelines to delete from leafs to the root.
|
||||
// NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion
|
||||
// can complete and remove timeline from the map in between our call to clone
|
||||
// and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map.
|
||||
// timelines.lock is currently synchronous so we cant hold it across await point.
|
||||
// So just ignore NotFound error if we get it from `run`.
|
||||
// Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock.
|
||||
let timelines = tenant.timelines.lock().unwrap().clone();
|
||||
let sorted =
|
||||
tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?;
|
||||
|
||||
let mut already_running_deletions = vec![];
|
||||
|
||||
for (timeline_id, _) in sorted.into_iter().rev() {
|
||||
if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await {
|
||||
match e {
|
||||
DeleteTimelineError::NotFound => {
|
||||
// Timeline deletion finished after call to clone above but before call
|
||||
// to `DeleteTimelineFlow::run` and removed timeline from the map.
|
||||
continue;
|
||||
}
|
||||
DeleteTimelineError::AlreadyInProgress(guard) => {
|
||||
already_running_deletions.push((guard, timeline_id));
|
||||
continue;
|
||||
}
|
||||
e => return Err(DeleteTenantError::Timeline(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(already_running_deletions)
|
||||
}
|
||||
|
||||
async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> {
|
||||
// Assert timelines dir is empty.
|
||||
if !fs_ext::is_directory_empty(timelines_path).await? {
|
||||
// Display first 10 items in directory
|
||||
let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10];
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"Timelines directory is not empty after all timelines deletion: {list:?}"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn remove_tenant_remote_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
remote_storage
|
||||
.delete(&remote_tenant_delete_mark_path(conf, tenant_id)?)
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
|
||||
async fn cleanup_remaining_fs_traces(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let rm = |p: PathBuf, is_dir: bool| async move {
|
||||
if is_dir {
|
||||
tokio::fs::remove_dir(&p).await
|
||||
} else {
|
||||
tokio::fs::remove_file(&p).await
|
||||
}
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.with_context(|| {
|
||||
let to_display = p.display();
|
||||
format!("failed to delete {to_display}")
|
||||
})
|
||||
};
|
||||
|
||||
rm(conf.tenant_config_path(tenant_id), false).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-timelines-dir"
|
||||
))?
|
||||
});
|
||||
|
||||
rm(conf.timelines_path(tenant_id), true).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-deleted-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-tenant-dir"
|
||||
))?
|
||||
});
|
||||
|
||||
rm(conf.tenant_path(tenant_id), true).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
/// 1. Upload remote deletion mark.
|
||||
/// 2. Create local mark file.
|
||||
/// 3. Shutdown tasks
|
||||
/// 4. Run ordered timeline deletions
|
||||
/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
|
||||
/// 6. Remove remote mark
|
||||
/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
|
||||
/// It is resumable from any step in case a crash/restart occurs.
|
||||
/// There are three entrypoints to the process:
|
||||
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
|
||||
/// 2. [`DeleteTenantFlow::resume`] is called during restarts when local or remote deletion marks are still there.
|
||||
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
|
||||
#[derive(Default)]
|
||||
pub enum DeleteTenantFlow {
|
||||
#[default]
|
||||
NotStarted,
|
||||
InProgress,
|
||||
Finished,
|
||||
}
|
||||
|
||||
impl DeleteTenantFlow {
|
||||
// These steps are run in the context of management api request handler.
|
||||
// Long running steps are continued to run in the background.
|
||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||
// error out if some of the shutdown tasks have already been completed!
|
||||
// NOTE: static needed for background part.
|
||||
// We assume that calling code sets up the span with tenant_id.
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn run(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
|
||||
|
||||
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
|
||||
tenant.set_broken(format!("{e:#}")).await;
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Helper function needed to be able to match once on returned error and transition tenant into broken state.
|
||||
// This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
|
||||
// will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
|
||||
// So the solution is to set tenant state to broken.
|
||||
async fn run_inner(
|
||||
guard: &mut OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant: &Tenant,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-create-remote-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
// IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
|
||||
// Though sounds scary, different mark name?
|
||||
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
|
||||
if let Some(remote_storage) = &remote_storage {
|
||||
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
|
||||
.await
|
||||
.context("remote_mark")?
|
||||
}
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-create-local-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
create_local_delete_mark(conf, &tenant.tenant_id)
|
||||
.await
|
||||
.context("local delete mark")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-background", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-background"
|
||||
))?
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn mark_in_progress(&mut self) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::Finished => anyhow::bail!("Bug. Is in finished state"),
|
||||
Self::InProgress { .. } => { /* We're in a retry */ }
|
||||
Self::NotStarted => { /* Fresh start */ }
|
||||
}
|
||||
|
||||
*self = Self::InProgress;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn should_resume_deletion(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant: &Tenant,
|
||||
) -> Result<Option<DeletionGuard>, DeleteTenantError> {
|
||||
let acquire = |t: &Tenant| {
|
||||
Some(
|
||||
Arc::clone(&t.delete_progress)
|
||||
.try_lock_owned()
|
||||
.expect("we're the only owner during init"),
|
||||
)
|
||||
};
|
||||
|
||||
let tenant_id = tenant.tenant_id;
|
||||
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
|
||||
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
|
||||
return Ok(acquire(tenant));
|
||||
}
|
||||
|
||||
// If remote storage is there we rely on it
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?;
|
||||
|
||||
let attempt = 1;
|
||||
loop {
|
||||
match remote_storage.download(&remote_mark_path).await {
|
||||
Ok(_) => return Ok(acquire(tenant)),
|
||||
Err(e) => {
|
||||
if matches!(e, DownloadError::NotFound) {
|
||||
return Ok(None);
|
||||
}
|
||||
if attempt > SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS {
|
||||
return Err(anyhow::anyhow!(e))?;
|
||||
}
|
||||
|
||||
warn!(
|
||||
"failed to fetch tenant deletion mark at {} attempt {}",
|
||||
&remote_mark_path, attempt
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
pub(crate) async fn resume(
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
|
||||
tenant
|
||||
.set_stopping(progress, true)
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
// Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
|
||||
let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
|
||||
if let Some(background) = background_jobs_can_start {
|
||||
info!("waiting for backgound jobs barrier");
|
||||
background.clone().wait().await;
|
||||
info!("ready for backgound jobs barrier");
|
||||
}
|
||||
|
||||
// Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
|
||||
let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
|
||||
if timelines_path.exists() {
|
||||
tenant.load(init_order, ctx).await.context("load")?;
|
||||
}
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
tenant.conf,
|
||||
tenant.remote_storage.clone(),
|
||||
tenants,
|
||||
tenant,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn prepare(
|
||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
|
||||
let m = tenants.read().await;
|
||||
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
.ok_or(GetTenantError::NotFound(tenant_id))?;
|
||||
|
||||
// FIXME: unsure about active only. Our init jobs may not be cancellable properly,
|
||||
// so at least for now allow deletions only for active tenants. TODO recheck
|
||||
// Broken and Stopping is needed for retries.
|
||||
if !matches!(
|
||||
tenant.current_state(),
|
||||
TenantState::Active | TenantState::Broken { .. }
|
||||
) {
|
||||
return Err(DeleteTenantError::InvalidState(tenant.current_state()));
|
||||
}
|
||||
|
||||
let guard = Arc::clone(&tenant.delete_progress)
|
||||
.try_lock_owned()
|
||||
.map_err(|_| DeleteTenantError::AlreadyInProgress)?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-shutdown", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
|
||||
});
|
||||
|
||||
// make pageserver shutdown not to wait for our completion
|
||||
let (_, progress) = completion::channel();
|
||||
|
||||
// It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
|
||||
// i e it is an error to do:
|
||||
// tenant.set_stopping
|
||||
// tenant.shutdown
|
||||
// Its also bad that we're holding tenants.read here.
|
||||
// TODO relax set_stopping to be idempotent?
|
||||
if tenant.shutdown(progress, false).await.is_err() {
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"tenant shutdown is already in progress"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok((Arc::clone(tenant), guard))
|
||||
}
|
||||
|
||||
fn schedule_background(
|
||||
guard: OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
) {
|
||||
let tenant_id = tenant.tenant_id;
|
||||
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::TimelineDeletionWorker,
|
||||
Some(tenant_id),
|
||||
None,
|
||||
"tenant_delete",
|
||||
false,
|
||||
async move {
|
||||
if let Err(err) =
|
||||
Self::background(guard, conf, remote_storage, tenants, &tenant).await
|
||||
{
|
||||
error!("Error: {err:#}");
|
||||
tenant.set_broken(format!("{err:#}")).await;
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
.instrument({
|
||||
let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
|
||||
span.follows_from(Span::current());
|
||||
span
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
async fn background(
|
||||
mut guard: OwnedMutexGuard<Self>,
|
||||
conf: &PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
// Tree sort timelines, schedule delete for them. Mention retries from the console side.
|
||||
// Note that if deletion fails we dont mark timelines as broken,
|
||||
// the whole tenant will become broken as by `Self::schedule_background` logic
|
||||
let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
|
||||
.await
|
||||
.context("schedule_ordered_timeline_deletions")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-polling-ongoing-deletions"
|
||||
))?
|
||||
});
|
||||
|
||||
// Wait for deletions that were already running at the moment when tenant deletion was requested.
|
||||
// When we can lock deletion guard it means that corresponding timeline deletion finished.
|
||||
for (guard, timeline_id) in already_running_timeline_deletions {
|
||||
let flow = guard.lock().await;
|
||||
if !flow.is_finished() {
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"already running timeline deletion failed: {timeline_id}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
let timelines_path = conf.timelines_path(&tenant.tenant_id);
|
||||
// May not exist if we fail in cleanup_remaining_fs_traces after removing it
|
||||
if timelines_path.exists() {
|
||||
// sanity check to guard against layout changes
|
||||
ensure_timelines_dir_empty(&timelines_path)
|
||||
.await
|
||||
.context("timelines dir not empty")?;
|
||||
}
|
||||
|
||||
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
|
||||
))?
|
||||
});
|
||||
|
||||
cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
|
||||
.await
|
||||
.context("cleanup_remaining_fs_traces")?;
|
||||
|
||||
let mut locked = tenants.write().await;
|
||||
if locked.remove(&tenant.tenant_id).is_none() {
|
||||
warn!("Tenant got removed from tenants map during deletion");
|
||||
};
|
||||
|
||||
*guard = Self::Finished;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -20,7 +20,6 @@
|
||||
//!
|
||||
use byteorder::{ReadBytesExt, BE};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use either::Either;
|
||||
use hex;
|
||||
use std::{cmp::Ordering, io, result};
|
||||
use thiserror::Error;
|
||||
@@ -257,77 +256,63 @@ where
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
let mut stack = Vec::new();
|
||||
stack.push((self.root_blk, None));
|
||||
while let Some((node_blknum, opt_iter)) = stack.pop() {
|
||||
// Locate the node.
|
||||
let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
|
||||
self.search_recurse(self.root_blk, search_key, dir, &mut visitor)
|
||||
}
|
||||
|
||||
let node = OnDiskNode::deparse(node_buf.as_ref())?;
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
let suffix_len = node.suffix_len as usize;
|
||||
fn search_recurse<V>(
|
||||
&self,
|
||||
node_blknum: u32,
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
visitor: &mut V,
|
||||
) -> Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
// Locate the node.
|
||||
let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
|
||||
|
||||
assert!(node.num_children > 0);
|
||||
let node = OnDiskNode::deparse(node_buf.as_ref())?;
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
let suffix_len = node.suffix_len as usize;
|
||||
|
||||
let mut keybuf = Vec::new();
|
||||
keybuf.extend(node.prefix);
|
||||
keybuf.resize(prefix_len + suffix_len, 0);
|
||||
assert!(node.num_children > 0);
|
||||
|
||||
let mut iter = if let Some(iter) = opt_iter {
|
||||
iter
|
||||
} else if dir == VisitDirection::Forwards {
|
||||
// Locate the first match
|
||||
let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
|
||||
Ok(idx) => idx,
|
||||
Err(idx) => {
|
||||
if node.level == 0 {
|
||||
// Imagine that the node contains the following keys:
|
||||
//
|
||||
// 1
|
||||
// 3 <-- idx
|
||||
// 5
|
||||
//
|
||||
// If the search key is '2' and there is exact match,
|
||||
// the binary search would return the index of key
|
||||
// '3'. That's cool, '3' is the first key to return.
|
||||
idx
|
||||
} else {
|
||||
// This is an internal page, so each key represents a lower
|
||||
// bound for what's in the child page. If there is no exact
|
||||
// match, we have to return the *previous* entry.
|
||||
//
|
||||
// 1 <-- return this
|
||||
// 3 <-- idx
|
||||
// 5
|
||||
idx.saturating_sub(1)
|
||||
}
|
||||
}
|
||||
};
|
||||
Either::Left(idx..node.num_children.into())
|
||||
} else {
|
||||
let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
|
||||
Ok(idx) => {
|
||||
// Exact match. That's the first entry to return, and walk
|
||||
// backwards from there.
|
||||
let mut keybuf = Vec::new();
|
||||
keybuf.extend(node.prefix);
|
||||
keybuf.resize(prefix_len + suffix_len, 0);
|
||||
|
||||
if dir == VisitDirection::Forwards {
|
||||
// Locate the first match
|
||||
let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
|
||||
Ok(idx) => idx,
|
||||
Err(idx) => {
|
||||
if node.level == 0 {
|
||||
// Imagine that the node contains the following keys:
|
||||
//
|
||||
// 1
|
||||
// 3 <-- idx
|
||||
// 5
|
||||
//
|
||||
// If the search key is '2' and there is exact match,
|
||||
// the binary search would return the index of key
|
||||
// '3'. That's cool, '3' is the first key to return.
|
||||
idx
|
||||
} else {
|
||||
// This is an internal page, so each key represents a lower
|
||||
// bound for what's in the child page. If there is no exact
|
||||
// match, we have to return the *previous* entry.
|
||||
//
|
||||
// 1 <-- return this
|
||||
// 3 <-- idx
|
||||
// 5
|
||||
idx.saturating_sub(1)
|
||||
}
|
||||
Err(idx) => {
|
||||
// No exact match. The binary search returned the index of the
|
||||
// first key that's > search_key. Back off by one, and walk
|
||||
// backwards from there.
|
||||
if let Some(idx) = idx.checked_sub(1) {
|
||||
idx
|
||||
} else {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
};
|
||||
Either::Right((0..=idx).rev())
|
||||
}
|
||||
};
|
||||
|
||||
// idx points to the first match now. Keep going from there
|
||||
while let Some(idx) = iter.next() {
|
||||
let key_off = idx * suffix_len;
|
||||
let mut key_off = idx * suffix_len;
|
||||
while idx < node.num_children as usize {
|
||||
let suffix = &node.keys[key_off..key_off + suffix_len];
|
||||
keybuf[prefix_len..].copy_from_slice(suffix);
|
||||
let value = node.value(idx);
|
||||
@@ -338,8 +323,52 @@ where
|
||||
return Ok(false);
|
||||
}
|
||||
} else {
|
||||
stack.push((node_blknum, Some(iter)));
|
||||
stack.push((value.to_blknum(), None));
|
||||
#[allow(clippy::collapsible_if)]
|
||||
if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
idx += 1;
|
||||
key_off += suffix_len;
|
||||
}
|
||||
} else {
|
||||
let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
|
||||
Ok(idx) => {
|
||||
// Exact match. That's the first entry to return, and walk
|
||||
// backwards from there. (The loop below starts from 'idx -
|
||||
// 1', so add one here to compensate.)
|
||||
idx + 1
|
||||
}
|
||||
Err(idx) => {
|
||||
// No exact match. The binary search returned the index of the
|
||||
// first key that's > search_key. Back off by one, and walk
|
||||
// backwards from there. (The loop below starts from idx - 1,
|
||||
// so we don't need to subtract one here)
|
||||
idx
|
||||
}
|
||||
};
|
||||
|
||||
// idx points to the first match + 1 now. Keep going from there.
|
||||
let mut key_off = idx * suffix_len;
|
||||
while idx > 0 {
|
||||
idx -= 1;
|
||||
key_off -= suffix_len;
|
||||
let suffix = &node.keys[key_off..key_off + suffix_len];
|
||||
keybuf[prefix_len..].copy_from_slice(suffix);
|
||||
let value = node.value(idx);
|
||||
#[allow(clippy::collapsible_if)]
|
||||
if node.level == 0 {
|
||||
// leaf
|
||||
if !visitor(&keybuf, value.to_u64()) {
|
||||
return Ok(false);
|
||||
}
|
||||
} else {
|
||||
#[allow(clippy::collapsible_if)]
|
||||
if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
if idx == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! used to keep in-memory layers spilled on disk.
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
@@ -10,7 +10,7 @@ use once_cell::sync::Lazy;
|
||||
use std::cmp::min;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::{self};
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::ops::DerefMut;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
@@ -19,9 +19,6 @@ use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
mod buffer_pool;
|
||||
mod dirty_buffer;
|
||||
|
||||
///
|
||||
/// This is the global cache of file descriptors (File objects).
|
||||
///
|
||||
@@ -97,13 +94,27 @@ impl EphemeralFile {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_buf_for_write(&self, blkno: u32) -> Result<dirty_buffer::Buffer, io::Error> {
|
||||
let pool = buffer_pool::get();
|
||||
let mut buf = pool.get_buffer();
|
||||
// Read the page from disk into the buffer
|
||||
// TODO: if we're overwriting the whole page, no need to read it in first
|
||||
self.fill_buffer(buf.deref_mut(), blkno)?;
|
||||
Ok(dirty_buffer::Buffer::new(self, buf, blkno))
|
||||
fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, io::Error> {
|
||||
// Look up the right page
|
||||
let cache = page_cache::get();
|
||||
let mut write_guard = match cache
|
||||
.write_ephemeral_buf(self.file_id, blkno)
|
||||
.map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
|
||||
{
|
||||
WriteBufResult::Found(guard) => guard,
|
||||
WriteBufResult::NotFound(mut guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
// TODO: if we're overwriting the whole page, no need to read it in first
|
||||
self.fill_buffer(guard.deref_mut(), blkno)?;
|
||||
guard.mark_valid();
|
||||
|
||||
// And then fall through to modify it.
|
||||
guard
|
||||
}
|
||||
};
|
||||
write_guard.mark_dirty();
|
||||
|
||||
Ok(write_guard)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -116,6 +127,75 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
impl FileExt for EphemeralFile {
|
||||
fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, io::Error> {
|
||||
// Look up the right page
|
||||
let blkno = (offset / PAGE_SZ as u64) as u32;
|
||||
let off = offset as usize % PAGE_SZ;
|
||||
let len = min(PAGE_SZ - off, dstbuf.len());
|
||||
|
||||
let read_guard;
|
||||
let mut write_guard;
|
||||
|
||||
let cache = page_cache::get();
|
||||
let buf = match cache
|
||||
.read_ephemeral_buf(self.file_id, blkno)
|
||||
.map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
|
||||
{
|
||||
ReadBufResult::Found(guard) => {
|
||||
read_guard = guard;
|
||||
read_guard.as_ref()
|
||||
}
|
||||
ReadBufResult::NotFound(guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
write_guard = guard;
|
||||
self.fill_buffer(write_guard.deref_mut(), blkno)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// And then fall through to read the requested slice from the
|
||||
// buffer.
|
||||
write_guard.as_ref()
|
||||
}
|
||||
};
|
||||
|
||||
dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]);
|
||||
Ok(len)
|
||||
}
|
||||
|
||||
fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, io::Error> {
|
||||
// Look up the right page
|
||||
let blkno = (offset / PAGE_SZ as u64) as u32;
|
||||
let off = offset as usize % PAGE_SZ;
|
||||
let len = min(PAGE_SZ - off, srcbuf.len());
|
||||
|
||||
let mut write_guard;
|
||||
let cache = page_cache::get();
|
||||
let buf = match cache
|
||||
.write_ephemeral_buf(self.file_id, blkno)
|
||||
.map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
|
||||
{
|
||||
WriteBufResult::Found(guard) => {
|
||||
write_guard = guard;
|
||||
write_guard.deref_mut()
|
||||
}
|
||||
WriteBufResult::NotFound(guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
// TODO: if we're overwriting the whole page, no need to read it in first
|
||||
write_guard = guard;
|
||||
self.fill_buffer(write_guard.deref_mut(), blkno)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// And then fall through to modify it.
|
||||
write_guard.deref_mut()
|
||||
}
|
||||
};
|
||||
|
||||
buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]);
|
||||
write_guard.mark_dirty();
|
||||
Ok(len)
|
||||
}
|
||||
}
|
||||
|
||||
impl BlobWriter for EphemeralFile {
|
||||
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
|
||||
let pos = self.size;
|
||||
@@ -137,7 +217,6 @@ impl BlobWriter for EphemeralFile {
|
||||
// it needs to be split across pages
|
||||
buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
|
||||
blknum += 1;
|
||||
buf.writeback()?;
|
||||
buf = self.get_buf_for_write(blknum)?;
|
||||
buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
|
||||
off = 4 - thislen;
|
||||
@@ -153,7 +232,6 @@ impl BlobWriter for EphemeralFile {
|
||||
let mut page_remain = PAGE_SZ - off;
|
||||
if page_remain == 0 {
|
||||
blknum += 1;
|
||||
buf.writeback()?;
|
||||
buf = self.get_buf_for_write(blknum)?;
|
||||
off = 0;
|
||||
page_remain = PAGE_SZ;
|
||||
@@ -163,8 +241,7 @@ impl BlobWriter for EphemeralFile {
|
||||
off += this_blk_len;
|
||||
buf_remain = &buf_remain[this_blk_len..];
|
||||
}
|
||||
|
||||
buf.writeback()?;
|
||||
drop(buf);
|
||||
|
||||
if srcbuf.len() < 0x80 {
|
||||
self.size += 1;
|
||||
@@ -179,6 +256,10 @@ impl BlobWriter for EphemeralFile {
|
||||
|
||||
impl Drop for EphemeralFile {
|
||||
fn drop(&mut self) {
|
||||
// drop all pages from page cache
|
||||
let cache = page_cache::get();
|
||||
cache.drop_buffers_for_ephemeral(self.file_id);
|
||||
|
||||
// remove entry from the hash map
|
||||
EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
|
||||
|
||||
@@ -200,24 +281,62 @@ impl Drop for EphemeralFile {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
|
||||
if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
|
||||
match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => Err(io::Error::new(
|
||||
ErrorKind::Other,
|
||||
format!(
|
||||
"failed to write back to ephemeral file at {} error: {}",
|
||||
file.path.display(),
|
||||
e
|
||||
),
|
||||
)),
|
||||
}
|
||||
} else {
|
||||
Err(io::Error::new(
|
||||
ErrorKind::Other,
|
||||
"could not write back page, not found in ephemeral files hash",
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for EphemeralFile {
|
||||
type BlockLease = buffer_pool::Handle;
|
||||
type BlockLease = page_cache::PageReadGuard<'static>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, io::Error> {
|
||||
// Read the page from disk into the buffer
|
||||
let pool = buffer_pool::get();
|
||||
let mut buf = pool.get_buffer();
|
||||
self.fill_buffer(buf.deref_mut(), blknum)?;
|
||||
Ok(buf)
|
||||
// Look up the right page
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache
|
||||
.read_ephemeral_buf(self.file_id, blknum)
|
||||
.map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
|
||||
{
|
||||
ReadBufResult::Found(guard) => return Ok(guard),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// Swap for read lock
|
||||
continue;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
|
||||
io::Error::new(ErrorKind::Other, format!("{context}: {e:#}"))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use rand::{thread_rng, RngCore};
|
||||
use rand::{seq::SliceRandom, thread_rng, RngCore};
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
|
||||
@@ -238,6 +357,50 @@ mod tests {
|
||||
Ok((conf, tenant_id, timeline_id))
|
||||
}
|
||||
|
||||
// Helper function to slurp contents of a file, starting at the current position,
|
||||
// into a string
|
||||
fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, io::Error> {
|
||||
let mut buf = Vec::new();
|
||||
buf.resize(len, 0u8);
|
||||
|
||||
efile.read_exact_at(&mut buf, offset)?;
|
||||
|
||||
Ok(String::from_utf8_lossy(&buf)
|
||||
.trim_end_matches('\0')
|
||||
.to_string())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ephemeral_files() -> Result<(), io::Error> {
|
||||
let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?;
|
||||
|
||||
let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
|
||||
file_a.write_all_at(b"foo", 0)?;
|
||||
assert_eq!("foo", read_string(&file_a, 0, 20)?);
|
||||
|
||||
file_a.write_all_at(b"bar", 3)?;
|
||||
assert_eq!("foobar", read_string(&file_a, 0, 20)?);
|
||||
|
||||
// Open a lot of files, enough to cause some page evictions.
|
||||
let mut efiles = Vec::new();
|
||||
for fileno in 0..100 {
|
||||
let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?;
|
||||
assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
|
||||
efiles.push((fileno, efile));
|
||||
}
|
||||
|
||||
// Check that all the files can still be read from. Use them in random order for
|
||||
// good measure.
|
||||
efiles.as_mut_slice().shuffle(&mut thread_rng());
|
||||
for (fileno, efile) in efiles.iter_mut() {
|
||||
assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
//! Buffer pool for ephemeral file buffers.
|
||||
//!
|
||||
//! Currently this is a very simple implementation that just uses `malloc`.
|
||||
//! But the interface is such that we can switch to a more sophisticated
|
||||
//! implementation later, e.g., one that caps that amount of memory used.
|
||||
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
|
||||
pub struct BufferPool;
|
||||
|
||||
const POOL: BufferPool = BufferPool;
|
||||
|
||||
pub(super) fn get() -> &'static BufferPool {
|
||||
&POOL
|
||||
}
|
||||
|
||||
impl BufferPool {
|
||||
/// Get a [`Handle`] to a buffer in the pool.
|
||||
///
|
||||
/// The buffer is guaranteed to be zeroed out.
|
||||
///
|
||||
/// The implementation may block to wait for buffers to become available,
|
||||
/// and a future async version of this method may `.await` internally to
|
||||
/// wait for buffers to become available.
|
||||
///
|
||||
/// To avoid deadlocks, a thread/task must get all the buffers it needs
|
||||
/// with a single call to `get_buffer`. Without this rule, a deadlock
|
||||
/// can happen. Take for example a buffer pool with 2 buffers X, Y
|
||||
/// and a program with two threads A and B, each requiring 2 buffers.
|
||||
/// If A gets X and B gets Y, then both threads will block forever trying
|
||||
/// to get their second buffer.
|
||||
pub fn get_buffer(&self) -> Handle {
|
||||
Handle {
|
||||
data: vec![0; PAGE_SZ],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Handle {
|
||||
data: Vec<u8>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Handle {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("Handle")
|
||||
.field("data", &self.data.as_ptr())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for Handle {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
fn deref(&self) -> &Self::Target {
|
||||
let slice: &[u8] = &self.data[..];
|
||||
slice.try_into().unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for Handle {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
let slice: &mut [u8] = &mut self.data[..];
|
||||
slice.try_into().unwrap()
|
||||
}
|
||||
}
|
||||
@@ -1,111 +0,0 @@
|
||||
//! Newtypes to ensure that dirty buffers are written back to the filesystem before they are dropped.
|
||||
|
||||
use std::io::ErrorKind;
|
||||
use std::ops::Deref;
|
||||
use std::ops::DerefMut;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
|
||||
use super::buffer_pool;
|
||||
use super::EphemeralFile;
|
||||
|
||||
pub(super) struct Buffer<'f> {
|
||||
inner: Inner<'f>,
|
||||
}
|
||||
|
||||
enum Inner<'f> {
|
||||
Dirty {
|
||||
ephemeral_file: &'f EphemeralFile,
|
||||
buf: buffer_pool::Handle,
|
||||
blkno: u32,
|
||||
},
|
||||
WritebackOngoing,
|
||||
WrittenBack,
|
||||
WritebackError,
|
||||
Dropped,
|
||||
}
|
||||
|
||||
impl<'f> Buffer<'f> {
|
||||
pub(super) fn new(
|
||||
ephemeral_file: &'f EphemeralFile,
|
||||
buf: buffer_pool::Handle,
|
||||
blkno: u32,
|
||||
) -> Self {
|
||||
Self {
|
||||
inner: Inner::Dirty {
|
||||
ephemeral_file,
|
||||
buf,
|
||||
blkno,
|
||||
},
|
||||
}
|
||||
}
|
||||
pub(super) fn writeback(mut self) -> Result<(), std::io::Error> {
|
||||
let Inner::Dirty {
|
||||
ephemeral_file,
|
||||
buf,
|
||||
blkno,
|
||||
} = std::mem::replace(&mut self.inner, Inner::WritebackOngoing) else {
|
||||
unreachable!("writeback consumes");
|
||||
};
|
||||
match ephemeral_file
|
||||
.file
|
||||
.write_all_at(buf.deref(), blkno as u64 * PAGE_SZ as u64)
|
||||
{
|
||||
Ok(_) => {
|
||||
self.inner = Inner::WrittenBack;
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
self.inner = Inner::WritebackError;
|
||||
Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
format!(
|
||||
"failed to write back to ephemeral file at {} error: {}",
|
||||
ephemeral_file.file.path.display(),
|
||||
e
|
||||
),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'f> Deref for Buffer<'f> {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &[u8] {
|
||||
match &self.inner {
|
||||
Inner::Dirty { buf, .. } => &**buf,
|
||||
Inner::WritebackOngoing => unreachable!("writeback consumes"),
|
||||
Inner::WrittenBack => unreachable!("writeback consumes"),
|
||||
Inner::WritebackError => unreachable!("writeback consumes"),
|
||||
Inner::Dropped => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'f> DerefMut for Buffer<'f> {
|
||||
fn deref_mut(&mut self) -> &mut [u8] {
|
||||
match &mut self.inner {
|
||||
Inner::Dirty { buf, .. } => &mut **buf,
|
||||
Inner::WritebackOngoing => unreachable!("writeback consumes"),
|
||||
Inner::WrittenBack => unreachable!("writeback consumes"),
|
||||
Inner::WritebackError => unreachable!("writeback consumes"),
|
||||
Inner::Dropped => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Buffer<'_> {
|
||||
fn drop(&mut self) {
|
||||
let prev = std::mem::replace(&mut self.inner, Inner::Dropped);
|
||||
match prev {
|
||||
// TODO: check this at compile time
|
||||
Inner::Dirty { .. } => panic!("dropped dirty buffer, need to writeback() first"),
|
||||
Inner::WritebackOngoing => unreachable!("transitory state"),
|
||||
Inner::WrittenBack | Inner::WritebackError => {}
|
||||
Inner::Dropped => unreachable!("drop only happens once"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -20,19 +20,17 @@ use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
|
||||
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::delete::DeleteTenantError;
|
||||
use super::timeline::delete::DeleteTimelineFlow;
|
||||
|
||||
/// The tenants known to the pageserver.
|
||||
/// The enum variants are used to distinguish the different states that the pageserver can be in.
|
||||
pub(crate) enum TenantsMap {
|
||||
enum TenantsMap {
|
||||
/// [`init_tenant_mgr`] is not done yet.
|
||||
Initializing,
|
||||
/// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
|
||||
@@ -44,13 +42,13 @@ pub(crate) enum TenantsMap {
|
||||
}
|
||||
|
||||
impl TenantsMap {
|
||||
pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
|
||||
fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
|
||||
match self {
|
||||
TenantsMap::Initializing => None,
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id),
|
||||
}
|
||||
}
|
||||
pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
|
||||
fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
|
||||
match self {
|
||||
TenantsMap::Initializing => None,
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
|
||||
@@ -99,9 +97,7 @@ pub async fn init_tenant_mgr(
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// This case happens if we:
|
||||
// * crash during attach before creating the attach marker file
|
||||
// * crash during tenant delete before removing tenant directory
|
||||
// This case happens if we crash during attach before creating the attach marker file
|
||||
let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
|
||||
format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
|
||||
})?;
|
||||
@@ -128,7 +124,6 @@ pub async fn init_tenant_mgr(
|
||||
broker_client.clone(),
|
||||
remote_storage.clone(),
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
&ctx,
|
||||
) {
|
||||
Ok(tenant) => {
|
||||
@@ -159,13 +154,12 @@ pub async fn init_tenant_mgr(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn schedule_local_tenant_processing(
|
||||
pub fn schedule_local_tenant_processing(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
anyhow::ensure!(
|
||||
@@ -225,7 +219,6 @@ pub(crate) fn schedule_local_tenant_processing(
|
||||
broker_client,
|
||||
remote_storage,
|
||||
init_order,
|
||||
tenants,
|
||||
ctx,
|
||||
)
|
||||
};
|
||||
@@ -363,7 +356,7 @@ pub async fn create_tenant(
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
let created_tenant =
|
||||
schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, &TENANTS, ctx)?;
|
||||
schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -424,14 +417,6 @@ pub async fn get_tenant(
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn delete_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant_id).await
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTimelineError {
|
||||
#[error("Tenant {0}")]
|
||||
@@ -447,7 +432,7 @@ pub async fn delete_timeline(
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
|
||||
DeleteTimelineFlow::run(&tenant, timeline_id).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -522,7 +507,7 @@ pub async fn load_tenant(
|
||||
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
|
||||
}
|
||||
|
||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, &TENANTS, ctx)
|
||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
|
||||
.with_context(|| {
|
||||
format!("Failed to schedule tenant processing in path {tenant_path:?}")
|
||||
})?;
|
||||
@@ -603,7 +588,7 @@ pub async fn attach_tenant(
|
||||
.context("check for attach marker file existence")?;
|
||||
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
|
||||
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, &TENANTS, ctx)?;
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
|
||||
@@ -16,7 +16,6 @@ use anyhow::{ensure, Result};
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::OnceLock;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
@@ -43,16 +42,14 @@ pub struct InMemoryLayer {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
|
||||
///
|
||||
/// This layer contains all the changes from 'start_lsn'. The
|
||||
/// start is inclusive.
|
||||
///
|
||||
start_lsn: Lsn,
|
||||
|
||||
/// Frozen layers have an exclusive end LSN.
|
||||
/// Writes are only allowed when this is `None`.
|
||||
end_lsn: OnceLock<Lsn>,
|
||||
|
||||
/// The above fields never change, except for `end_lsn`, which is only set once.
|
||||
/// All other changing parts are in `inner`, and protected by a mutex.
|
||||
/// The above fields never change. The parts that do change are in 'inner',
|
||||
/// and protected by mutex.
|
||||
inner: RwLock<InMemoryLayerInner>,
|
||||
}
|
||||
|
||||
@@ -60,16 +57,21 @@ impl std::fmt::Debug for InMemoryLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("InMemoryLayer")
|
||||
.field("start_lsn", &self.start_lsn)
|
||||
.field("end_lsn", &self.end_lsn)
|
||||
.field("inner", &self.inner)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// Frozen layers have an exclusive end LSN.
|
||||
/// Writes are only allowed when this is None
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
///
|
||||
/// All versions of all pages in the layer are kept here. Indexed
|
||||
/// by block number and LSN. The value is an offset into the
|
||||
/// ephemeral file where the page version is stored.
|
||||
///
|
||||
index: HashMap<Key, VecMap<Lsn, u64>>,
|
||||
|
||||
/// The values are stored in a serialized format in this file.
|
||||
@@ -80,7 +82,15 @@ pub struct InMemoryLayerInner {
|
||||
|
||||
impl std::fmt::Debug for InMemoryLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("InMemoryLayerInner").finish()
|
||||
f.debug_struct("InMemoryLayerInner")
|
||||
.field("end_lsn", &self.end_lsn)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl InMemoryLayerInner {
|
||||
fn assert_writeable(&self) {
|
||||
assert!(self.end_lsn.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,21 +101,13 @@ impl InMemoryLayer {
|
||||
|
||||
pub fn info(&self) -> InMemoryLayerInfo {
|
||||
let lsn_start = self.start_lsn;
|
||||
let lsn_end = self.inner.read().unwrap().end_lsn;
|
||||
|
||||
if let Some(&lsn_end) = self.end_lsn.get() {
|
||||
InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
|
||||
} else {
|
||||
InMemoryLayerInfo::Open { lsn_start }
|
||||
match lsn_end {
|
||||
Some(lsn_end) => InMemoryLayerInfo::Frozen { lsn_start, lsn_end },
|
||||
None => InMemoryLayerInfo::Open { lsn_start },
|
||||
}
|
||||
}
|
||||
|
||||
fn assert_writable(&self) {
|
||||
assert!(self.end_lsn.get().is_none());
|
||||
}
|
||||
|
||||
fn end_lsn_or_max(&self) -> Lsn {
|
||||
self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
@@ -115,7 +117,14 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.start_lsn..self.end_lsn_or_max()
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn = if let Some(end_lsn) = inner.end_lsn {
|
||||
end_lsn
|
||||
} else {
|
||||
Lsn(u64::MAX)
|
||||
};
|
||||
self.start_lsn..end_lsn
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
@@ -127,7 +136,11 @@ impl Layer for InMemoryLayer {
|
||||
async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_str = self.end_lsn_or_max();
|
||||
let end_str = inner
|
||||
.end_lsn
|
||||
.as_ref()
|
||||
.map(Lsn::to_string)
|
||||
.unwrap_or_default();
|
||||
|
||||
println!(
|
||||
"----- in-memory layer for tli {} LSNs {}-{} ----",
|
||||
@@ -223,7 +236,9 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
impl std::fmt::Display for InMemoryLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let end_lsn = self.end_lsn_or_max();
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
|
||||
write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
|
||||
}
|
||||
}
|
||||
@@ -255,8 +270,8 @@ impl InMemoryLayer {
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
start_lsn,
|
||||
end_lsn: OnceLock::new(),
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
index: HashMap::new(),
|
||||
file,
|
||||
}),
|
||||
@@ -270,7 +285,7 @@ impl InMemoryLayer {
|
||||
pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
self.assert_writable();
|
||||
inner.assert_writeable();
|
||||
|
||||
let off = {
|
||||
SER_BUFFER.with(|x| -> Result<_> {
|
||||
@@ -302,10 +317,10 @@ impl InMemoryLayer {
|
||||
/// Records the end_lsn for non-dropped layers.
|
||||
/// `end_lsn` is exclusive
|
||||
pub fn freeze(&self, end_lsn: Lsn) {
|
||||
let inner = self.inner.write().unwrap();
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
assert!(self.start_lsn < end_lsn);
|
||||
self.end_lsn.set(end_lsn).expect("end_lsn set only once");
|
||||
inner.end_lsn = Some(end_lsn);
|
||||
|
||||
for vec_map in inner.index.values() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
@@ -329,14 +344,12 @@ impl InMemoryLayer {
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn = *self.end_lsn.get().unwrap();
|
||||
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
Key::MIN,
|
||||
self.start_lsn..end_lsn,
|
||||
self.start_lsn..inner.end_lsn.unwrap(),
|
||||
)?;
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
@@ -919,7 +919,7 @@ impl Timeline {
|
||||
pub fn set_state(&self, new_state: TimelineState) {
|
||||
match (self.current_state(), new_state) {
|
||||
(equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
|
||||
info!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
|
||||
warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
|
||||
}
|
||||
(st, TimelineState::Loading) => {
|
||||
error!("ignoring transition from {st:?} into Loading state");
|
||||
|
||||
@@ -219,13 +219,27 @@ async fn delete_local_layer_files(
|
||||
}
|
||||
};
|
||||
|
||||
if metadata.is_dir() {
|
||||
warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
|
||||
let r = if metadata.is_dir() {
|
||||
// There shouldnt be any directories inside timeline dir as of current layout.
|
||||
tokio::fs::remove_dir(entry.path()).await
|
||||
} else {
|
||||
tokio::fs::remove_file(entry.path()).await
|
||||
};
|
||||
|
||||
if let Err(e) = r {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
warn!(
|
||||
timeline_dir=?local_timeline_directory,
|
||||
path=?entry.path().display(),
|
||||
"got not found err while removing timeline dir, proceeding anyway"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
anyhow::bail!(anyhow::anyhow!(
|
||||
"Failed to remove: {}. Error: {e}",
|
||||
entry.path().display()
|
||||
));
|
||||
}
|
||||
.with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
|
||||
}
|
||||
|
||||
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
|
||||
@@ -345,11 +359,10 @@ impl DeleteTimelineFlow {
|
||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||
// error out if some of the shutdown tasks have already been completed!
|
||||
#[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
|
||||
pub async fn run(
|
||||
tenant: &Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
inplace: bool,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
|
||||
|
||||
@@ -367,11 +380,7 @@ impl DeleteTimelineFlow {
|
||||
))?
|
||||
});
|
||||
|
||||
if inplace {
|
||||
Self::background(guard, tenant.conf, tenant, &timeline).await?
|
||||
} else {
|
||||
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
||||
}
|
||||
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -389,8 +398,6 @@ impl DeleteTimelineFlow {
|
||||
}
|
||||
|
||||
/// Shortcut to create Timeline in stopping state and spawn deletion task.
|
||||
/// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
|
||||
#[instrument(skip_all, fields(%timeline_id))]
|
||||
pub async fn resume_deletion(
|
||||
tenant: Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
@@ -437,15 +444,11 @@ impl DeleteTimelineFlow {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(%timeline_id))]
|
||||
pub async fn cleanup_remaining_timeline_fs_traces(
|
||||
tenant: &Tenant,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
let r =
|
||||
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
|
||||
info!("Done");
|
||||
r
|
||||
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
|
||||
}
|
||||
|
||||
fn prepare(
|
||||
@@ -491,17 +494,11 @@ impl DeleteTimelineFlow {
|
||||
// At the end of the operation we're holding the guard and need to lock timelines map
|
||||
// to remove the timeline from it.
|
||||
// Always if you have two locks that are taken in different order this can result in a deadlock.
|
||||
|
||||
let delete_progress = Arc::clone(&timeline.delete_progress);
|
||||
let delete_lock_guard = match delete_progress.try_lock_owned() {
|
||||
Ok(guard) => DeletionGuard(guard),
|
||||
Err(_) => {
|
||||
// Unfortunately if lock fails arc is consumed.
|
||||
return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
|
||||
&timeline.delete_progress,
|
||||
)));
|
||||
}
|
||||
};
|
||||
let delete_lock_guard = DeletionGuard(
|
||||
Arc::clone(&timeline.delete_progress)
|
||||
.try_lock_owned()
|
||||
.map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
|
||||
);
|
||||
|
||||
timeline.set_state(TimelineState::Stopping);
|
||||
|
||||
@@ -556,14 +553,10 @@ impl DeleteTimelineFlow {
|
||||
|
||||
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
|
||||
|
||||
*guard = Self::Finished;
|
||||
*guard.0 = Self::Finished;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn is_finished(&self) -> bool {
|
||||
matches!(self, Self::Finished)
|
||||
}
|
||||
}
|
||||
|
||||
struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
|
||||
|
||||
@@ -37,14 +37,68 @@ static XLogSegNo walpropSegNo = 0;
|
||||
|
||||
/* START cloned file-local variables and functions from walsender.c */
|
||||
|
||||
/*
|
||||
* xlogreader used for replication. Note that a WAL sender doing physical
|
||||
* replication does not need xlogreader to read WAL, but it needs one to
|
||||
* keep a state of its work.
|
||||
*/
|
||||
static XLogReaderState *xlogreader = NULL;
|
||||
|
||||
/*
|
||||
* These variables keep track of the state of the timeline we're currently
|
||||
* sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
|
||||
* the timeline is not the latest timeline on this server, and the server's
|
||||
* history forked off from that timeline at sendTimeLineValidUpto.
|
||||
*/
|
||||
static TimeLineID sendTimeLine = 0;
|
||||
static TimeLineID sendTimeLineNextTLI = 0;
|
||||
static bool sendTimeLineIsHistoric = false;
|
||||
static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr;
|
||||
|
||||
/*
|
||||
* Timestamp of last ProcessRepliesIfAny() that saw a reply from the
|
||||
* standby. Set to 0 if wal_sender_timeout doesn't need to be active.
|
||||
*/
|
||||
static TimestampTz last_reply_timestamp = 0;
|
||||
|
||||
/* Have we sent a heartbeat message asking for reply, since last reply? */
|
||||
static bool waiting_for_ping_response = false;
|
||||
|
||||
static bool streamingDoneSending;
|
||||
static bool streamingDoneReceiving;
|
||||
|
||||
/* Are we there yet? */
|
||||
static bool WalSndCaughtUp = false;
|
||||
|
||||
/* Flags set by signal handlers for later service in main loop */
|
||||
static volatile sig_atomic_t got_STOPPING = false;
|
||||
|
||||
/*
|
||||
* How far have we sent WAL already? This is also advertised in
|
||||
* MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.)
|
||||
*/
|
||||
static XLogRecPtr sentPtr = InvalidXLogRecPtr;
|
||||
|
||||
static void WalSndLoop(void);
|
||||
static void XLogBroadcastWalProposer(void);
|
||||
/*
|
||||
* This is set while we are streaming. When not set
|
||||
* PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set,
|
||||
* the main loop is responsible for checking got_STOPPING and terminating when
|
||||
* it's set (after streaming any remaining WAL).
|
||||
*/
|
||||
static volatile sig_atomic_t replication_active = false;
|
||||
|
||||
typedef void (*WalSndSendDataCallback) (void);
|
||||
static void WalSndLoop(WalSndSendDataCallback send_data);
|
||||
static void XLogSendPhysical(void);
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli);
|
||||
#else
|
||||
static XLogRecPtr GetStandbyFlushRecPtr(void);
|
||||
#endif
|
||||
|
||||
static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
|
||||
TimeLineID *tli_p);
|
||||
|
||||
/* END cloned file-level variables and functions from walsender.c */
|
||||
|
||||
int
|
||||
@@ -452,7 +506,7 @@ XLogWalPropClose(XLogRecPtr recptr)
|
||||
/* START of cloned functions from walsender.c */
|
||||
|
||||
/*
|
||||
* Subscribe for new WAL and stream it in the loop to safekeepers.
|
||||
* Handle START_REPLICATION command.
|
||||
*
|
||||
* At the moment, this never returns, but an ereport(ERROR) will take us back
|
||||
* to the main loop.
|
||||
@@ -470,6 +524,18 @@ StartProposerReplication(StartReplicationCmd *cmd)
|
||||
errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
|
||||
#endif
|
||||
|
||||
/* create xlogreader for physical replication */
|
||||
xlogreader =
|
||||
XLogReaderAllocate(wal_segment_size, NULL,
|
||||
XL_ROUTINE(.segment_open = WalSndSegmentOpen,
|
||||
.segment_close = wal_segment_close),
|
||||
NULL);
|
||||
|
||||
if (!xlogreader)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of memory")));
|
||||
|
||||
/*
|
||||
* We assume here that we're logging enough information in the WAL for
|
||||
* log-shipping, since this is checked in PostmasterMain().
|
||||
@@ -503,61 +569,341 @@ StartProposerReplication(StartReplicationCmd *cmd)
|
||||
* we keep this code around to lighten the load for when we need it.
|
||||
*/
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
FlushPtr = GetFlushRecPtr(&currTLI);
|
||||
if (am_cascading_walsender)
|
||||
{
|
||||
/* this also updates ThisTimeLineID */
|
||||
FlushPtr = GetStandbyFlushRecPtr(&currTLI);
|
||||
}
|
||||
else
|
||||
FlushPtr = GetFlushRecPtr(&currTLI);
|
||||
#else
|
||||
FlushPtr = GetFlushRecPtr();
|
||||
if (am_cascading_walsender)
|
||||
{
|
||||
/* this also updates ThisTimeLineID */
|
||||
FlushPtr = GetStandbyFlushRecPtr();
|
||||
}
|
||||
else
|
||||
FlushPtr = GetFlushRecPtr();
|
||||
|
||||
currTLI = ThisTimeLineID;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* When we first start replication the standby will be behind the
|
||||
* primary. For some applications, for example synchronous
|
||||
* replication, it is important to have a clear state for this initial
|
||||
* catchup mode, so we can trigger actions when we change streaming
|
||||
* state later. We may stay in this state for a long time, which is
|
||||
* exactly why we want to be able to monitor whether or not we are
|
||||
* still here.
|
||||
*/
|
||||
WalSndSetState(WALSNDSTATE_CATCHUP);
|
||||
|
||||
/*
|
||||
* Don't allow a request to stream from a future point in WAL that
|
||||
* hasn't been flushed to disk in this server yet.
|
||||
*/
|
||||
if (FlushPtr < cmd->startpoint)
|
||||
if (cmd->timeline != 0)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
|
||||
LSN_FORMAT_ARGS(cmd->startpoint),
|
||||
LSN_FORMAT_ARGS(FlushPtr))));
|
||||
XLogRecPtr switchpoint;
|
||||
|
||||
sendTimeLine = cmd->timeline;
|
||||
if (sendTimeLine == currTLI)
|
||||
{
|
||||
sendTimeLineIsHistoric = false;
|
||||
sendTimeLineValidUpto = InvalidXLogRecPtr;
|
||||
}
|
||||
else
|
||||
{
|
||||
List *timeLineHistory;
|
||||
|
||||
sendTimeLineIsHistoric = true;
|
||||
|
||||
/*
|
||||
* Check that the timeline the client requested exists, and the
|
||||
* requested start location is on that timeline.
|
||||
*/
|
||||
timeLineHistory = readTimeLineHistory(currTLI);
|
||||
switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory,
|
||||
&sendTimeLineNextTLI);
|
||||
list_free_deep(timeLineHistory);
|
||||
|
||||
/*
|
||||
* Found the requested timeline in the history. Check that
|
||||
* requested startpoint is on that timeline in our history.
|
||||
*
|
||||
* This is quite loose on purpose. We only check that we didn't
|
||||
* fork off the requested timeline before the switchpoint. We
|
||||
* don't check that we switched *to* it before the requested
|
||||
* starting point. This is because the client can legitimately
|
||||
* request to start replication from the beginning of the WAL
|
||||
* segment that contains switchpoint, but on the new timeline, so
|
||||
* that it doesn't end up with a partial segment. If you ask for
|
||||
* too old a starting point, you'll get an error later when we
|
||||
* fail to find the requested WAL segment in pg_wal.
|
||||
*
|
||||
* XXX: we could be more strict here and only allow a startpoint
|
||||
* that's older than the switchpoint, if it's still in the same
|
||||
* WAL segment.
|
||||
*/
|
||||
if (!XLogRecPtrIsInvalid(switchpoint) &&
|
||||
switchpoint < cmd->startpoint)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errmsg("requested starting point %X/%X on timeline %u is not in this server's history",
|
||||
LSN_FORMAT_ARGS(cmd->startpoint),
|
||||
cmd->timeline),
|
||||
errdetail("This server's history forked from timeline %u at %X/%X.",
|
||||
cmd->timeline,
|
||||
LSN_FORMAT_ARGS(switchpoint))));
|
||||
}
|
||||
sendTimeLineValidUpto = switchpoint;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
sendTimeLine = currTLI;
|
||||
sendTimeLineValidUpto = InvalidXLogRecPtr;
|
||||
sendTimeLineIsHistoric = false;
|
||||
}
|
||||
|
||||
/* Start streaming from the requested point */
|
||||
sentPtr = cmd->startpoint;
|
||||
streamingDoneSending = streamingDoneReceiving = false;
|
||||
|
||||
/* Initialize shared memory status, too */
|
||||
SpinLockAcquire(&MyWalSnd->mutex);
|
||||
MyWalSnd->sentPtr = sentPtr;
|
||||
SpinLockRelease(&MyWalSnd->mutex);
|
||||
/* If there is nothing to stream, don't even enter COPY mode */
|
||||
if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto)
|
||||
{
|
||||
/*
|
||||
* When we first start replication the standby will be behind the
|
||||
* primary. For some applications, for example synchronous
|
||||
* replication, it is important to have a clear state for this initial
|
||||
* catchup mode, so we can trigger actions when we change streaming
|
||||
* state later. We may stay in this state for a long time, which is
|
||||
* exactly why we want to be able to monitor whether or not we are
|
||||
* still here.
|
||||
*/
|
||||
WalSndSetState(WALSNDSTATE_CATCHUP);
|
||||
|
||||
SyncRepInitConfig();
|
||||
/*
|
||||
* Don't allow a request to stream from a future point in WAL that
|
||||
* hasn't been flushed to disk in this server yet.
|
||||
*/
|
||||
if (FlushPtr < cmd->startpoint)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
|
||||
LSN_FORMAT_ARGS(cmd->startpoint),
|
||||
LSN_FORMAT_ARGS(FlushPtr))));
|
||||
}
|
||||
|
||||
/* Infinite send loop, never returns */
|
||||
WalSndLoop();
|
||||
/* Start streaming from the requested point */
|
||||
sentPtr = cmd->startpoint;
|
||||
|
||||
WalSndSetState(WALSNDSTATE_STARTUP);
|
||||
/* Initialize shared memory status, too */
|
||||
SpinLockAcquire(&MyWalSnd->mutex);
|
||||
MyWalSnd->sentPtr = sentPtr;
|
||||
SpinLockRelease(&MyWalSnd->mutex);
|
||||
|
||||
SyncRepInitConfig();
|
||||
|
||||
/* Main loop of walsender */
|
||||
replication_active = true;
|
||||
|
||||
WalSndLoop(XLogSendPhysical);
|
||||
|
||||
replication_active = false;
|
||||
if (got_STOPPING)
|
||||
proc_exit(0);
|
||||
WalSndSetState(WALSNDSTATE_STARTUP);
|
||||
|
||||
Assert(streamingDoneSending && streamingDoneReceiving);
|
||||
}
|
||||
|
||||
if (cmd->slotname)
|
||||
ReplicationSlotRelease();
|
||||
|
||||
/*
|
||||
* Copy is finished now. Send a single-row result set indicating the next
|
||||
* timeline.
|
||||
*/
|
||||
if (sendTimeLineIsHistoric)
|
||||
{
|
||||
char startpos_str[8 + 1 + 8 + 1];
|
||||
DestReceiver *dest;
|
||||
TupOutputState *tstate;
|
||||
TupleDesc tupdesc;
|
||||
Datum values[2];
|
||||
bool nulls[2];
|
||||
|
||||
snprintf(startpos_str, sizeof(startpos_str), "%X/%X",
|
||||
LSN_FORMAT_ARGS(sendTimeLineValidUpto));
|
||||
|
||||
dest = CreateDestReceiver(DestRemoteSimple);
|
||||
MemSet(nulls, false, sizeof(nulls));
|
||||
|
||||
/*
|
||||
* Need a tuple descriptor representing two columns. int8 may seem
|
||||
* like a surprising data type for this, but in theory int4 would not
|
||||
* be wide enough for this, as TimeLineID is unsigned.
|
||||
*/
|
||||
tupdesc = CreateTemplateTupleDesc(2);
|
||||
TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli",
|
||||
INT8OID, -1, 0);
|
||||
TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos",
|
||||
TEXTOID, -1, 0);
|
||||
|
||||
/* prepare for projection of tuple */
|
||||
tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
|
||||
|
||||
values[0] = Int64GetDatum((int64) sendTimeLineNextTLI);
|
||||
values[1] = CStringGetTextDatum(startpos_str);
|
||||
|
||||
/* send it to dest */
|
||||
do_tup_output(tstate, values, nulls);
|
||||
|
||||
end_tup_output(tstate);
|
||||
}
|
||||
|
||||
/* Send CommandComplete message */
|
||||
EndReplicationCommand("START_STREAMING");
|
||||
}
|
||||
|
||||
/*
|
||||
* Main loop that waits for LSN updates and calls the walproposer.
|
||||
* Synchronous replication sets latch in WalSndWakeup at walsender.c
|
||||
*/
|
||||
static void
|
||||
WalSndLoop(void)
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
static XLogRecPtr
|
||||
GetStandbyFlushRecPtr(TimeLineID *tli)
|
||||
{
|
||||
XLogRecPtr replayPtr;
|
||||
TimeLineID replayTLI;
|
||||
XLogRecPtr receivePtr;
|
||||
TimeLineID receiveTLI;
|
||||
XLogRecPtr result;
|
||||
|
||||
/*
|
||||
* We can safely send what's already been replayed. Also, if walreceiver
|
||||
* is streaming WAL from the same timeline, we can send anything that it
|
||||
* has streamed, but hasn't been replayed yet.
|
||||
*/
|
||||
|
||||
receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
|
||||
replayPtr = GetXLogReplayRecPtr(&replayTLI);
|
||||
|
||||
*tli = replayTLI;
|
||||
|
||||
result = replayPtr;
|
||||
if (receiveTLI == replayTLI && receivePtr > replayPtr)
|
||||
result = receivePtr;
|
||||
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
/*
|
||||
* Returns the latest point in WAL that has been safely flushed to disk, and
|
||||
* can be sent to the standby. This should only be called when in recovery,
|
||||
* ie. we're streaming to a cascaded standby.
|
||||
*
|
||||
* As a side-effect, ThisTimeLineID is updated to the TLI of the last
|
||||
* replayed WAL record.
|
||||
*/
|
||||
static XLogRecPtr
|
||||
GetStandbyFlushRecPtr(void)
|
||||
{
|
||||
XLogRecPtr replayPtr;
|
||||
TimeLineID replayTLI;
|
||||
XLogRecPtr receivePtr;
|
||||
TimeLineID receiveTLI;
|
||||
XLogRecPtr result;
|
||||
|
||||
/*
|
||||
* We can safely send what's already been replayed. Also, if walreceiver
|
||||
* is streaming WAL from the same timeline, we can send anything that it
|
||||
* has streamed, but hasn't been replayed yet.
|
||||
*/
|
||||
|
||||
receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
|
||||
replayPtr = GetXLogReplayRecPtr(&replayTLI);
|
||||
|
||||
ThisTimeLineID = replayTLI;
|
||||
|
||||
result = replayPtr;
|
||||
if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr)
|
||||
result = receivePtr;
|
||||
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* XLogReaderRoutine->segment_open callback */
|
||||
static void
|
||||
WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
|
||||
TimeLineID *tli_p)
|
||||
{
|
||||
char path[MAXPGPATH];
|
||||
|
||||
/*-------
|
||||
* When reading from a historic timeline, and there is a timeline switch
|
||||
* within this segment, read from the WAL segment belonging to the new
|
||||
* timeline.
|
||||
*
|
||||
* For example, imagine that this server is currently on timeline 5, and
|
||||
* we're streaming timeline 4. The switch from timeline 4 to 5 happened at
|
||||
* 0/13002088. In pg_wal, we have these files:
|
||||
*
|
||||
* ...
|
||||
* 000000040000000000000012
|
||||
* 000000040000000000000013
|
||||
* 000000050000000000000013
|
||||
* 000000050000000000000014
|
||||
* ...
|
||||
*
|
||||
* In this situation, when requested to send the WAL from segment 0x13, on
|
||||
* timeline 4, we read the WAL from file 000000050000000000000013. Archive
|
||||
* recovery prefers files from newer timelines, so if the segment was
|
||||
* restored from the archive on this server, the file belonging to the old
|
||||
* timeline, 000000040000000000000013, might not exist. Their contents are
|
||||
* equal up to the switchpoint, because at a timeline switch, the used
|
||||
* portion of the old segment is copied to the new file. -------
|
||||
*/
|
||||
*tli_p = sendTimeLine;
|
||||
if (sendTimeLineIsHistoric)
|
||||
{
|
||||
XLogSegNo endSegNo;
|
||||
|
||||
XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize);
|
||||
if (nextSegNo == endSegNo)
|
||||
*tli_p = sendTimeLineNextTLI;
|
||||
}
|
||||
|
||||
XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize);
|
||||
state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
|
||||
if (state->seg.ws_file >= 0)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If the file is not found, assume it's because the standby asked for a
|
||||
* too old WAL segment that has already been removed or recycled.
|
||||
*/
|
||||
if (errno == ENOENT)
|
||||
{
|
||||
char xlogfname[MAXFNAMELEN];
|
||||
int save_errno = errno;
|
||||
|
||||
XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size);
|
||||
errno = save_errno;
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("requested WAL segment %s has already been removed",
|
||||
xlogfname)));
|
||||
}
|
||||
else
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not open file \"%s\": %m",
|
||||
path)));
|
||||
}
|
||||
|
||||
|
||||
/* Main loop of walsender process that streams the WAL over Copy messages. */
|
||||
static void
|
||||
WalSndLoop(WalSndSendDataCallback send_data)
|
||||
{
|
||||
/*
|
||||
* Initialize the last reply timestamp. That enables timeout processing
|
||||
* from hereon.
|
||||
*/
|
||||
last_reply_timestamp = GetCurrentTimestamp();
|
||||
waiting_for_ping_response = false;
|
||||
|
||||
/*
|
||||
* Loop until we reach the end of this timeline or the client requests to
|
||||
* stop streaming.
|
||||
*/
|
||||
for (;;)
|
||||
{
|
||||
/* Clear any already-pending wakeups */
|
||||
@@ -565,41 +911,153 @@ WalSndLoop(void)
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
XLogBroadcastWalProposer();
|
||||
/* Process any requests or signals received recently */
|
||||
if (ConfigReloadPending)
|
||||
{
|
||||
ConfigReloadPending = false;
|
||||
ProcessConfigFile(PGC_SIGHUP);
|
||||
SyncRepInitConfig();
|
||||
}
|
||||
|
||||
if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
|
||||
WalSndSetState(WALSNDSTATE_STREAMING);
|
||||
WalProposerPoll();
|
||||
/* always true */
|
||||
if (am_wal_proposer)
|
||||
{
|
||||
send_data();
|
||||
if (WalSndCaughtUp)
|
||||
{
|
||||
if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
|
||||
WalSndSetState(WALSNDSTATE_STREAMING);
|
||||
WalProposerPoll();
|
||||
WalSndCaughtUp = false;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify walproposer about the new WAL position.
|
||||
* Send out the WAL in its normal physical/stored form.
|
||||
*
|
||||
* Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
|
||||
* but not yet sent to the client, and buffer it in the libpq output
|
||||
* buffer.
|
||||
*
|
||||
* If there is no unsent WAL remaining, WalSndCaughtUp is set to true,
|
||||
* otherwise WalSndCaughtUp is set to false.
|
||||
*/
|
||||
static void
|
||||
XLogBroadcastWalProposer(void)
|
||||
XLogSendPhysical(void)
|
||||
{
|
||||
XLogRecPtr SendRqstPtr;
|
||||
XLogRecPtr startptr;
|
||||
XLogRecPtr endptr;
|
||||
Size nbytes PG_USED_FOR_ASSERTS_ONLY;
|
||||
TimeLineID currTLI;
|
||||
|
||||
/* Start from the last sent position */
|
||||
startptr = sentPtr;
|
||||
/* If requested switch the WAL sender to the stopping state. */
|
||||
if (got_STOPPING)
|
||||
WalSndSetState(WALSNDSTATE_STOPPING);
|
||||
|
||||
/*
|
||||
* Streaming the current timeline on a primary.
|
||||
*
|
||||
* Attempt to send all data that's already been written out and
|
||||
* fsync'd to disk. We cannot go further than what's been written out
|
||||
* given the current implementation of WALRead(). And in any case
|
||||
* it's unsafe to send WAL that is not securely down to disk on the
|
||||
* primary: if the primary subsequently crashes and restarts, standbys
|
||||
* must not have applied any WAL that got lost on the primary.
|
||||
*/
|
||||
if (streamingDoneSending)
|
||||
{
|
||||
WalSndCaughtUp = true;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Figure out how far we can safely send the WAL. */
|
||||
if (sendTimeLineIsHistoric)
|
||||
{
|
||||
/*
|
||||
* Streaming an old timeline that's in this server's history, but is
|
||||
* not the one we're currently inserting or replaying. It can be
|
||||
* streamed up to the point where we switched off that timeline.
|
||||
*/
|
||||
SendRqstPtr = sendTimeLineValidUpto;
|
||||
}
|
||||
else if (am_cascading_walsender)
|
||||
{
|
||||
/*
|
||||
* Streaming the latest timeline on a standby.
|
||||
*
|
||||
* Attempt to send all WAL that has already been replayed, so that we
|
||||
* know it's valid. If we're receiving WAL through streaming
|
||||
* replication, it's also OK to send any WAL that has been received
|
||||
* but not replayed.
|
||||
*
|
||||
* The timeline we're recovering from can change, or we can be
|
||||
* promoted. In either case, the current timeline becomes historic. We
|
||||
* need to detect that so that we don't try to stream past the point
|
||||
* where we switched to another timeline. We check for promotion or
|
||||
* timeline switch after calculating FlushPtr, to avoid a race
|
||||
* condition: if the timeline becomes historic just after we checked
|
||||
* that it was still current, it's still be OK to stream it up to the
|
||||
* FlushPtr that was calculated before it became historic.
|
||||
*/
|
||||
bool becameHistoric = false;
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
endptr = GetFlushRecPtr(NULL);
|
||||
SendRqstPtr = GetStandbyFlushRecPtr(&currTLI);
|
||||
#else
|
||||
endptr = GetFlushRecPtr();
|
||||
SendRqstPtr = GetStandbyFlushRecPtr();
|
||||
currTLI = ThisTimeLineID;
|
||||
#endif
|
||||
if (!RecoveryInProgress())
|
||||
{
|
||||
/*
|
||||
* We have been promoted. RecoveryInProgress() updated
|
||||
* ThisTimeLineID to the new current timeline.
|
||||
*/
|
||||
am_cascading_walsender = false;
|
||||
becameHistoric = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Still a cascading standby. But is the timeline we're sending
|
||||
* still the one recovery is recovering from? currTLI was updated
|
||||
* by the GetStandbyFlushRecPtr() call above.
|
||||
*/
|
||||
if (sendTimeLine != currTLI)
|
||||
becameHistoric = true;
|
||||
}
|
||||
|
||||
if (becameHistoric)
|
||||
{
|
||||
/*
|
||||
* The timeline we were sending has become historic. Read the
|
||||
* timeline history file of the new timeline to see where exactly
|
||||
* we forked off from the timeline we were sending.
|
||||
*/
|
||||
List *history;
|
||||
|
||||
history = readTimeLineHistory(currTLI);
|
||||
sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI);
|
||||
|
||||
Assert(sendTimeLine < sendTimeLineNextTLI);
|
||||
list_free_deep(history);
|
||||
|
||||
sendTimeLineIsHistoric = true;
|
||||
|
||||
SendRqstPtr = sendTimeLineValidUpto;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Streaming the current timeline on a primary.
|
||||
*
|
||||
* Attempt to send all data that's already been written out and
|
||||
* fsync'd to disk. We cannot go further than what's been written out
|
||||
* given the current implementation of WALRead(). And in any case
|
||||
* it's unsafe to send WAL that is not securely down to disk on the
|
||||
* primary: if the primary subsequently crashes and restarts, standbys
|
||||
* must not have applied any WAL that got lost on the primary.
|
||||
*/
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
SendRqstPtr = GetFlushRecPtr(NULL);
|
||||
#else
|
||||
SendRqstPtr = GetFlushRecPtr();
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Record the current system time as an approximation of the time at which
|
||||
@@ -625,14 +1083,91 @@ XLogBroadcastWalProposer(void)
|
||||
* that arbitrary LSN is eventually reported as written, flushed and
|
||||
* applied, so that it can measure the elapsed time.
|
||||
*/
|
||||
LagTrackerWrite(endptr, GetCurrentTimestamp());
|
||||
LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp());
|
||||
|
||||
/*
|
||||
* If this is a historic timeline and we've reached the point where we
|
||||
* forked to the next timeline, stop streaming.
|
||||
*
|
||||
* Note: We might already have sent WAL > sendTimeLineValidUpto. The
|
||||
* startup process will normally replay all WAL that has been received
|
||||
* from the primary, before promoting, but if the WAL streaming is
|
||||
* terminated at a WAL page boundary, the valid portion of the timeline
|
||||
* might end in the middle of a WAL record. We might've already sent the
|
||||
* first half of that partial WAL record to the cascading standby, so that
|
||||
* sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't
|
||||
* replay the partial WAL record either, so it can still follow our
|
||||
* timeline switch.
|
||||
*/
|
||||
if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr)
|
||||
{
|
||||
/* close the current file. */
|
||||
if (xlogreader->seg.ws_file >= 0)
|
||||
wal_segment_close(xlogreader);
|
||||
|
||||
/* Send CopyDone */
|
||||
pq_putmessage_noblock('c', NULL, 0);
|
||||
streamingDoneSending = true;
|
||||
|
||||
WalSndCaughtUp = true;
|
||||
|
||||
elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
|
||||
LSN_FORMAT_ARGS(sendTimeLineValidUpto),
|
||||
LSN_FORMAT_ARGS(sentPtr));
|
||||
return;
|
||||
}
|
||||
|
||||
/* Do we have any work to do? */
|
||||
Assert(startptr <= endptr);
|
||||
if (endptr <= startptr)
|
||||
Assert(sentPtr <= SendRqstPtr);
|
||||
if (SendRqstPtr <= sentPtr)
|
||||
{
|
||||
WalSndCaughtUp = true;
|
||||
return;
|
||||
}
|
||||
|
||||
WalProposerBroadcast(startptr, endptr);
|
||||
/*
|
||||
* Figure out how much to send in one message. If there's no more than
|
||||
* MAX_SEND_SIZE bytes to send, send everything. Otherwise send
|
||||
* MAX_SEND_SIZE bytes, but round back to logfile or page boundary.
|
||||
*
|
||||
* The rounding is not only for performance reasons. Walreceiver relies on
|
||||
* the fact that we never split a WAL record across two messages. Since a
|
||||
* long WAL record is split at page boundary into continuation records,
|
||||
* page boundary is always a safe cut-off point. We also assume that
|
||||
* SendRqstPtr never points to the middle of a WAL record.
|
||||
*/
|
||||
startptr = sentPtr;
|
||||
endptr = startptr;
|
||||
endptr += MAX_SEND_SIZE;
|
||||
|
||||
/* if we went beyond SendRqstPtr, back off */
|
||||
if (SendRqstPtr <= endptr)
|
||||
{
|
||||
endptr = SendRqstPtr;
|
||||
if (sendTimeLineIsHistoric)
|
||||
WalSndCaughtUp = false;
|
||||
else
|
||||
WalSndCaughtUp = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* round down to page boundary. */
|
||||
endptr -= (endptr % XLOG_BLCKSZ);
|
||||
WalSndCaughtUp = false;
|
||||
}
|
||||
|
||||
nbytes = endptr - startptr;
|
||||
Assert(nbytes <= MAX_SEND_SIZE);
|
||||
|
||||
/* always true */
|
||||
if (am_wal_proposer)
|
||||
{
|
||||
WalProposerBroadcast(startptr, endptr);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* code removed for brevity */
|
||||
}
|
||||
sentPtr = endptr;
|
||||
|
||||
/* Update shared memory status */
|
||||
|
||||
@@ -8,7 +8,6 @@ use super::{
|
||||
use crate::{auth::ClientCredentials, compute, http, scram};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use tokio::time::Instant;
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
|
||||
@@ -48,9 +47,7 @@ impl Api {
|
||||
.build()?;
|
||||
|
||||
info!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
let body = match parse_body::<GetRoleSecret>(response).await {
|
||||
Ok(body) => body,
|
||||
// Error 404 is special: it's ok not to have a secret.
|
||||
@@ -91,9 +88,7 @@ impl Api {
|
||||
.build()?;
|
||||
|
||||
info!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
let body = parse_body::<WakeCompute>(response).await?;
|
||||
|
||||
// Unfortunately, ownership won't let us use `Option::ok_or` here.
|
||||
|
||||
@@ -7,14 +7,11 @@ pub mod server;
|
||||
pub mod sql_over_http;
|
||||
pub mod websocket;
|
||||
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use std::time::Duration;
|
||||
|
||||
use futures::FutureExt;
|
||||
pub use reqwest::{Request, Response, StatusCode};
|
||||
pub use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
|
||||
use tokio::time::Instant;
|
||||
use tracing::trace;
|
||||
|
||||
use crate::url::ApiUrl;
|
||||
use reqwest_middleware::RequestBuilder;
|
||||
@@ -23,21 +20,13 @@ use reqwest_middleware::RequestBuilder;
|
||||
/// because it takes care of observability (OpenTelemetry).
|
||||
/// We deliberately don't want to replace this with a public static.
|
||||
pub fn new_client() -> ClientWithMiddleware {
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.dns_resolver(Arc::new(GaiResolver::default()))
|
||||
.connection_verbose(true)
|
||||
.build()
|
||||
.expect("Failed to create http client");
|
||||
|
||||
reqwest_middleware::ClientBuilder::new(client)
|
||||
reqwest_middleware::ClientBuilder::new(reqwest::Client::new())
|
||||
.with(reqwest_tracing::TracingMiddleware::default())
|
||||
.build()
|
||||
}
|
||||
|
||||
pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
|
||||
let timeout_client = reqwest::ClientBuilder::new()
|
||||
.dns_resolver(Arc::new(GaiResolver::default()))
|
||||
.connection_verbose(true)
|
||||
.timeout(default_timout)
|
||||
.build()
|
||||
.expect("Failed to create http client with timeout");
|
||||
@@ -50,10 +39,6 @@ pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware
|
||||
// As per docs, "This middleware always errors when given requests with streaming bodies".
|
||||
// That's all right because we only use this client to send `serde_json::RawValue`, which
|
||||
// is not a stream.
|
||||
//
|
||||
// ex-maintainer note:
|
||||
// this limitation can be fixed if streaming is necessary.
|
||||
// retries will still not be performed, but it wont error immediately
|
||||
.with(RetryTransientMiddleware::new_with_policy(retry_policy))
|
||||
.build()
|
||||
}
|
||||
@@ -96,37 +81,6 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html
|
||||
use hyper::{
|
||||
client::connect::dns::{GaiResolver as HyperGaiResolver, Name},
|
||||
service::Service,
|
||||
};
|
||||
use reqwest::dns::{Addrs, Resolve, Resolving};
|
||||
#[derive(Debug)]
|
||||
pub struct GaiResolver(HyperGaiResolver);
|
||||
|
||||
impl Default for GaiResolver {
|
||||
fn default() -> Self {
|
||||
Self(HyperGaiResolver::new())
|
||||
}
|
||||
}
|
||||
|
||||
impl Resolve for GaiResolver {
|
||||
fn resolve(&self, name: Name) -> Resolving {
|
||||
let this = &mut self.0.clone();
|
||||
let start = Instant::now();
|
||||
Box::pin(
|
||||
Service::<Name>::call(this, name.clone()).map(move |result| {
|
||||
let resolve_duration = start.elapsed();
|
||||
trace!(duration = ?resolve_duration, addr = %name, "resolve host complete");
|
||||
result
|
||||
.map(|addrs| -> Addrs { Box::new(addrs) })
|
||||
.map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) })
|
||||
}),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -34,7 +34,7 @@ enum Payload {
|
||||
Batch(Vec<QueryData>),
|
||||
}
|
||||
|
||||
pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
|
||||
pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
|
||||
const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
|
||||
|
||||
static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
|
||||
@@ -214,7 +214,7 @@ pub async fn handle(
|
||||
|
||||
if request_content_length > MAX_REQUEST_SIZE {
|
||||
return Err(anyhow::anyhow!(
|
||||
"request is too large (max is {MAX_REQUEST_SIZE} bytes)"
|
||||
"request is too large (max {MAX_REQUEST_SIZE} bytes)"
|
||||
));
|
||||
}
|
||||
|
||||
@@ -292,15 +292,13 @@ async fn query_to_json<T: GenericClient>(
|
||||
// big.
|
||||
pin_mut!(row_stream);
|
||||
let mut rows: Vec<tokio_postgres::Row> = Vec::new();
|
||||
let mut current_size = 0;
|
||||
let mut curret_size = 0;
|
||||
while let Some(row) = row_stream.next().await {
|
||||
let row = row?;
|
||||
current_size += row.body_len();
|
||||
curret_size += row.body_len();
|
||||
rows.push(row);
|
||||
if current_size > MAX_RESPONSE_SIZE {
|
||||
return Err(anyhow::anyhow!(
|
||||
"response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
|
||||
));
|
||||
if curret_size > MAX_RESPONSE_SIZE {
|
||||
return Err(anyhow::anyhow!("response too large"));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -187,16 +187,12 @@ async fn ws_handler(
|
||||
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
|
||||
.map_err(|e| ApiError::BadRequest(e.into()))?;
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
if let Err(e) =
|
||||
serve_websocket(websocket, config, &cancel_map, session_id, host).await
|
||||
{
|
||||
error!(session_id = ?session_id, "error in websocket connection: {e:#}");
|
||||
}
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
|
||||
{
|
||||
error!(session_id = ?session_id, "error in websocket connection: {e:?}");
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
});
|
||||
|
||||
// Return the response so the spawned future can continue.
|
||||
Ok(response)
|
||||
@@ -221,10 +217,6 @@ async fn ws_handler(
|
||||
},
|
||||
None => Value::Null,
|
||||
};
|
||||
error!(
|
||||
?code,
|
||||
"sql-over-http per-client task finished with an error: {e:#}"
|
||||
);
|
||||
(
|
||||
json!({ "message": message, "code": code }),
|
||||
HashMap::default(),
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
//! protocol commands.
|
||||
|
||||
use anyhow::Context;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::str::{self};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, info_span, Instrument};
|
||||
|
||||
@@ -11,7 +11,6 @@ use crate::auth::check_permission;
|
||||
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
|
||||
|
||||
use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED};
|
||||
use crate::safekeeper::Term;
|
||||
use crate::timeline::TimelineError;
|
||||
use crate::wal_service::ConnectionId;
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
@@ -45,7 +44,7 @@ pub struct SafekeeperPostgresHandler {
|
||||
/// Parsed Postgres command.
|
||||
enum SafekeeperPostgresCommand {
|
||||
StartWalPush,
|
||||
StartReplication { start_lsn: Lsn, term: Option<Term> },
|
||||
StartReplication { start_lsn: Lsn },
|
||||
IdentifySystem,
|
||||
TimelineStatus,
|
||||
JSONCtrl { cmd: AppendLogicalMessage },
|
||||
@@ -56,21 +55,15 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
|
||||
Ok(SafekeeperPostgresCommand::StartWalPush)
|
||||
} else if cmd.starts_with("START_REPLICATION") {
|
||||
let re = Regex::new(
|
||||
// We follow postgres START_REPLICATION LOGICAL options to pass term.
|
||||
r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)(?: \(term='(\d+)'\))?",
|
||||
r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)",
|
||||
)
|
||||
.unwrap();
|
||||
let caps = re
|
||||
.captures(cmd)
|
||||
.context(format!("failed to parse START_REPLICATION command {}", cmd))?;
|
||||
let start_lsn =
|
||||
Lsn::from_str(&caps[1]).context("parse start LSN from START_REPLICATION command")?;
|
||||
let term = if let Some(m) = caps.get(2) {
|
||||
Some(m.as_str().parse::<u64>().context("invalid term")?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
Ok(SafekeeperPostgresCommand::StartReplication { start_lsn, term })
|
||||
let mut caps = re.captures_iter(cmd);
|
||||
let start_lsn = caps
|
||||
.next()
|
||||
.map(|cap| Lsn::from_str(&cap[1]))
|
||||
.context("parse start LSN from START_REPLICATION command")??;
|
||||
Ok(SafekeeperPostgresCommand::StartReplication { start_lsn })
|
||||
} else if cmd.starts_with("IDENTIFY_SYSTEM") {
|
||||
Ok(SafekeeperPostgresCommand::IdentifySystem)
|
||||
} else if cmd.starts_with("TIMELINE_STATUS") {
|
||||
@@ -225,8 +218,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
.instrument(info_span!("WAL receiver", ttid = %span_ttid))
|
||||
.await
|
||||
}
|
||||
SafekeeperPostgresCommand::StartReplication { start_lsn, term } => {
|
||||
self.handle_start_replication(pgb, start_lsn, term)
|
||||
SafekeeperPostgresCommand::StartReplication { start_lsn } => {
|
||||
self.handle_start_replication(pgb, start_lsn)
|
||||
.instrument(info_span!("WAL sender", ttid = %span_ttid))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
//! with the "START_REPLICATION" message, and registry of walsenders.
|
||||
|
||||
use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::safekeeper::Term;
|
||||
use crate::timeline::Timeline;
|
||||
use crate::wal_service::ConnectionId;
|
||||
use crate::wal_storage::WalReader;
|
||||
@@ -360,12 +359,8 @@ impl SafekeeperPostgresHandler {
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
start_pos: Lsn,
|
||||
term: Option<Term>,
|
||||
) -> Result<(), QueryError> {
|
||||
if let Err(end) = self
|
||||
.handle_start_replication_guts(pgb, start_pos, term)
|
||||
.await
|
||||
{
|
||||
if let Err(end) = self.handle_start_replication_guts(pgb, start_pos).await {
|
||||
// Log the result and probably send it to the client, closing the stream.
|
||||
pgb.handle_copy_stream_end(end).await;
|
||||
}
|
||||
@@ -376,7 +371,6 @@ impl SafekeeperPostgresHandler {
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
start_pos: Lsn,
|
||||
term: Option<Term>,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let appname = self.appname.clone();
|
||||
let tli =
|
||||
@@ -446,7 +440,6 @@ impl SafekeeperPostgresHandler {
|
||||
start_pos,
|
||||
end_pos,
|
||||
stop_pos,
|
||||
term,
|
||||
commit_lsn_watch_rx,
|
||||
ws_guard: ws_guard.clone(),
|
||||
wal_reader,
|
||||
@@ -483,10 +476,6 @@ struct WalSender<'a, IO> {
|
||||
// If present, terminate after reaching this position; used by walproposer
|
||||
// in recovery.
|
||||
stop_pos: Option<Lsn>,
|
||||
/// When streaming uncommitted part, the term the client acts as the leader
|
||||
/// in. Streaming is stopped if local term changes to a different (higher)
|
||||
/// value.
|
||||
term: Option<Term>,
|
||||
commit_lsn_watch_rx: Receiver<Lsn>,
|
||||
ws_guard: Arc<WalSenderGuard>,
|
||||
wal_reader: WalReader,
|
||||
@@ -529,18 +518,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
.0 as usize;
|
||||
send_size = min(send_size, self.send_buf.len());
|
||||
let send_buf = &mut self.send_buf[..send_size];
|
||||
let send_size: usize;
|
||||
{
|
||||
// If uncommitted part is being pulled, check that the term is
|
||||
// still the expected one.
|
||||
let _term_guard = if let Some(t) = self.term {
|
||||
Some(self.tli.acquire_term(t).await?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
// read wal into buffer
|
||||
send_size = self.wal_reader.read(send_buf).await?
|
||||
};
|
||||
// read wal into buffer
|
||||
send_size = self.wal_reader.read(send_buf).await?;
|
||||
let send_buf = &send_buf[..send_size];
|
||||
|
||||
// and send it
|
||||
|
||||
@@ -499,19 +499,6 @@ impl Timeline {
|
||||
false
|
||||
}
|
||||
|
||||
/// Ensure taht current term is t, erroring otherwise, and lock the state.
|
||||
pub async fn acquire_term(&self, t: Term) -> Result<MutexGuard<SharedState>> {
|
||||
let ss = self.write_shared_state().await;
|
||||
if ss.sk.state.acceptor_state.term != t {
|
||||
bail!(
|
||||
"failed to acquire term {}, current term {}",
|
||||
t,
|
||||
ss.sk.state.acceptor_state.term
|
||||
);
|
||||
}
|
||||
Ok(ss)
|
||||
}
|
||||
|
||||
/// Returns whether s3 offloading is required and sets current status as
|
||||
/// matching it.
|
||||
pub async fn wal_backup_attend(&self) -> bool {
|
||||
|
||||
@@ -32,7 +32,6 @@ import requests
|
||||
from _pytest.config import Config
|
||||
from _pytest.config.argparsing import Parser
|
||||
from _pytest.fixtures import FixtureRequest
|
||||
from mypy_boto3_s3 import S3Client
|
||||
|
||||
# Type-related stuff
|
||||
from psycopg2.extensions import connection as PgConnection
|
||||
@@ -433,7 +432,7 @@ class NeonEnvBuilder:
|
||||
self.port_distributor = port_distributor
|
||||
self.remote_storage = remote_storage
|
||||
self.ext_remote_storage: Optional[S3Storage] = None
|
||||
self.remote_storage_client: Optional[S3Client] = None
|
||||
self.remote_storage_client: Optional[Any] = None
|
||||
self.remote_storage_users = remote_storage_users
|
||||
self.broker = broker
|
||||
self.run_id = run_id
|
||||
@@ -876,14 +875,7 @@ class NeonEnv:
|
||||
|
||||
def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
|
||||
"""Get a timeline directory's path based on the repo directory of the test environment"""
|
||||
return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id)
|
||||
|
||||
def tenant_dir(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
) -> Path:
|
||||
"""Get a tenant directory's path based on the repo directory of the test environment"""
|
||||
return self.repo_dir / "tenants" / str(tenant_id)
|
||||
return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
|
||||
|
||||
def get_pageserver_version(self) -> str:
|
||||
bin_pageserver = str(self.neon_binpath / "pageserver")
|
||||
@@ -1526,8 +1518,6 @@ class NeonPageserver(PgProtocol):
|
||||
# Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
|
||||
".*Error processing HTTP request: NotFound: Timeline .* was not found",
|
||||
".*took more than expected to complete.*",
|
||||
# these can happen during shutdown, but it should not be a reason to fail a test
|
||||
".*completed, took longer than expected.*",
|
||||
]
|
||||
|
||||
def start(
|
||||
@@ -2827,15 +2817,8 @@ def check_restored_datadir_content(
|
||||
endpoint: Endpoint,
|
||||
):
|
||||
# Get the timeline ID. We need it for the 'basebackup' command
|
||||
timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
|
||||
timeline = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
|
||||
|
||||
# many tests already checkpoint, but do it just in case
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("CHECKPOINT")
|
||||
|
||||
# wait for pageserver to catch up
|
||||
wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
|
||||
# stop postgres to ensure that files won't change
|
||||
endpoint.stop()
|
||||
|
||||
@@ -2850,7 +2833,7 @@ def check_restored_datadir_content(
|
||||
{psql_path} \
|
||||
--no-psqlrc \
|
||||
postgres://localhost:{env.pageserver.service_port.pg} \
|
||||
-c 'basebackup {endpoint.tenant_id} {timeline_id}' \
|
||||
-c 'basebackup {endpoint.tenant_id} {timeline}' \
|
||||
| tar -x -C {restored_dir_path}
|
||||
"""
|
||||
|
||||
|
||||
@@ -210,10 +210,6 @@ class PageserverHttpClient(requests.Session):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_delete(self, tenant_id: TenantId):
|
||||
res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_load(self, tenant_id: TenantId):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
|
||||
self.verbose_error(res)
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
import time
|
||||
from typing import TYPE_CHECKING, Any, Dict, Optional
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
|
||||
from fixtures.remote_storage import RemoteStorageKind, S3Storage
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
|
||||
def assert_tenant_state(
|
||||
@@ -19,6 +17,15 @@ def assert_tenant_state(
|
||||
assert tenant_status["state"]["slug"] == expected_state, message or tenant_status
|
||||
|
||||
|
||||
def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
|
||||
tenants = pageserver_http.tenant_list()
|
||||
matching = [t for t in tenants if TenantId(t["id"]) == tenant_id]
|
||||
assert len(matching) < 2
|
||||
if len(matching) == 0:
|
||||
return None
|
||||
return matching[0]
|
||||
|
||||
|
||||
def remote_consistent_lsn(
|
||||
pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
|
||||
) -> Lsn:
|
||||
@@ -192,19 +199,20 @@ def wait_timeline_detail_404(
|
||||
timeline_id: TimelineId,
|
||||
iterations: int,
|
||||
):
|
||||
def timeline_is_missing():
|
||||
data = {}
|
||||
last_exc = None
|
||||
for _ in range(iterations):
|
||||
time.sleep(0.250)
|
||||
try:
|
||||
data = pageserver_http.timeline_detail(tenant_id, timeline_id)
|
||||
log.info(f"timeline detail {data}")
|
||||
log.info(f"detail {data}")
|
||||
except PageserverApiException as e:
|
||||
log.debug(e)
|
||||
if e.status_code == 404:
|
||||
return
|
||||
|
||||
raise RuntimeError(f"Timeline exists state {data.get('state')}")
|
||||
last_exc = e
|
||||
|
||||
wait_until(iterations, interval=0.250, func=timeline_is_missing)
|
||||
raise last_exc or RuntimeError(f"Timeline wasnt deleted in time, state: {data['state']}")
|
||||
|
||||
|
||||
def timeline_delete_wait_completed(
|
||||
@@ -216,72 +224,3 @@ def timeline_delete_wait_completed(
|
||||
):
|
||||
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
|
||||
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# TODO avoid by combining remote storage related stuff in single type
|
||||
# and just passing in this type instead of whole builder
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
|
||||
def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
|
||||
# For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
|
||||
assert neon_env_builder.remote_storage_kind in (
|
||||
RemoteStorageKind.MOCK_S3,
|
||||
RemoteStorageKind.REAL_S3,
|
||||
)
|
||||
# For mypy
|
||||
assert isinstance(neon_env_builder.remote_storage, S3Storage)
|
||||
assert neon_env_builder.remote_storage_client is not None
|
||||
|
||||
# Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
|
||||
response = neon_env_builder.remote_storage_client.list_objects_v2(
|
||||
Bucket=neon_env_builder.remote_storage.bucket_name,
|
||||
Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
|
||||
)
|
||||
objects = response.get("Contents")
|
||||
assert (
|
||||
response["KeyCount"] == 0
|
||||
), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
|
||||
|
||||
|
||||
def wait_tenant_status_404(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
iterations: int,
|
||||
interval: float = 0.250,
|
||||
):
|
||||
def tenant_is_missing():
|
||||
data = {}
|
||||
try:
|
||||
data = pageserver_http.tenant_status(tenant_id)
|
||||
log.info(f"tenant status {data}")
|
||||
except PageserverApiException as e:
|
||||
log.debug(e)
|
||||
if e.status_code == 404:
|
||||
return
|
||||
|
||||
raise RuntimeError(f"Timeline exists state {data.get('state')}")
|
||||
|
||||
wait_until(iterations, interval=interval, func=tenant_is_missing)
|
||||
|
||||
|
||||
def tenant_delete_wait_completed(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
iterations: int,
|
||||
):
|
||||
pageserver_http.tenant_delete(tenant_id=tenant_id)
|
||||
wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)
|
||||
|
||||
|
||||
MANY_SMALL_LAYERS_TENANT_CONFIG = {
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
"checkpoint_distance": f"{1024**2}",
|
||||
"image_creation_threshold": "100",
|
||||
}
|
||||
|
||||
|
||||
def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
|
||||
return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 6
|
||||
|
||||
@@ -6,16 +6,13 @@ import subprocess
|
||||
import tarfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, TypeVar
|
||||
from typing import Any, Callable, Dict, List, Tuple, TypeVar
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import allure
|
||||
from psycopg2.extensions import cursor
|
||||
|
||||
from fixtures.log_helper import log
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import PgBin
|
||||
from fixtures.types import TimelineId
|
||||
|
||||
Fn = TypeVar("Fn", bound=Callable[..., Any])
|
||||
@@ -303,13 +300,17 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn):
|
||||
raise Exception("timed out while waiting for %s" % func) from last_exception
|
||||
|
||||
|
||||
def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
|
||||
def wait_while(number_of_iterations: int, interval: float, func):
|
||||
"""
|
||||
Fast way to populate data.
|
||||
For more layers consider combining with these tenant settings:
|
||||
{
|
||||
"checkpoint_distance": 1024 ** 2,
|
||||
"image_creation_threshold": 100,
|
||||
}
|
||||
Wait until 'func' returns false, or throws an exception.
|
||||
"""
|
||||
pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", connstr])
|
||||
for i in range(number_of_iterations):
|
||||
try:
|
||||
if not func():
|
||||
return
|
||||
log.info("waiting for %s iteration %s failed", func, i + 1)
|
||||
time.sleep(interval)
|
||||
continue
|
||||
except Exception:
|
||||
return
|
||||
raise Exception("timed out while waiting for %s" % func)
|
||||
|
||||
@@ -82,7 +82,6 @@ def upload_files(env):
|
||||
|
||||
# Test downloading remote extension.
|
||||
@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
|
||||
@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
|
||||
def test_remote_extensions(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
@@ -111,7 +110,7 @@ def test_remote_extensions(
|
||||
"test_remote_extensions",
|
||||
tenant_id=tenant_id,
|
||||
remote_ext_config=env.ext_remote_storage.to_string(),
|
||||
# config_lines=["log_min_messages=debug3"],
|
||||
config_lines=["shared_preload_libraries='neon,ololo'"],
|
||||
)
|
||||
try:
|
||||
with closing(endpoint.connect()) as conn:
|
||||
@@ -122,206 +121,203 @@ def test_remote_extensions(
|
||||
log.info(all_extensions)
|
||||
assert "anon" in all_extensions
|
||||
|
||||
# postgis is on real s3 but not mock s3.
|
||||
# it's kind of a big file, would rather not upload to github
|
||||
if remote_storage_kind == RemoteStorageKind.REAL_S3:
|
||||
assert "postgis" in all_extensions
|
||||
# this may fail locally if dependency is missing
|
||||
# we don't really care about the error,
|
||||
# we just want to make sure it downloaded
|
||||
try:
|
||||
cur.execute("CREATE EXTENSION postgis")
|
||||
except Exception as err:
|
||||
log.info(f"(expected) error creating postgis extension: {err}")
|
||||
# we do not check the error, so this is basically a NO-OP
|
||||
# however checking the log you can make sure that it worked
|
||||
# and also get valuable information about how long loading the extension took
|
||||
# # postgis is on real s3 but not mock s3.
|
||||
# # it's kind of a big file, would rather not upload to github
|
||||
# if remote_storage_kind == RemoteStorageKind.REAL_S3:
|
||||
# assert "postgis" in all_extensions
|
||||
# # this may fail locally if dependency is missing
|
||||
# # we don't really care about the error,
|
||||
# # we just want to make sure it downloaded
|
||||
# try:
|
||||
# cur.execute("CREATE EXTENSION postgis")
|
||||
# except Exception as err:
|
||||
# log.info(f"(expected) error creating postgis extension: {err}")
|
||||
# # we do not check the error, so this is basically a NO-OP
|
||||
# # however checking the log you can make sure that it worked
|
||||
# # and also get valuable information about how long loading the extension took
|
||||
|
||||
# this is expected to fail on my computer because I don't have the pgcrypto extension
|
||||
try:
|
||||
cur.execute("CREATE EXTENSION anon")
|
||||
except Exception as err:
|
||||
log.info("error creating anon extension")
|
||||
assert "pgcrypto" in str(err), "unexpected error creating anon extension"
|
||||
# # this is expected to fail on my computer because I don't have the pgcrypto extension
|
||||
# try:
|
||||
# cur.execute("CREATE EXTENSION anon")
|
||||
# except Exception as err:
|
||||
# log.info("error creating anon extension")
|
||||
# assert "pgcrypto" in str(err), "unexpected error creating anon extension"
|
||||
finally:
|
||||
cleanup(pg_version)
|
||||
|
||||
|
||||
# Test downloading remote library.
|
||||
@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
|
||||
@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
|
||||
def test_remote_library(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
pg_version: PgVersion,
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_remote_library",
|
||||
enable_remote_extensions=True,
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
env.neon_cli.create_timeline("test_remote_library", tenant_id=tenant_id)
|
||||
# # Test downloading remote library.
|
||||
# @pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
|
||||
# def test_remote_library(
|
||||
# neon_env_builder: NeonEnvBuilder,
|
||||
# remote_storage_kind: RemoteStorageKind,
|
||||
# pg_version: PgVersion,
|
||||
# ):
|
||||
# neon_env_builder.enable_remote_storage(
|
||||
# remote_storage_kind=remote_storage_kind,
|
||||
# test_name="test_remote_library",
|
||||
# enable_remote_extensions=True,
|
||||
# )
|
||||
# env = neon_env_builder.init_start()
|
||||
# tenant_id, _ = env.neon_cli.create_tenant()
|
||||
# env.neon_cli.create_timeline("test_remote_library", tenant_id=tenant_id)
|
||||
|
||||
assert env.ext_remote_storage is not None # satisfy mypy
|
||||
assert env.remote_storage_client is not None # satisfy mypy
|
||||
# assert env.ext_remote_storage is not None # satisfy mypy
|
||||
# assert env.remote_storage_client is not None # satisfy mypy
|
||||
|
||||
# For MOCK_S3 we upload test files.
|
||||
# For REAL_S3 we use the files already in the bucket
|
||||
if remote_storage_kind == RemoteStorageKind.MOCK_S3:
|
||||
upload_files(env)
|
||||
# # For MOCK_S3 we upload test files.
|
||||
# # For REAL_S3 we use the files already in the bucket
|
||||
# if remote_storage_kind == RemoteStorageKind.MOCK_S3:
|
||||
# upload_files(env)
|
||||
|
||||
# and use them to run LOAD library
|
||||
endpoint = env.endpoints.create_start(
|
||||
"test_remote_library",
|
||||
tenant_id=tenant_id,
|
||||
remote_ext_config=env.ext_remote_storage.to_string(),
|
||||
# config_lines=["log_min_messages=debug3"],
|
||||
)
|
||||
try:
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
# try to load library
|
||||
try:
|
||||
cur.execute("LOAD 'anon'")
|
||||
except Exception as err:
|
||||
log.info(f"error loading anon library: {err}")
|
||||
raise AssertionError("unexpected error loading anon library") from err
|
||||
# # and use them to run LOAD library
|
||||
# endpoint = env.endpoints.create_start(
|
||||
# "test_remote_library",
|
||||
# tenant_id=tenant_id,
|
||||
# remote_ext_config=env.ext_remote_storage.to_string(),
|
||||
# # config_lines=["log_min_messages=debug3"],
|
||||
# )
|
||||
# try:
|
||||
# with closing(endpoint.connect()) as conn:
|
||||
# with conn.cursor() as cur:
|
||||
# # try to load library
|
||||
# try:
|
||||
# cur.execute("LOAD 'anon'")
|
||||
# except Exception as err:
|
||||
# log.info(f"error loading anon library: {err}")
|
||||
# raise AssertionError("unexpected error loading anon library") from err
|
||||
|
||||
# test library which name is different from extension name
|
||||
# this may fail locally if dependency is missing
|
||||
# however, it does successfully download the postgis archive
|
||||
if remote_storage_kind == RemoteStorageKind.REAL_S3:
|
||||
try:
|
||||
cur.execute("LOAD 'postgis_topology-3'")
|
||||
except Exception as err:
|
||||
log.info("error loading postgis_topology-3")
|
||||
assert "No such file or directory" in str(
|
||||
err
|
||||
), "unexpected error loading postgis_topology-3"
|
||||
finally:
|
||||
cleanup(pg_version)
|
||||
# # test library which name is different from extension name
|
||||
# # this may fail locally if dependency is missing
|
||||
# # however, it does successfully download the postgis archive
|
||||
# if remote_storage_kind == RemoteStorageKind.REAL_S3:
|
||||
# try:
|
||||
# cur.execute("LOAD 'postgis_topology-3'")
|
||||
# except Exception as err:
|
||||
# log.info("error loading postgis_topology-3")
|
||||
# assert "No such file or directory" in str(
|
||||
# err
|
||||
# ), "unexpected error loading postgis_topology-3"
|
||||
# finally:
|
||||
# cleanup(pg_version)
|
||||
|
||||
|
||||
# Here we test a complex extension
|
||||
# which has multiple extensions in one archive
|
||||
# using postgis as an example
|
||||
# # Here we test a complex extension
|
||||
# # which has multiple extensions in one archive
|
||||
# # using postgis as an example
|
||||
# @pytest.mark.skipif(
|
||||
# RemoteStorageKind.REAL_S3 not in available_s3_storages(),
|
||||
# reason="skipping test because real s3 not enabled",
|
||||
# RemoteStorageKind.REAL_S3 not in available_s3_storages(),
|
||||
# reason="skipping test because real s3 not enabled",
|
||||
# )
|
||||
@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
|
||||
def test_multiple_extensions_one_archive(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
pg_version: PgVersion,
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.REAL_S3,
|
||||
test_name="test_multiple_extensions_one_archive",
|
||||
enable_remote_extensions=True,
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
env.neon_cli.create_timeline("test_multiple_extensions_one_archive", tenant_id=tenant_id)
|
||||
# def test_multiple_extensions_one_archive(
|
||||
# neon_env_builder: NeonEnvBuilder,
|
||||
# pg_version: PgVersion,
|
||||
# ):
|
||||
# neon_env_builder.enable_remote_storage(
|
||||
# remote_storage_kind=RemoteStorageKind.REAL_S3,
|
||||
# test_name="test_multiple_extensions_one_archive",
|
||||
# enable_remote_extensions=True,
|
||||
# )
|
||||
# env = neon_env_builder.init_start()
|
||||
# tenant_id, _ = env.neon_cli.create_tenant()
|
||||
# env.neon_cli.create_timeline("test_multiple_extensions_one_archive", tenant_id=tenant_id)
|
||||
|
||||
assert env.ext_remote_storage is not None # satisfy mypy
|
||||
assert env.remote_storage_client is not None # satisfy mypy
|
||||
# assert env.ext_remote_storage is not None # satisfy mypy
|
||||
# assert env.remote_storage_client is not None # satisfy mypy
|
||||
|
||||
endpoint = env.endpoints.create_start(
|
||||
"test_multiple_extensions_one_archive",
|
||||
tenant_id=tenant_id,
|
||||
remote_ext_config=env.ext_remote_storage.to_string(),
|
||||
)
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("CREATE EXTENSION address_standardizer;")
|
||||
cur.execute("CREATE EXTENSION address_standardizer_data_us;")
|
||||
# execute query to ensure that it works
|
||||
cur.execute(
|
||||
"SELECT house_num, name, suftype, city, country, state, unit \
|
||||
FROM standardize_address('us_lex', 'us_gaz', 'us_rules', \
|
||||
'One Rust Place, Boston, MA 02109');"
|
||||
)
|
||||
res = cur.fetchall()
|
||||
log.info(res)
|
||||
assert len(res) > 0
|
||||
# endpoint = env.endpoints.create_start(
|
||||
# "test_multiple_extensions_one_archive",
|
||||
# tenant_id=tenant_id,
|
||||
# remote_ext_config=env.ext_remote_storage.to_string(),
|
||||
# )
|
||||
# with closing(endpoint.connect()) as conn:
|
||||
# with conn.cursor() as cur:
|
||||
# cur.execute("CREATE EXTENSION address_standardizer;")
|
||||
# cur.execute("CREATE EXTENSION address_standardizer_data_us;")
|
||||
# # execute query to ensure that it works
|
||||
# cur.execute(
|
||||
# "SELECT house_num, name, suftype, city, country, state, unit \
|
||||
# FROM standardize_address('us_lex', 'us_gaz', 'us_rules', \
|
||||
# 'One Rust Place, Boston, MA 02109');"
|
||||
# )
|
||||
# res = cur.fetchall()
|
||||
# log.info(res)
|
||||
# assert len(res) > 0
|
||||
|
||||
cleanup(pg_version)
|
||||
# cleanup(pg_version)
|
||||
|
||||
|
||||
# Test that extension is downloaded after endpoint restart,
|
||||
# when the library is used in the query.
|
||||
#
|
||||
# Run the test with mutliple simultaneous connections to an endpoint.
|
||||
# to ensure that the extension is downloaded only once.
|
||||
#
|
||||
@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
|
||||
def test_extension_download_after_restart(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
pg_version: PgVersion,
|
||||
):
|
||||
if "15" in pg_version: # SKIP v15 for now because test set only has extension built for v14
|
||||
return None
|
||||
# # Test that extension is downloaded after endpoint restart,
|
||||
# # when the library is used in the query.
|
||||
# #
|
||||
# # Run the test with mutliple simultaneous connections to an endpoint.
|
||||
# # to ensure that the extension is downloaded only once.
|
||||
# #
|
||||
# def test_extension_download_after_restart(
|
||||
# neon_env_builder: NeonEnvBuilder,
|
||||
# pg_version: PgVersion,
|
||||
# ):
|
||||
# if "15" in pg_version: # SKIP v15 for now because test set only has extension built for v14
|
||||
# return None
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||
test_name="test_extension_download_after_restart",
|
||||
enable_remote_extensions=True,
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
env.neon_cli.create_timeline("test_extension_download_after_restart", tenant_id=tenant_id)
|
||||
# neon_env_builder.enable_remote_storage(
|
||||
# remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||
# test_name="test_extension_download_after_restart",
|
||||
# enable_remote_extensions=True,
|
||||
# )
|
||||
# env = neon_env_builder.init_start()
|
||||
# tenant_id, _ = env.neon_cli.create_tenant()
|
||||
# env.neon_cli.create_timeline("test_extension_download_after_restart", tenant_id=tenant_id)
|
||||
|
||||
assert env.ext_remote_storage is not None # satisfy mypy
|
||||
assert env.remote_storage_client is not None # satisfy mypy
|
||||
# assert env.ext_remote_storage is not None # satisfy mypy
|
||||
# assert env.remote_storage_client is not None # satisfy mypy
|
||||
|
||||
# For MOCK_S3 we upload test files.
|
||||
upload_files(env)
|
||||
# # For MOCK_S3 we upload test files.
|
||||
# upload_files(env)
|
||||
|
||||
endpoint = env.endpoints.create_start(
|
||||
"test_extension_download_after_restart",
|
||||
tenant_id=tenant_id,
|
||||
remote_ext_config=env.ext_remote_storage.to_string(),
|
||||
config_lines=["log_min_messages=debug3"],
|
||||
)
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("CREATE extension pg_buffercache;")
|
||||
cur.execute("SELECT * from pg_buffercache;")
|
||||
res = cur.fetchall()
|
||||
assert len(res) > 0
|
||||
log.info(res)
|
||||
# endpoint = env.endpoints.create_start(
|
||||
# "test_extension_download_after_restart",
|
||||
# tenant_id=tenant_id,
|
||||
# remote_ext_config=env.ext_remote_storage.to_string(),
|
||||
# config_lines=["log_min_messages=debug3"],
|
||||
# )
|
||||
# with closing(endpoint.connect()) as conn:
|
||||
# with conn.cursor() as cur:
|
||||
# cur.execute("CREATE extension pg_buffercache;")
|
||||
# cur.execute("SELECT * from pg_buffercache;")
|
||||
# res = cur.fetchall()
|
||||
# assert len(res) > 0
|
||||
# log.info(res)
|
||||
|
||||
# shutdown compute node
|
||||
endpoint.stop()
|
||||
# remove extension files locally
|
||||
cleanup(pg_version)
|
||||
# # shutdown compute node
|
||||
# endpoint.stop()
|
||||
# # remove extension files locally
|
||||
# cleanup(pg_version)
|
||||
|
||||
# spin up compute node again (there are no extension files available, because compute is stateless)
|
||||
endpoint = env.endpoints.create_start(
|
||||
"test_extension_download_after_restart",
|
||||
tenant_id=tenant_id,
|
||||
remote_ext_config=env.ext_remote_storage.to_string(),
|
||||
config_lines=["log_min_messages=debug3"],
|
||||
)
|
||||
# # spin up compute node again (there are no extension files available, because compute is stateless)
|
||||
# endpoint = env.endpoints.create_start(
|
||||
# "test_extension_download_after_restart",
|
||||
# tenant_id=tenant_id,
|
||||
# remote_ext_config=env.ext_remote_storage.to_string(),
|
||||
# config_lines=["log_min_messages=debug3"],
|
||||
# )
|
||||
|
||||
# connect to compute node and run the query
|
||||
# that will trigger the download of the extension
|
||||
def run_query(endpoint, thread_id: int):
|
||||
log.info("thread_id {%d} starting", thread_id)
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SELECT * from pg_buffercache;")
|
||||
res = cur.fetchall()
|
||||
assert len(res) > 0
|
||||
log.info("thread_id {%d}, res = %s", thread_id, res)
|
||||
# # connect to compute node and run the query
|
||||
# # that will trigger the download of the extension
|
||||
# def run_query(endpoint, thread_id: int):
|
||||
# log.info("thread_id {%d} starting", thread_id)
|
||||
# with closing(endpoint.connect()) as conn:
|
||||
# with conn.cursor() as cur:
|
||||
# cur.execute("SELECT * from pg_buffercache;")
|
||||
# res = cur.fetchall()
|
||||
# assert len(res) > 0
|
||||
# log.info("thread_id {%d}, res = %s", thread_id, res)
|
||||
|
||||
threads = [threading.Thread(target=run_query, args=(endpoint, i)) for i in range(2)]
|
||||
# threads = [threading.Thread(target=run_query, args=(endpoint, i)) for i in range(2)]
|
||||
|
||||
for thread in threads:
|
||||
thread.start()
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
# for thread in threads:
|
||||
# thread.start()
|
||||
# for thread in threads:
|
||||
# thread.join()
|
||||
|
||||
cleanup(pg_version)
|
||||
# cleanup(pg_version)
|
||||
|
||||
@@ -56,6 +56,10 @@ def test_pg_regress(
|
||||
with capsys.disabled():
|
||||
pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
|
||||
|
||||
# checkpoint one more time to ensure that the lsn we get is the latest one
|
||||
endpoint.safe_psql("CHECKPOINT")
|
||||
|
||||
# Check that we restore the content of the datadir correctly
|
||||
check_restored_datadir_content(test_output_dir, env, endpoint)
|
||||
|
||||
|
||||
@@ -162,4 +166,9 @@ def test_sql_regress(
|
||||
with capsys.disabled():
|
||||
pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
|
||||
|
||||
# checkpoint one more time to ensure that the lsn we get is the latest one
|
||||
endpoint.safe_psql("CHECKPOINT")
|
||||
endpoint.safe_psql("select pg_current_wal_insert_lsn()")[0][0]
|
||||
|
||||
# Check that we restore the content of the datadir correctly
|
||||
check_restored_datadir_content(test_output_dir, env, endpoint)
|
||||
|
||||
@@ -97,11 +97,6 @@ def test_remote_storage_backup_and_restore(
|
||||
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
|
||||
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
|
||||
|
||||
# Thats because of UnreliableWrapper's injected failures
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*"
|
||||
)
|
||||
|
||||
checkpoint_numbers = range(1, 3)
|
||||
|
||||
for checkpoint_number in checkpoint_numbers:
|
||||
|
||||
@@ -33,4 +33,8 @@ def test_subxacts(neon_simple_env: NeonEnv, test_output_dir):
|
||||
cur.execute(f"insert into t1 values ({i}, {j})")
|
||||
cur.execute("commit")
|
||||
|
||||
# force wal flush
|
||||
cur.execute("checkpoint")
|
||||
|
||||
# Check that we can restore the content of the datadir correctly
|
||||
check_restored_datadir_content(test_output_dir, env, endpoint)
|
||||
|
||||
@@ -1,250 +0,0 @@
|
||||
import enum
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
PgBin,
|
||||
last_flush_lsn_upload,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverApiException
|
||||
from fixtures.pageserver.utils import (
|
||||
MANY_SMALL_LAYERS_TENANT_CONFIG,
|
||||
assert_prefix_empty,
|
||||
poll_for_remote_storage_iterations,
|
||||
tenant_delete_wait_completed,
|
||||
wait_tenant_status_404,
|
||||
wait_until_tenant_active,
|
||||
wait_until_tenant_state,
|
||||
)
|
||||
from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
|
||||
from fixtures.types import TenantId
|
||||
from fixtures.utils import run_pg_bench_small
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"remote_storage_kind", [RemoteStorageKind.NOOP, *available_remote_storages()]
|
||||
)
|
||||
def test_tenant_delete_smoke(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_tenant_delete_smoke",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
# first try to delete non existing tenant
|
||||
tenant_id = TenantId.generate()
|
||||
env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
|
||||
with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"):
|
||||
ps_http.tenant_delete(tenant_id=tenant_id)
|
||||
|
||||
env.neon_cli.create_tenant(
|
||||
tenant_id=tenant_id,
|
||||
conf=MANY_SMALL_LAYERS_TENANT_CONFIG,
|
||||
)
|
||||
|
||||
# create two timelines one being the parent of another
|
||||
parent = None
|
||||
for timeline in ["first", "second"]:
|
||||
timeline_id = env.neon_cli.create_branch(
|
||||
timeline, tenant_id=tenant_id, ancestor_branch_name=parent
|
||||
)
|
||||
with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
|
||||
run_pg_bench_small(pg_bin, endpoint.connstr())
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
|
||||
|
||||
parent = timeline
|
||||
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
|
||||
|
||||
tenant_path = env.tenant_dir(tenant_id=tenant_id)
|
||||
assert not tenant_path.exists()
|
||||
|
||||
if remote_storage_kind in [RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3]:
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(tenant_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
class Check(enum.Enum):
|
||||
RETRY_WITHOUT_RESTART = enum.auto()
|
||||
RETRY_WITH_RESTART = enum.auto()
|
||||
|
||||
|
||||
FAILPOINTS = [
|
||||
"tenant-delete-before-shutdown",
|
||||
"tenant-delete-before-create-remote-mark",
|
||||
"tenant-delete-before-create-local-mark",
|
||||
"tenant-delete-before-background",
|
||||
"tenant-delete-before-polling-ongoing-deletions",
|
||||
"tenant-delete-before-cleanup-remaining-fs-traces",
|
||||
"tenant-delete-before-remove-timelines-dir",
|
||||
"tenant-delete-before-remove-deleted-mark",
|
||||
"tenant-delete-before-remove-tenant-dir",
|
||||
# Some failpoints from timeline deletion
|
||||
"timeline-delete-before-index-deleted-at",
|
||||
"timeline-delete-before-rm",
|
||||
"timeline-delete-before-index-delete",
|
||||
"timeline-delete-after-rm-dir",
|
||||
]
|
||||
|
||||
FAILPOINTS_BEFORE_BACKGROUND = [
|
||||
"timeline-delete-before-schedule",
|
||||
"tenant-delete-before-shutdown",
|
||||
"tenant-delete-before-create-remote-mark",
|
||||
"tenant-delete-before-create-local-mark",
|
||||
"tenant-delete-before-background",
|
||||
]
|
||||
|
||||
|
||||
def combinations():
|
||||
result = []
|
||||
|
||||
remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
|
||||
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
|
||||
remotes.append(RemoteStorageKind.REAL_S3)
|
||||
|
||||
for remote_storage_kind in remotes:
|
||||
for delete_failpoint in FAILPOINTS:
|
||||
if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
|
||||
"timeline-delete-before-index-delete",
|
||||
):
|
||||
# the above failpoint are not relevant for config without remote storage
|
||||
continue
|
||||
|
||||
result.append((remote_storage_kind, delete_failpoint))
|
||||
return result
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations())
|
||||
@pytest.mark.parametrize("check", list(Check))
|
||||
def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
failpoint: str,
|
||||
check: Check,
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints"
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# From deletion polling
|
||||
f".*NotFound: tenant {env.initial_tenant}.*",
|
||||
# allow errors caused by failpoints
|
||||
f".*failpoint: {failpoint}",
|
||||
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
|
||||
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
|
||||
# We may leave some upload tasks in the queue. They're likely deletes.
|
||||
# For uploads we explicitly wait with `last_flush_lsn_upload` below.
|
||||
# So by ignoring these instead of waiting for empty upload queue
|
||||
# we execute more distinct code paths.
|
||||
'.*stopping left-over name="remote upload".*',
|
||||
]
|
||||
)
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
|
||||
with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
|
||||
# generate enough layers
|
||||
run_pg_bench_small(pg_bin, endpoint.connstr())
|
||||
if remote_storage_kind is RemoteStorageKind.NOOP:
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
else:
|
||||
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
ps_http.configure_failpoints((failpoint, "return"))
|
||||
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
|
||||
# These failpoints are earlier than background task is spawned.
|
||||
# so they result in api request failure.
|
||||
if failpoint in FAILPOINTS_BEFORE_BACKGROUND:
|
||||
with pytest.raises(PageserverApiException, match=failpoint):
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
|
||||
else:
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
tenant_info = wait_until_tenant_state(
|
||||
pageserver_http=ps_http,
|
||||
tenant_id=tenant_id,
|
||||
expected_state="Broken",
|
||||
iterations=iterations,
|
||||
)
|
||||
|
||||
reason = tenant_info["state"]["data"]["reason"]
|
||||
log.info(f"tenant broken: {reason}")
|
||||
|
||||
# failpoint may not be the only error in the stack
|
||||
assert reason.endswith(f"failpoint: {failpoint}"), reason
|
||||
|
||||
if check is Check.RETRY_WITH_RESTART:
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
if (
|
||||
remote_storage_kind is RemoteStorageKind.NOOP
|
||||
and failpoint == "tenant-delete-before-create-local-mark"
|
||||
):
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
|
||||
elif failpoint in (
|
||||
"tenant-delete-before-shutdown",
|
||||
"tenant-delete-before-create-remote-mark",
|
||||
):
|
||||
wait_until_tenant_active(
|
||||
ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25
|
||||
)
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
|
||||
else:
|
||||
# Pageserver should've resumed deletion after restart.
|
||||
wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10)
|
||||
elif check is Check.RETRY_WITHOUT_RESTART:
|
||||
# this should succeed
|
||||
# this also checks that delete can be retried even when tenant is in Broken state
|
||||
ps_http.configure_failpoints((failpoint, "off"))
|
||||
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
|
||||
|
||||
# Check remote is impty
|
||||
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(tenant_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
tenant_dir = env.tenant_dir(tenant_id)
|
||||
# Check local is empty
|
||||
assert not tenant_dir.exists()
|
||||
|
||||
|
||||
# TODO test concurrent deletions with "hang" failpoint
|
||||
# TODO test tenant delete continues after attach
|
||||
@@ -66,10 +66,6 @@ def test_tenant_reattach(
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
|
||||
)
|
||||
# Thats because of UnreliableWrapper's injected failures
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*"
|
||||
)
|
||||
|
||||
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
|
||||
with endpoint.cursor() as cur:
|
||||
|
||||
@@ -17,9 +17,9 @@ from fixtures.neon_fixtures import (
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.utils import (
|
||||
assert_tenant_state,
|
||||
tenant_exists,
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload,
|
||||
wait_tenant_status_404,
|
||||
)
|
||||
from fixtures.port_distributor import PortDistributor
|
||||
from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
|
||||
@@ -29,6 +29,7 @@ from fixtures.utils import (
|
||||
start_in_background,
|
||||
subprocess_capture,
|
||||
wait_until,
|
||||
wait_while,
|
||||
)
|
||||
|
||||
|
||||
@@ -268,16 +269,11 @@ def test_tenant_relocation(
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209")
|
||||
|
||||
# FIXME: Is this expected?
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
|
||||
# Needed for detach polling.
|
||||
env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
|
||||
|
||||
# create folder for remote storage mock
|
||||
remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage"
|
||||
|
||||
@@ -287,7 +283,9 @@ def test_tenant_relocation(
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
_, initial_timeline_id = env.neon_cli.create_tenant(tenant_id)
|
||||
tenant_id, initial_timeline_id = env.neon_cli.create_tenant(
|
||||
TenantId("74ee8b079a0e437eb0afea7d26a07209")
|
||||
)
|
||||
log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id)
|
||||
|
||||
env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id)
|
||||
@@ -471,8 +469,11 @@ def test_tenant_relocation(
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
|
||||
# Wait a little, so that the detach operation has time to finish.
|
||||
wait_tenant_status_404(pageserver_http, tenant_id, iterations=100, interval=1)
|
||||
|
||||
wait_while(
|
||||
number_of_iterations=100,
|
||||
interval=1,
|
||||
func=lambda: tenant_exists(pageserver_http, tenant_id),
|
||||
)
|
||||
post_migration_check(ep_main, 500500, old_local_path_main)
|
||||
post_migration_check(ep_second, 1001000, old_local_path_second)
|
||||
|
||||
|
||||
@@ -146,11 +146,6 @@ def test_tenants_attached_after_download(
|
||||
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
|
||||
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
|
||||
|
||||
# Thats because of UnreliableWrapper's injected failures
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*"
|
||||
)
|
||||
|
||||
for checkpoint_number in range(1, 3):
|
||||
with endpoint.cursor() as cur:
|
||||
cur.execute(
|
||||
|
||||
@@ -4,6 +4,7 @@ import queue
|
||||
import shutil
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
@@ -17,8 +18,6 @@ from fixtures.neon_fixtures import (
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverApiException
|
||||
from fixtures.pageserver.utils import (
|
||||
assert_prefix_empty,
|
||||
poll_for_remote_storage_iterations,
|
||||
timeline_delete_wait_completed,
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload,
|
||||
@@ -28,6 +27,7 @@ from fixtures.pageserver.utils import (
|
||||
)
|
||||
from fixtures.remote_storage import (
|
||||
RemoteStorageKind,
|
||||
S3Storage,
|
||||
available_remote_storages,
|
||||
)
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
@@ -187,9 +187,10 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
8. Retry or restart without the failpoint and check the result.
|
||||
"""
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints"
|
||||
)
|
||||
if remote_storage_kind is not None:
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints"
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
@@ -230,7 +231,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
|
||||
ps_http.configure_failpoints((failpoint, "return"))
|
||||
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4
|
||||
|
||||
# These failpoints are earlier than background task is spawned.
|
||||
# so they result in api request failure.
|
||||
@@ -279,14 +280,14 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
"remote_storage_s3_request_seconds_count",
|
||||
filter={"request_type": "get_object", "result": "err"},
|
||||
).value
|
||||
== 2 # One is missing tenant deletion mark, second is missing index part
|
||||
== 1
|
||||
)
|
||||
assert (
|
||||
m.query_one(
|
||||
"remote_storage_s3_request_seconds_count",
|
||||
filter={"request_type": "get_object", "result": "ok"},
|
||||
).value
|
||||
== 1 # index part for initial timeline
|
||||
== 1
|
||||
)
|
||||
elif check is Check.RETRY_WITHOUT_RESTART:
|
||||
# this should succeed
|
||||
@@ -412,6 +413,27 @@ def test_timeline_resurrection_on_attach(
|
||||
assert all([tl["state"] == "Active" for tl in timelines])
|
||||
|
||||
|
||||
def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str] = None):
|
||||
# For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
|
||||
assert neon_env_builder.remote_storage_kind in (
|
||||
RemoteStorageKind.MOCK_S3,
|
||||
RemoteStorageKind.REAL_S3,
|
||||
)
|
||||
# For mypy
|
||||
assert isinstance(neon_env_builder.remote_storage, S3Storage)
|
||||
|
||||
# Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
|
||||
assert neon_env_builder.remote_storage_client is not None
|
||||
response = neon_env_builder.remote_storage_client.list_objects_v2(
|
||||
Bucket=neon_env_builder.remote_storage.bucket_name,
|
||||
Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
|
||||
)
|
||||
objects = response.get("Contents")
|
||||
assert (
|
||||
response["KeyCount"] == 0
|
||||
), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
|
||||
|
||||
|
||||
def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
When deleting a timeline, if we succeed in setting the deleted flag remotely
|
||||
|
||||
@@ -869,49 +869,6 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
assert debug_dump_1["config"]["id"] == env.safekeepers[0].id
|
||||
|
||||
|
||||
class DummyConsumer(object):
|
||||
def __call__(self, msg):
|
||||
pass
|
||||
|
||||
|
||||
def test_start_replication_term(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test START_REPLICATION of uncommitted part specifying leader term. It must
|
||||
error if safekeeper switched to different term.
|
||||
"""
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.neon_cli.create_branch("test_start_replication_term")
|
||||
endpoint = env.endpoints.create_start("test_start_replication_term")
|
||||
|
||||
endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
|
||||
|
||||
# learn neon timeline from compute
|
||||
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
|
||||
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
|
||||
|
||||
sk = env.safekeepers[0]
|
||||
sk_http_cli = sk.http_client()
|
||||
tli_status = sk_http_cli.timeline_status(tenant_id, timeline_id)
|
||||
timeline_start_lsn = tli_status.timeline_start_lsn
|
||||
|
||||
conn_opts = {
|
||||
"host": "127.0.0.1",
|
||||
"options": f"-c timeline_id={timeline_id} tenant_id={tenant_id}",
|
||||
"port": sk.port.pg,
|
||||
"connection_factory": psycopg2.extras.PhysicalReplicationConnection,
|
||||
}
|
||||
sk_pg_conn = psycopg2.connect(**conn_opts) # type: ignore
|
||||
with sk_pg_conn.cursor() as cur:
|
||||
# should fail, as first start has term 2
|
||||
cur.start_replication_expert(f"START_REPLICATION {timeline_start_lsn} (term='3')")
|
||||
dummy_consumer = DummyConsumer()
|
||||
with pytest.raises(psycopg2.errors.InternalError_) as excinfo:
|
||||
cur.consume_stream(dummy_consumer)
|
||||
assert "failed to acquire term 3" in str(excinfo.value)
|
||||
|
||||
|
||||
# Test auth on WAL service (postgres protocol) ports.
|
||||
def test_sk_auth(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.auth_enabled = True
|
||||
|
||||
Reference in New Issue
Block a user