Compare commits

..

1 Commits

Author SHA1 Message Date
Konstantin Knizhnik
8799a603a2 walingest createdb record both for CREATE_DATABASE_FROM_WAL and CREATE_DATABASE_FILE_COPY 2023-09-19 13:23:14 +03:00
70 changed files with 723 additions and 4291 deletions

View File

@@ -834,7 +834,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.17.11
VM_BUILDER_VERSION: v0.17.10
steps:
- name: Checkout
@@ -1091,9 +1091,8 @@ jobs:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"

55
Cargo.lock generated
View File

@@ -636,7 +636,7 @@ dependencies = [
"sha1",
"sync_wrapper",
"tokio",
"tokio-tungstenite",
"tokio-tungstenite 0.20.0",
"tower",
"tower-layer",
"tower-service",
@@ -1941,15 +1941,15 @@ dependencies = [
[[package]]
name = "hyper-tungstenite"
version = "0.11.1"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9"
checksum = "880b8b1c98a5ec2a505c7c90db6d3f6f1f480af5655d9c5b55facc9382a5a5b5"
dependencies = [
"hyper",
"pin-project-lite",
"pin-project",
"tokio",
"tokio-tungstenite",
"tungstenite",
"tokio-tungstenite 0.18.0",
"tungstenite 0.18.0",
]
[[package]]
@@ -2908,9 +2908,9 @@ dependencies = [
[[package]]
name = "pin-project-lite"
version = "0.2.13"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58"
checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
[[package]]
name = "pin-utils"
@@ -3417,7 +3417,6 @@ dependencies = [
"metrics",
"once_cell",
"pin-project-lite",
"rand",
"scopeguard",
"serde",
"serde_json",
@@ -4642,6 +4641,18 @@ dependencies = [
"xattr",
]
[[package]]
name = "tokio-tungstenite"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54319c93411147bced34cb5609a80e0a8e44c5999c93903a81cd866630ec0bfd"
dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.18.0",
]
[[package]]
name = "tokio-tungstenite"
version = "0.20.0"
@@ -4651,7 +4662,7 @@ dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite",
"tungstenite 0.20.0",
]
[[package]]
@@ -4966,9 +4977,28 @@ checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"
[[package]]
name = "tungstenite"
version = "0.20.1"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e3dac10fd62eaf6617d3a904ae222845979aec67c615d1c842b4002c7666fb9"
checksum = "30ee6ab729cd4cf0fd55218530c4522ed30b7b6081752839b68fcec8d0960788"
dependencies = [
"base64 0.13.1",
"byteorder",
"bytes",
"http",
"httparse",
"log",
"rand",
"sha1",
"thiserror",
"url",
"utf-8",
]
[[package]]
name = "tungstenite"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e862a1c4128df0112ab625f55cd5c934bcb4312ba80b39ae4b4835a3fd58e649"
dependencies = [
"byteorder",
"bytes",
@@ -5618,7 +5648,6 @@ dependencies = [
"tower",
"tracing",
"tracing-core",
"tungstenite",
"url",
"uuid",
]

View File

@@ -78,7 +78,7 @@ hostname = "0.3.1"
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.11"
hyper-tungstenite = "0.9"
inotify = "0.10.2"
itertools = "0.10"
jsonwebtoken = "8"

View File

@@ -589,7 +589,8 @@ RUN case "${PG_VERSION}" in \
echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control
#########################################################################################
#

View File

@@ -153,6 +153,18 @@ neon-pg-ext-%: postgres-%
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
# pg_embedding was temporarily released as hnsw from this repo, when we only
# supported PostgreSQL 14 and 15
neon-pg-ext-v14: neon-pg-ext-hnsw-v14
neon-pg-ext-v15: neon-pg-ext-hnsw-v15
neon-pg-ext-hnsw-%: postgres-headers-% postgres-%
+@echo "Compiling hnsw $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install
.PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
@@ -167,6 +179,9 @@ neon-pg-ext-clean-%:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean
.PHONY: neon-pg-ext
neon-pg-ext: \

View File

@@ -29,13 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
libcurl4-openssl-dev openssl python-poetry lsof libicu-dev
libcurl4-openssl-dev openssl python-poetry lsof
```
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel libcurl-devel openssl poetry lsof libicu-devel
protobuf-devel libcurl-devel openssl poetry lsof
```
* On Arch based systems, these packages are needed:
```bash

View File

@@ -223,7 +223,6 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
if attach_req.pageserver_id.is_some() {
tenant_state.generation += 1;
}
tenant_state.pageserver = attach_req.pageserver_id;
let generation = tenant_state.generation;
locked.save().await.map_err(ApiError::InternalServerError)?;

View File

@@ -363,15 +363,8 @@ pub struct TimelineInfo {
pub latest_gc_cutoff_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn,
/// The LSN that we have succesfully uploaded to remote storage
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn,
/// The LSN that we are advertizing to safekeepers
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn_visible: Lsn,
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
/// Sum of the size of all layer files.
/// If a layer is present in both local FS and S3, it counts only once.

View File

@@ -29,4 +29,3 @@ workspace_hack.workspace = true
[dev-dependencies]
tempfile.workspace = true
test-context.workspace = true
rand.workspace = true

View File

@@ -20,16 +20,11 @@ use std::{
use anyhow::{bail, Context};
use serde::{Deserialize, Serialize};
use tokio::io;
use toml_edit::Item;
use tracing::info;
pub use self::{
local_fs::LocalFs,
s3_bucket::S3Bucket,
simulate_failures::{SimulatedError, UnreliableWrapper},
};
pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper};
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
@@ -47,9 +42,6 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
/// As defined in S3 docs
pub const MAX_KEYS_PER_DELETE: usize = 1000;
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
/// Path on the remote storage, relative to some inner prefix.
@@ -58,25 +50,6 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RemotePath(PathBuf);
impl Serialize for RemotePath {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(self)
}
}
impl<'de> Deserialize<'de> for RemotePath {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let str = String::deserialize(deserializer)?;
Ok(Self(PathBuf::from(&str)))
}
}
impl std::fmt::Display for RemotePath {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0.display())
@@ -115,10 +88,6 @@ impl RemotePath {
pub fn extension(&self) -> Option<&str> {
self.0.extension()?.to_str()
}
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> {
self.0.strip_prefix(&p.0)
}
}
/// Storage (potentially remote) API to manage its state.
@@ -194,44 +163,26 @@ impl Debug for Download {
#[derive(Debug)]
pub enum DownloadError {
/// Validation or other error happened due to user input.
///
/// This is only used by LOCAL_FS.
BadInput(anyhow::Error),
/// The file was not found in the remote storage.
///
/// This can only happen during download, never during delete.
NotFound,
/// The file was found in the remote storage, but the operation failed.
///
/// The error should have context already describing the real failed operation.
/// The file was found in the remote storage, but the download failed.
Other(anyhow::Error),
}
impl std::fmt::Display for DownloadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use DownloadError::*;
match self {
NotFound => write!(f, "No file found for the remote object id given"),
// this is same as thiserror error(transparent); it handles {} and {:#}
Other(e) | BadInput(e) => std::fmt::Display::fmt(e, f),
DownloadError::BadInput(e) => {
write!(f, "Failed to download a remote file due to user input: {e}")
}
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
}
}
}
impl std::error::Error for DownloadError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
use DownloadError::*;
match self {
NotFound => None,
Other(_) | BadInput(_) => {
// TODO: these are anyhow, cannot return here
None
}
}
}
}
impl std::error::Error for DownloadError {}
/// Every storage, currently supported.
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.

View File

@@ -33,10 +33,11 @@ use tracing::debug;
use super::StorageMetadata;
use crate::{
Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
REMOTE_STORAGE_PREFIX_SEPARATOR,
Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
pub(super) mod metrics;
use self::metrics::{AttemptOutcome, RequestKind};
@@ -345,7 +346,7 @@ impl RemoteStorage for S3Bucket {
.set_max_keys(self.max_keys_per_list_response)
.send()
.await
.context("list S3 prefixes")
.context("Failed to list S3 prefixes")
.map_err(DownloadError::Other);
let started_at = ScopeGuard::into_inner(started_at);
@@ -397,7 +398,7 @@ impl RemoteStorage for S3Bucket {
.set_max_keys(self.max_keys_per_list_response)
.send()
.await
.context("list files in S3 bucket");
.context("Failed to list files in S3 bucket");
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
@@ -499,7 +500,7 @@ impl RemoteStorage for S3Bucket {
delete_objects.push(obj_id);
}
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
let started_at = start_measuring_requests(kind);
let resp = self
@@ -521,7 +522,10 @@ impl RemoteStorage for S3Bucket {
.deleted_objects_total
.inc_by(chunk.len() as u64);
if let Some(errors) = resp.errors {
return Err(anyhow::anyhow!("delete {} objects", errors.len()));
return Err(anyhow::format_err!(
"Failed to delete {} objects",
errors.len()
));
}
}
Err(e) => {

View File

@@ -18,7 +18,7 @@ pub struct UnreliableWrapper {
}
/// Used to identify retries of different unique operation.
#[derive(Hash, Eq, PartialEq)]
#[derive(Debug, Hash, Eq, PartialEq)]
enum RemoteOp {
ListPrefixes(Option<RemotePath>),
Upload(RemotePath),
@@ -27,22 +27,6 @@ enum RemoteOp {
DeleteObjects(Vec<RemotePath>),
}
impl std::fmt::Debug for RemoteOp {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use RemoteOp::*;
match self {
ListPrefixes(arg0) => f.debug_tuple("ListPrefixes").field(arg0).finish(),
Upload(arg0) => f.debug_tuple("Upload").field(arg0).finish(),
Download(arg0) => f.debug_tuple("Download").field(arg0).finish(),
Delete(arg0) => f.debug_tuple("Delete").field(arg0).finish(),
DeleteObjects(many) if many.len() > 3 => {
write!(f, "DeleteObjects({} paths)", many.len())
}
DeleteObjects(few) => f.debug_tuple("DeleteObjects").field(few).finish(),
}
}
}
impl UnreliableWrapper {
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
assert!(attempts_to_fail > 0);
@@ -75,12 +59,13 @@ impl UnreliableWrapper {
e.remove();
Ok(attempts_before_this)
} else {
let error = anyhow::anyhow!(SimulatedError::from(e.key()));
let error =
anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
Err(DownloadError::Other(error))
}
}
Entry::Vacant(e) => {
let error = anyhow::anyhow!(SimulatedError::from(e.key()));
let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
e.insert(1);
Err(DownloadError::Other(error))
}
@@ -95,26 +80,6 @@ impl UnreliableWrapper {
}
}
/// `pub` type for checking if this is the root cause around logging.
///
/// This is just a string to avoid cloning a huge number of paths a second time.
#[derive(Debug)]
pub struct SimulatedError(String);
impl<'a> From<&'a RemoteOp> for SimulatedError {
fn from(value: &'_ RemoteOp) -> Self {
SimulatedError(format!("simulated failure of remote operation {:?}", value))
}
}
impl std::fmt::Display for SimulatedError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(&self.0)
}
}
impl std::error::Error for SimulatedError {}
#[async_trait::async_trait]
impl RemoteStorage for UnreliableWrapper {
async fn list_prefixes(

View File

@@ -378,30 +378,21 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
fn create_s3_client(
max_keys_per_list_response: Option<i32>,
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
use rand::Rng;
let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
.context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
.context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
// due to how time works, we've had test runners use the same nanos as bucket prefixes.
// millis is just a debugging aid for easier finding the prefix later.
let millis = std::time::SystemTime::now()
let random_prefix_part = std::time::SystemTime::now()
.duration_since(UNIX_EPOCH)
.context("random s3 test prefix part calculation")?
.as_millis();
// because nanos can be the same for two threads so can millis, add randomness
let random = rand::thread_rng().gen::<u32>();
.as_nanos();
let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: remote_storage_s3_bucket,
bucket_region: remote_storage_s3_region,
prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response,

View File

@@ -89,22 +89,6 @@ impl Generation {
Self::Broken => panic!("Attempted to use a broken generation"),
}
}
pub fn next(&self) -> Generation {
match self {
Self::Valid(n) => Self::Valid(*n + 1),
Self::None => Self::Valid(1),
Self::Broken => panic!("Attempted to use a broken generation"),
}
}
pub fn into(self) -> Option<u32> {
if let Self::Valid(v) = self {
Some(v)
} else {
None
}
}
}
impl Serialize for Generation {

View File

@@ -24,9 +24,6 @@ pub enum ApiError {
#[error("Precondition failed: {0}")]
PreconditionFailed(Box<str>),
#[error("Shutting down")]
ShuttingDown,
#[error(transparent)]
InternalServerError(anyhow::Error),
}
@@ -55,10 +52,6 @@ impl ApiError {
self.to_string(),
StatusCode::PRECONDITION_FAILED,
),
ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
"Shutting down".to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,

View File

@@ -3,8 +3,6 @@
//! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.
use anyhow::Result;
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use std::cmp::Ordering;
use std::collections::BinaryHeap;
@@ -98,9 +96,9 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
}
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
async fn get_holes(path: &Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0, ctx).await?;
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,
@@ -127,7 +125,6 @@ async fn get_holes(path: &Path, max_holes: usize, ctx: &RequestContext) -> Resul
prev_key = Some(curr.next());
true
},
ctx,
)
.await?;
let mut holes = heap.into_vec();
@@ -138,7 +135,6 @@ async fn get_holes(path: &Path, max_holes: usize, ctx: &RequestContext) -> Resul
pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
let storage_path = &cmd.path;
let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
pageserver::virtual_file::init(10);
@@ -167,7 +163,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
parse_filename(&layer.file_name().into_string().unwrap())
{
if layer_file.is_delta {
layer_file.holes = get_holes(&layer.path(), max_holes, &ctx).await?;
layer_file.holes = get_holes(&layer.path(), max_holes).await?;
n_deltas += 1;
}
layers.push(layer_file);

View File

@@ -2,8 +2,6 @@ use std::path::{Path, PathBuf};
use anyhow::Result;
use clap::Subcommand;
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::block_io::BlockCursor;
use pageserver::tenant::disk_btree::DiskBtreeReader;
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
@@ -46,12 +44,12 @@ pub(crate) enum LayerCmd {
},
}
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
let path = path.as_ref();
virtual_file::init(10);
page_cache::init(100);
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0, ctx).await?;
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,
@@ -69,12 +67,11 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
all.push((curr, BlobRef(value_offset)));
true
},
ctx,
)
.await?;
let cursor = BlockCursor::new_fileblockreader(&file);
for (k, v) in all {
let value = cursor.read_blob(v.pos(), ctx).await?;
let value = cursor.read_blob(v.pos()).await?;
println!("key:{} value_len:{}", k, value.len());
}
// TODO(chi): special handling for last key?
@@ -82,7 +79,6 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
}
pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
match cmd {
LayerCmd::List { path } => {
for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? {
@@ -157,7 +153,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
);
if layer_file.is_delta {
read_delta_file(layer.path(), &ctx).await?;
read_delta_file(layer.path()).await?;
} else {
anyhow::bail!("not supported yet :(");
}

View File

@@ -8,7 +8,6 @@ use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
@@ -21,7 +20,6 @@ use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
context::{DownloadBehavior, RequestContext},
deletion_queue::DeletionQueue,
http, page_cache, page_service, task_mgr,
task_mgr::TaskKind,
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
@@ -348,22 +346,9 @@ fn start_pageserver(
}
};
// Top-level cancellation token for the process
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
// Set up deletion queue
let (deletion_queue, deletion_workers) = DeletionQueue::new(
remote_storage.clone(),
ControlPlaneClient::new(conf, &shutdown_pageserver),
conf,
);
if let Some(deletion_workers) = deletion_workers {
deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
}
// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.
startup_checkpoint("initial", "Starting loading tenants");
@@ -394,13 +379,13 @@ fn start_pageserver(
};
// Scan the local 'tenants/' directory and start loading the tenants
let deletion_queue_client = deletion_queue.new_client();
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
conf,
TenantSharedResources {
broker_client: broker_client.clone(),
remote_storage: remote_storage.clone(),
deletion_queue_client,
},
order,
shutdown_pageserver.clone(),
@@ -496,10 +481,9 @@ fn start_pageserver(
http::routes::State::new(
conf,
http_auth.clone(),
remote_storage.clone(),
remote_storage,
broker_client.clone(),
disk_usage_eviction_state,
deletion_queue.new_client(),
)
.context("Failed to initialize router state")?,
);
@@ -627,12 +611,7 @@ fn start_pageserver(
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
unreachable!()
}
})

View File

@@ -475,8 +475,8 @@ impl PageServerConfigBuilder {
self.background_task_maximum_delay = BuilderValue::Set(delay);
}
pub fn control_plane_api(&mut self, api: Option<Url>) {
self.control_plane_api = BuilderValue::Set(api)
pub fn control_plane_api(&mut self, api: Url) {
self.control_plane_api = BuilderValue::Set(Some(api))
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
@@ -580,27 +580,6 @@ impl PageServerConf {
self.workdir.join(TENANTS_SEGMENT_NAME)
}
pub fn deletion_prefix(&self) -> PathBuf {
self.workdir.join("deletion")
}
pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
// Encode a version in the filename, so that if we ever switch away from JSON we can
// increment this.
const VERSION: u8 = 1;
self.deletion_prefix()
.join(format!("{sequence:016x}-{VERSION:02x}.list"))
}
pub fn deletion_header_path(&self) -> PathBuf {
// Encode a version in the filename, so that if we ever switch away from JSON we can
// increment this.
const VERSION: u8 = 1;
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
}
pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
self.tenants_path().join(tenant_id.to_string())
}
@@ -768,14 +747,7 @@ impl PageServerConf {
},
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
"background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
"control_plane_api" => {
let parsed = parse_toml_string(key, item)?;
if parsed.is_empty() {
builder.control_plane_api(None)
} else {
builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
}
},
"control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}

View File

@@ -94,18 +94,6 @@ pub struct RequestContext {
task_kind: TaskKind,
download_behavior: DownloadBehavior,
access_stats_behavior: AccessStatsBehavior,
page_content_kind: PageContentKind,
}
/// The kind of access to the page cache.
#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
pub enum PageContentKind {
Unknown,
DeltaLayerBtreeNode,
DeltaLayerValue,
ImageLayerBtreeNode,
ImageLayerValue,
InMemoryLayer,
}
/// Desired behavior if the operation requires an on-demand download
@@ -149,7 +137,6 @@ impl RequestContextBuilder {
task_kind,
download_behavior: DownloadBehavior::Download,
access_stats_behavior: AccessStatsBehavior::Update,
page_content_kind: PageContentKind::Unknown,
},
}
}
@@ -162,7 +149,6 @@ impl RequestContextBuilder {
task_kind: original.task_kind,
download_behavior: original.download_behavior,
access_stats_behavior: original.access_stats_behavior,
page_content_kind: original.page_content_kind,
},
}
}
@@ -181,11 +167,6 @@ impl RequestContextBuilder {
self
}
pub(crate) fn page_content_kind(mut self, k: PageContentKind) -> Self {
self.inner.page_content_kind = k;
self
}
pub fn build(self) -> RequestContext {
self.inner
}
@@ -282,8 +263,4 @@ impl RequestContext {
pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
self.access_stats_behavior
}
pub(crate) fn page_content_kind(&self) -> PageContentKind {
self.page_content_kind
}
}

View File

@@ -1,9 +1,7 @@
use std::collections::HashMap;
use pageserver_api::control_api::{
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
};
use serde::{de::DeserializeOwned, Serialize};
use hyper::StatusCode;
use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{
@@ -14,34 +12,25 @@ use utils::{
use crate::config::PageServerConf;
// Backoffs when control plane requests do not succeed: compromise between reducing load
// on control plane, and retrying frequently when we are blocked on a control plane
// response to make progress.
const BACKOFF_INCREMENT: f64 = 0.1;
const BACKOFF_MAX: f64 = 10.0;
/// The Pageserver's client for using the control plane API: this is a small subset
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
pub struct ControlPlaneClient {
pub(crate) struct ControlPlaneClient {
http_client: reqwest::Client,
base_url: Url,
node_id: NodeId,
cancel: CancellationToken,
}
/// Represent operations which internally retry on all errors other than
/// cancellation token firing: the only way they can fail is ShuttingDown.
pub enum RetryForeverError {
ShuttingDown,
}
#[async_trait::async_trait]
pub trait ControlPlaneGenerationsApi {
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
async fn validate(
&self,
tenants: Vec<(TenantId, Generation)>,
) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
}
impl ControlPlaneClient {
/// A None return value indicates that the input `conf` object does not have control
/// plane API enabled.
pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
let mut url = match conf.control_plane_api.as_ref() {
Some(u) => u.clone(),
None => return None,
@@ -65,62 +54,27 @@ impl ControlPlaneClient {
})
}
async fn retry_http_forever<R, T>(
async fn try_re_attach(
&self,
url: &url::Url,
request: R,
) -> Result<T, RetryForeverError>
where
R: Serialize,
T: DeserializeOwned,
{
#[derive(thiserror::Error, Debug)]
enum RemoteAttemptError {
#[error("shutdown")]
Shutdown,
#[error("remote: {0}")]
Remote(reqwest::Error),
}
match backoff::retry(
|| async {
let response = self
.http_client
.post(url.clone())
.json(&request)
.send()
.await
.map_err(RemoteAttemptError::Remote)?;
response
.error_for_status_ref()
.map_err(RemoteAttemptError::Remote)?;
response
.json::<T>()
.await
.map_err(RemoteAttemptError::Remote)
},
|_| false,
3,
u32::MAX,
"calling control plane generation validation API",
backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
)
.await
{
Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
Err(RemoteAttemptError::Remote(_)) => {
panic!("We retry forever, this should never be reached");
url: Url,
request: &ReAttachRequest,
) -> anyhow::Result<ReAttachResponse> {
match self.http_client.post(url).json(request).send().await {
Err(e) => Err(anyhow::Error::from(e)),
Ok(r) => {
if r.status() == StatusCode::OK {
r.json::<ReAttachResponse>()
.await
.map_err(anyhow::Error::from)
} else {
Err(anyhow::anyhow!("Unexpected status {}", r.status()))
}
}
Ok(r) => Ok(r),
}
}
}
#[async_trait::async_trait]
impl ControlPlaneGenerationsApi for ControlPlaneClient {
/// Block until we get a successful response, or error out if we are shut down
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
/// Block until we get a successful response
pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> {
let re_attach_path = self
.base_url
.join("re-attach")
@@ -129,47 +83,37 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
node_id: self.node_id,
};
let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
tracing::info!(
"Received re-attach response with {} tenants",
response.tenants.len()
);
let mut attempt = 0;
loop {
let result = self.try_re_attach(re_attach_path.clone(), &request).await;
match result {
Ok(res) => {
tracing::info!(
"Received re-attach response with {} tenants",
res.tenants.len()
);
Ok(response
.tenants
.into_iter()
.map(|t| (t.id, Generation::new(t.generation)))
.collect::<HashMap<_, _>>())
}
/// Block until we get a successful response, or error out if we are shut down
async fn validate(
&self,
tenants: Vec<(TenantId, Generation)>,
) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
let re_attach_path = self
.base_url
.join("validate")
.expect("Failed to build validate path");
let request = ValidateRequest {
tenants: tenants
.into_iter()
.map(|(id, gen)| ValidateRequestTenant {
id,
gen: gen
.into()
.expect("Generation should always be valid for a Tenant doing deletions"),
})
.collect(),
};
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
Ok(response
.tenants
.into_iter()
.map(|rt| (rt.id, rt.valid))
.collect())
return Ok(res
.tenants
.into_iter()
.map(|t| (t.id, Generation::new(t.generation)))
.collect::<HashMap<_, _>>());
}
Err(e) => {
tracing::error!("Error re-attaching tenants, retrying: {e:#}");
backoff::exponential_backoff(
attempt,
BACKOFF_INCREMENT,
BACKOFF_MAX,
&self.cancel,
)
.await;
if self.cancel.is_cancelled() {
return Err(anyhow::anyhow!("Shutting down"));
}
attempt += 1;
}
}
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,156 +0,0 @@
//! The deleter is the final stage in the deletion queue. It accumulates remote
//! paths to delete, and periodically executes them in batches of up to 1000
//! using the DeleteObjects request.
//!
//! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller
//! number of full-sized DeleteObjects requests, rather than a larger number of
//! smaller requests.
use remote_storage::GenericRemoteStorage;
use remote_storage::RemotePath;
use remote_storage::MAX_KEYS_PER_DELETE;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use tracing::info;
use tracing::warn;
use crate::metrics;
use super::DeletionQueueError;
use super::FlushOp;
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
pub(super) enum DeleterMessage {
Delete(Vec<RemotePath>),
Flush(FlushOp),
}
/// Non-persistent deletion queue, for coalescing multiple object deletes into
/// larger DeleteObjects requests.
pub(super) struct Deleter {
// Accumulate up to 1000 keys for the next deletion operation
accumulator: Vec<RemotePath>,
rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
cancel: CancellationToken,
remote_storage: GenericRemoteStorage,
}
impl Deleter {
pub(super) fn new(
remote_storage: GenericRemoteStorage,
rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
cancel: CancellationToken,
) -> Self {
Self {
remote_storage,
rx,
cancel,
accumulator: Vec::new(),
}
}
/// Wrap the remote `delete_objects` with a failpoint
async fn remote_delete(&self) -> Result<(), anyhow::Error> {
fail::fail_point!("deletion-queue-before-execute", |_| {
info!("Skipping execution, failpoint set");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["failpoint"])
.inc();
Err(anyhow::anyhow!("failpoint hit"))
});
self.remote_storage.delete_objects(&self.accumulator).await
}
/// Block until everything in accumulator has been executed
async fn flush(&mut self) -> Result<(), DeletionQueueError> {
while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
match self.remote_delete().await {
Ok(()) => {
// Note: we assume that the remote storage layer returns Ok(()) if some
// or all of the deleted objects were already gone.
metrics::DELETION_QUEUE
.keys_executed
.inc_by(self.accumulator.len() as u64);
info!(
"Executed deletion batch {}..{}",
self.accumulator
.first()
.expect("accumulator should be non-empty"),
self.accumulator
.last()
.expect("accumulator should be non-empty"),
);
self.accumulator.clear();
}
Err(e) => {
warn!("DeleteObjects request failed: {e:#}, will retry");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["execute"])
.inc();
}
};
}
if self.cancel.is_cancelled() {
// Expose an error because we may not have actually flushed everything
Err(DeletionQueueError::ShuttingDown)
} else {
Ok(())
}
}
pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
self.accumulator.reserve(MAX_KEYS_PER_DELETE);
loop {
if self.cancel.is_cancelled() {
return Err(DeletionQueueError::ShuttingDown);
}
let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
Ok(Some(m)) => m,
Ok(None) => {
// All queue senders closed
info!("Shutting down");
return Err(DeletionQueueError::ShuttingDown);
}
Err(_) => {
// Timeout, we hit deadline to execute whatever we have in hand. These functions will
// return immediately if no work is pending
self.flush().await?;
continue;
}
};
match msg {
DeleterMessage::Delete(mut list) => {
while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
if self.accumulator.len() == MAX_KEYS_PER_DELETE {
self.flush().await?;
// If we have received this number of keys, proceed with attempting to execute
assert_eq!(self.accumulator.len(), 0);
}
let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
let take_count = std::cmp::min(available_slots, list.len());
for path in list.drain(list.len() - take_count..) {
self.accumulator.push(path);
}
}
}
DeleterMessage::Flush(flush_op) => {
// If flush() errors, we drop the flush_op and the caller will get
// an error recv()'ing their oneshot channel.
self.flush().await?;
flush_op.notify();
}
}
}
}
}

View File

@@ -1,487 +0,0 @@
//! The list writer is the first stage in the deletion queue. It accumulates
//! layers to delete, and periodically writes out these layers into a persistent
//! DeletionList.
//!
//! The purpose of writing DeletionLists is to decouple the decision to
//! delete an object from the validation required to execute it: even if
//! validation is not possible, e.g. due to a control plane outage, we can
//! still persist our intent to delete an object, in a way that would
//! survive a restart.
//!
//! DeletionLists are passed onwards to the Validator.
use super::DeletionHeader;
use super::DeletionList;
use super::FlushOp;
use super::ValidatorQueueMessage;
use std::collections::HashMap;
use std::fs::create_dir_all;
use std::time::Duration;
use regex::Regex;
use remote_storage::RemotePath;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use tracing::info;
use tracing::warn;
use utils::generation::Generation;
use utils::id::TenantId;
use utils::id::TimelineId;
use crate::config::PageServerConf;
use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::storage_layer::LayerFileName;
// The number of keys in a DeletionList before we will proactively persist it
// (without reaching a flush deadline). This aims to deliver objects of the order
// of magnitude 1MB when we are under heavy delete load.
const DELETION_LIST_TARGET_SIZE: usize = 16384;
// Ordinarily, we only flush to DeletionList periodically, to bound the window during
// which we might leak objects from not flushing a DeletionList after
// the objects are already unlinked from timeline metadata.
const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
// If someone is waiting for a flush to DeletionList, only delay a little to accumulate
// more objects before doing the flush.
const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
#[derive(Debug)]
pub(super) struct DeletionOp {
pub(super) tenant_id: TenantId,
pub(super) timeline_id: TimelineId,
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
// have a config object handy to project it to a remote key, and need the consuming worker
// to do it for you.
pub(super) layers: Vec<(LayerFileName, Generation)>,
pub(super) objects: Vec<RemotePath>,
/// The _current_ generation of the Tenant attachment in which we are enqueuing
/// this deletion.
pub(super) generation: Generation,
}
#[derive(Debug)]
pub(super) struct RecoverOp {
pub(super) attached_tenants: HashMap<TenantId, Generation>,
}
#[derive(Debug)]
pub(super) enum ListWriterQueueMessage {
Delete(DeletionOp),
// Wait until all prior deletions make it into a persistent DeletionList
Flush(FlushOp),
// Wait until all prior deletions have been executed (i.e. objects are actually deleted)
FlushExecute(FlushOp),
// Call once after re-attaching to control plane, to notify the deletion queue about
// latest attached generations & load any saved deletion lists from disk.
Recover(RecoverOp),
}
pub(super) struct ListWriter {
conf: &'static PageServerConf,
// Incoming frontend requests to delete some keys
rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
// Outbound requests to the backend to execute deletion lists we have composed.
tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
// The list we are currently building, contains a buffer of keys to delete
// and our next sequence number
pending: DeletionList,
// These FlushOps should notify the next time we flush
pending_flushes: Vec<FlushOp>,
// Worker loop is torn down when this fires.
cancel: CancellationToken,
// Safety guard to do recovery exactly once
recovered: bool,
}
impl ListWriter {
// Initially DeletionHeader.validated_sequence is zero. The place we start our
// sequence numbers must be higher than that.
const BASE_SEQUENCE: u64 = 1;
pub(super) fn new(
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
cancel: CancellationToken,
) -> Self {
Self {
pending: DeletionList::new(Self::BASE_SEQUENCE),
conf,
rx,
tx,
pending_flushes: Vec::new(),
cancel,
recovered: false,
}
}
/// Try to flush `list` to persistent storage
///
/// This does not return errors, because on failure to flush we do not lose
/// any state: flushing will be retried implicitly on the next deadline
async fn flush(&mut self) {
if self.pending.is_empty() {
for f in self.pending_flushes.drain(..) {
f.notify();
}
return;
}
match self.pending.save(self.conf).await {
Ok(_) => {
info!(sequence = self.pending.sequence, "Stored deletion list");
for f in self.pending_flushes.drain(..) {
f.notify();
}
// Take the list we've accumulated, replace it with a fresh list for the next sequence
let next_list = DeletionList::new(self.pending.sequence + 1);
let list = std::mem::replace(&mut self.pending, next_list);
if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(list)).await {
// This is allowed to fail: it will only happen if the backend worker is shut down,
// so we can just drop this on the floor.
info!("Deletion list dropped, this is normal during shutdown ({e:#})");
}
}
Err(e) => {
metrics::DELETION_QUEUE.unexpected_errors.inc();
warn!(
sequence = self.pending.sequence,
"Failed to write deletion list, will retry later ({e:#})"
);
}
}
}
/// Load the header, to learn the sequence number up to which deletions
/// have been validated. We will apply validated=true to DeletionLists
/// <= this sequence when loading them.
///
/// It is not an error for the header to not exist: we return None, and
/// the caller should act as if validated_sequence is 0
async fn load_validated_sequence(&self) -> Result<Option<u64>, anyhow::Error> {
let header_path = self.conf.deletion_header_path();
match tokio::fs::read(&header_path).await {
Ok(header_bytes) => {
match serde_json::from_slice::<DeletionHeader>(&header_bytes) {
Ok(h) => Ok(Some(h.validated_sequence)),
Err(e) => {
warn!(
"Failed to deserialize deletion header, ignoring {}: {e:#}",
header_path.display()
);
// This should never happen unless we make a mistake with our serialization.
// Ignoring a deletion header is not consequential for correctnes because all deletions
// are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
metrics::DELETION_QUEUE.unexpected_errors.inc();
Ok(None)
}
}
}
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
debug!(
"Deletion header {} not found, first start?",
header_path.display()
);
Ok(None)
} else {
Err(anyhow::anyhow!(e))
}
}
}
}
async fn recover(
&mut self,
attached_tenants: HashMap<TenantId, Generation>,
) -> Result<(), anyhow::Error> {
debug!(
"recovering with {} attached tenants",
attached_tenants.len()
);
// Load the header
let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0);
self.pending.sequence = validated_sequence + 1;
let deletion_directory = self.conf.deletion_prefix();
let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
Ok(d) => d,
Err(e) => {
warn!(
"Failed to open deletion list directory {}: {e:#}",
deletion_directory.display(),
);
// Give up: if we can't read the deletion list directory, we probably can't
// write lists into it later, so the queue won't work.
return Err(e.into());
}
};
let list_name_pattern =
Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
let header_path = self.conf.deletion_header_path();
let mut seqs: Vec<u64> = Vec::new();
while let Some(dentry) = dir.next_entry().await? {
let file_name = dentry.file_name();
let dentry_str = file_name.to_string_lossy();
if Some(file_name.as_os_str()) == header_path.file_name() {
// Don't try and parse the header's name like a list
continue;
}
if dentry_str.ends_with(TEMP_SUFFIX) {
info!("Cleaning up temporary file {dentry_str}");
let absolute_path = deletion_directory.join(dentry.file_name());
if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
// Non-fatal error: we will just leave the file behind but not
// try and load it.
warn!(
"Failed to clean up temporary file {}: {e:#}",
absolute_path.display()
);
}
continue;
}
let file_name = dentry.file_name().to_owned();
let basename = file_name.to_string_lossy();
let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
m.name("sequence")
.expect("Non optional group should be present")
.as_str()
} else {
warn!("Unexpected key in deletion queue: {basename}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
continue;
};
let seq: u64 = match u64::from_str_radix(seq_part, 16) {
Ok(s) => s,
Err(e) => {
warn!("Malformed key '{basename}': {e}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
continue;
}
};
seqs.push(seq);
}
seqs.sort();
// Start our next deletion list from after the last location validated by
// previous process lifetime, or after the last location found (it is updated
// below after enumerating the deletion lists)
self.pending.sequence = validated_sequence + 1;
if let Some(max_list_seq) = seqs.last() {
self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
}
for s in seqs {
let list_path = self.conf.deletion_list_path(s);
let list_bytes = tokio::fs::read(&list_path).await?;
let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
Ok(l) => l,
Err(e) => {
// Drop the list on the floor: any objects it referenced will be left behind
// for scrubbing to clean up. This should never happen unless we have a serialization bug.
warn!(sequence = s, "Failed to deserialize deletion list: {e}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
continue;
}
};
if deletion_list.sequence <= validated_sequence {
// If the deletion list falls below valid_seq, we may assume that it was
// already validated the last time this pageserver ran. Otherwise, we still
// load it, as it may still contain content valid in this generation.
deletion_list.validated = true;
} else {
// Special case optimization: if a tenant is still attached, and no other
// generation was issued to another node in the interval while we restarted,
// then we may treat deletion lists from the previous generation as if they
// belong to our currently attached generation, and proceed to validate & execute.
for (tenant_id, tenant_list) in &mut deletion_list.tenants {
if let Some(attached_gen) = attached_tenants.get(tenant_id) {
if attached_gen.previous() == tenant_list.generation {
tenant_list.generation = *attached_gen;
}
}
}
}
info!(
validated = deletion_list.validated,
sequence = deletion_list.sequence,
"Recovered deletion list"
);
// We will drop out of recovery if this fails: it indicates that we are shutting down
// or the backend has panicked
metrics::DELETION_QUEUE
.keys_submitted
.inc_by(deletion_list.len() as u64);
self.tx
.send(ValidatorQueueMessage::Delete(deletion_list))
.await?;
}
info!(next_sequence = self.pending.sequence, "Replay complete");
Ok(())
}
/// This is the front-end ingest, where we bundle up deletion requests into DeletionList
/// and write them out, for later validation by the backend and execution by the executor.
pub(super) async fn background(&mut self) {
info!("Started deletion frontend worker");
// Synchronous, but we only do it once per process lifetime so it's tolerable
if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
tracing::error!(
"Failed to create deletion list directory {}, deletions will not be executed ({e})",
self.conf.deletion_prefix().display()
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
return;
}
while !self.cancel.is_cancelled() {
let timeout = if self.pending_flushes.is_empty() {
FRONTEND_DEFAULT_TIMEOUT
} else {
FRONTEND_FLUSHING_TIMEOUT
};
let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
Ok(Some(msg)) => msg,
Ok(None) => {
// Queue sender destroyed, shutting down
break;
}
Err(_) => {
// Hit deadline, flush.
self.flush().await;
continue;
}
};
match msg {
ListWriterQueueMessage::Delete(op) => {
assert!(
self.recovered,
"Cannot process deletions before recovery. This is a bug."
);
debug!(
"Delete: ingesting {} layers, {} other objects",
op.layers.len(),
op.objects.len()
);
let mut layer_paths = Vec::new();
for (layer, generation) in op.layers {
layer_paths.push(remote_layer_path(
&op.tenant_id,
&op.timeline_id,
&layer,
generation,
));
}
layer_paths.extend(op.objects);
if !self.pending.push(
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,
) {
self.flush().await;
let retry_succeeded = self.pending.push(
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,
);
if !retry_succeeded {
// Unexpected: after we flush, we should have
// drained self.pending, so a conflict on
// generation numbers should be impossible.
tracing::error!(
"Failed to enqueue deletions, leaking objects. This is a bug."
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
}
}
}
ListWriterQueueMessage::Flush(op) => {
if self.pending.is_empty() {
// Execute immediately
debug!("Flush: No pending objects, flushing immediately");
op.notify()
} else {
// Execute next time we flush
debug!("Flush: adding to pending flush list for next deadline flush");
self.pending_flushes.push(op);
}
}
ListWriterQueueMessage::FlushExecute(op) => {
debug!("FlushExecute: passing through to backend");
// We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await {
info!("Can't flush, shutting down ({e})");
// Caller will get error when their oneshot sender was dropped.
}
}
ListWriterQueueMessage::Recover(op) => {
if self.recovered {
tracing::error!(
"Deletion queue recovery called more than once. This is a bug."
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
// Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
continue;
}
if let Err(e) = self.recover(op.attached_tenants).await {
// This should only happen in truly unrecoverable cases, like the recovery finding that the backend
// queue receiver has been dropped, or something is critically broken with
// the local filesystem holding deletion lists.
info!(
"Deletion queue recover aborted, deletion queue will not proceed ({e})"
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
return;
} else {
self.recovered = true;
}
}
}
if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
self.flush().await;
}
}
info!("Deletion queue shut down.");
}
}

View File

@@ -1,414 +0,0 @@
//! The validator is responsible for validating DeletionLists for execution,
//! based on whethe the generation in the DeletionList is still the latest
//! generation for a tenant.
//!
//! The purpose of validation is to ensure split-brain safety in the cluster
//! of pageservers: a deletion may only be executed if the tenant generation
//! that originated it is still current. See docs/rfcs/025-generation-numbers.md
//! The purpose of accumulating lists before validating them is to reduce load
//! on the control plane API by issuing fewer, larger requests.
//!
//! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn
//! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn
//! to decide when old
//!
//! Deletions are passed onward to the Deleter.
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use tracing::info;
use tracing::warn;
use crate::config::PageServerConf;
use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::control_plane_client::RetryForeverError;
use crate::metrics;
use super::deleter::DeleterMessage;
use super::DeletionHeader;
use super::DeletionList;
use super::DeletionQueueError;
use super::FlushOp;
use super::VisibleLsnUpdates;
// After this length of time, do any validation work that is pending,
// even if we haven't accumulated many keys to delete.
//
// This also causes updates to remote_consistent_lsn to be validated, even
// if there were no deletions enqueued.
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
// If we have received this number of keys, proceed with attempting to execute
const AUTOFLUSH_KEY_COUNT: usize = 16384;
#[derive(Debug)]
pub(super) enum ValidatorQueueMessage {
Delete(DeletionList),
Flush(FlushOp),
}
pub(super) struct Validator<C>
where
C: ControlPlaneGenerationsApi,
{
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
// Client for calling into control plane API for validation of deletes
control_plane_client: Option<C>,
// DeletionLists which are waiting generation validation. Not safe to
// execute until [`validate`] has processed them.
pending_lists: Vec<DeletionList>,
// DeletionLists which have passed validation and are ready to execute.
validated_lists: Vec<DeletionList>,
// Sum of all the lengths of lists in pending_lists
pending_key_count: usize,
// Lsn validation state: we read projected LSNs and write back visible LSNs
// after validation. This is the LSN equivalent of `pending_validation_lists`:
// it is drained in [`validate`]
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
// If we failed to rewrite a deletion list due to local filesystem I/O failure,
// we must remember that and refuse to advance our persistent validated sequence
// number past the failure.
list_write_failed: Option<u64>,
cancel: CancellationToken,
}
impl<C> Validator<C>
where
C: ControlPlaneGenerationsApi,
{
pub(super) fn new(
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
control_plane_client: Option<C>,
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
cancel: CancellationToken,
) -> Self {
Self {
conf,
rx,
tx,
control_plane_client,
lsn_table,
pending_lists: Vec::new(),
validated_lists: Vec::new(),
pending_key_count: 0,
list_write_failed: None,
cancel,
}
}
/// Process any outstanding validations of generations of pending LSN updates or pending
/// DeletionLists.
///
/// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists
/// go into the queue of ready-to-execute lists.
async fn validate(&mut self) -> Result<(), DeletionQueueError> {
let mut tenant_generations = HashMap::new();
for list in &self.pending_lists {
for (tenant_id, tenant_list) in &list.tenants {
// Note: DeletionLists are in logical time order, so generation always
// goes up. By doing a simple insert() we will always end up with
// the latest generation seen for a tenant.
tenant_generations.insert(*tenant_id, tenant_list.generation);
}
}
let pending_lsn_updates = {
let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned");
std::mem::take(&mut *lsn_table)
};
for (tenant_id, update) in &pending_lsn_updates.tenants {
let entry = tenant_generations
.entry(*tenant_id)
.or_insert(update.generation);
if update.generation > *entry {
*entry = update.generation;
}
}
if tenant_generations.is_empty() {
// No work to do
return Ok(());
}
let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
match control_plane_client
.validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
.await
{
Ok(tenants) => tenants,
Err(RetryForeverError::ShuttingDown) => {
// The only way a validation call returns an error is when the cancellation token fires
return Err(DeletionQueueError::ShuttingDown);
}
}
} else {
// Control plane API disabled. In legacy mode we consider everything valid.
tenant_generations.keys().map(|k| (*k, true)).collect()
};
let mut validated_sequence: Option<u64> = None;
// Apply the validation results to the pending LSN updates
for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants {
let validated_generation = tenant_generations
.get(&tenant_id)
.expect("Map was built from the same keys we're reading");
let valid = tenants_valid
.get(&tenant_id)
.copied()
// If the tenant was missing from the validation response, it has been deleted.
// The Timeline that requested the LSN update is probably already torn down,
// or will be torn down soon. In this case, drop the update by setting valid=false.
.unwrap_or(false);
if valid && *validated_generation == tenant_lsn_state.generation {
for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
pending_lsn.result_slot.store(pending_lsn.projected);
}
} else {
// If we failed validation, then do not apply any of the projected updates
warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
}
}
// Apply the validation results to the pending deletion lists
for list in &mut self.pending_lists {
// Filter the list based on whether the server responded valid: true.
// If a tenant is omitted in the response, it has been deleted, and we should
// proceed with deletion.
let mut mutated = false;
list.tenants.retain(|tenant_id, tenant| {
let validated_generation = tenant_generations
.get(tenant_id)
.expect("Map was built from the same keys we're reading");
// If the tenant was missing from the validation response, it has been deleted.
// This means that a deletion is valid, but also redundant since the tenant's
// objects should have already been deleted. Treat it as invalid to drop the
// redundant deletion.
let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false);
// A list is valid if it comes from the current _or previous_ generation.
// - The previous generation case is permitted due to how we store deletion lists locally:
// if we see the immediately previous generation in a locally stored deletion list,
// it proves that this node's disk was used for both current & previous generations,
// and therefore no other node was involved in between: the two generations may be
// logically treated as the same.
// - In that previous generation case, we rewrote it to the current generation
// in recover(), so the comparison here is simply an equality.
let this_list_valid = valid
&& (tenant.generation == *validated_generation);
if !this_list_valid {
warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
mutated = true;
}
this_list_valid
});
list.validated = true;
if mutated {
// Save the deletion list if we had to make changes due to stale generations. The
// saved list is valid for execution.
if let Err(e) = list.save(self.conf).await {
// Highly unexpected. Could happen if e.g. disk full.
// If we didn't save the trimmed list, it is _not_ valid to execute.
warn!("Failed to save modified deletion list {list}: {e:#}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
// Rather than have a complex retry process, just drop it and leak the objects,
// scrubber will clean up eventually.
list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution.
// We must remember this failure, to prevent later writing out a header that
// would imply the unwritable list was valid on disk.
if self.list_write_failed.is_none() {
self.list_write_failed = Some(list.sequence);
}
}
}
validated_sequence = Some(list.sequence);
}
if let Some(validated_sequence) = validated_sequence {
if let Some(list_write_failed) = self.list_write_failed {
// Rare error case: we failed to write out a deletion list to excise invalid
// entries, so we cannot advance the header's valid sequence number past that point.
//
// In this state we will continue to validate, execute and delete deletion lists,
// we just cannot update the header. It should be noticed and fixed by a human due to
// the nonzero value of our unexpected_errors metric.
warn!(
sequence_number = list_write_failed,
"Cannot write header because writing a deletion list failed earlier",
);
} else {
// Write the queue header to record how far validation progressed. This avoids having
// to rewrite each DeletionList to set validated=true in it.
let header = DeletionHeader::new(validated_sequence);
// Drop result because the validated_sequence is an optimization. If we fail to save it,
// then restart, we will drop some deletion lists, creating work for scrubber.
// The save() function logs a warning on error.
if let Err(e) = header.save(self.conf).await {
warn!("Failed to write deletion queue header: {e:#}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
}
}
}
// Transfer the validated lists to the validated queue, for eventual execution
self.validated_lists.append(&mut self.pending_lists);
Ok(())
}
async fn cleanup_lists(&mut self, list_paths: Vec<PathBuf>) {
for list_path in list_paths {
debug!("Removing deletion list {}", list_path.display());
if let Err(e) = tokio::fs::remove_file(&list_path).await {
// Unexpected: we should have permissions and nothing else should
// be touching these files. We will leave the file behind. Subsequent
// pageservers will try and load it again: hopefully whatever storage
// issue (probably permissions) has been fixed by then.
tracing::error!("Failed to delete {}: {e:#}", list_path.display());
metrics::DELETION_QUEUE.unexpected_errors.inc();
break;
}
}
}
async fn flush(&mut self) -> Result<(), DeletionQueueError> {
tracing::debug!("Flushing with {} pending lists", self.pending_lists.len());
// Issue any required generation validation calls to the control plane
self.validate().await?;
// After successful validation, nothing is pending: any lists that
// made it through validation will be in validated_lists.
assert!(self.pending_lists.is_empty());
self.pending_key_count = 0;
tracing::debug!(
"Validation complete, have {} validated lists",
self.validated_lists.len()
);
// Return quickly if we have no validated lists to execute. This avoids flushing the
// executor when an idle backend hits its autoflush interval
if self.validated_lists.is_empty() {
return Ok(());
}
// Drain `validated_lists` into the executor
let mut executing_lists = Vec::new();
for list in self.validated_lists.drain(..) {
let list_path = self.conf.deletion_list_path(list.sequence);
let objects = list.into_remote_paths();
self.tx
.send(DeleterMessage::Delete(objects))
.await
.map_err(|_| DeletionQueueError::ShuttingDown)?;
executing_lists.push(list_path);
}
self.flush_executor().await?;
// Erase the deletion lists whose keys have all be deleted from remote storage
self.cleanup_lists(executing_lists).await;
Ok(())
}
async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> {
// Flush the executor, so that all the keys referenced by these deletion lists
// are actually removed from remote storage. This is a precondition to deleting
// the deletion lists themselves.
let (flush_op, rx) = FlushOp::new();
self.tx
.send(DeleterMessage::Flush(flush_op))
.await
.map_err(|_| DeletionQueueError::ShuttingDown)?;
rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
}
pub(super) async fn background(&mut self) {
tracing::info!("Started deletion backend worker");
while !self.cancel.is_cancelled() {
let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
Ok(Some(m)) => m,
Ok(None) => {
// All queue senders closed
info!("Shutting down");
break;
}
Err(_) => {
// Timeout, we hit deadline to execute whatever we have in hand. These functions will
// return immediately if no work is pending.
match self.flush().await {
Ok(()) => {}
Err(DeletionQueueError::ShuttingDown) => {
// If we are shutting down, then auto-flush can safely be skipped
}
}
continue;
}
};
match msg {
ValidatorQueueMessage::Delete(list) => {
if list.validated {
// A pre-validated list may only be seen during recovery, if we are recovering
// a DeletionList whose on-disk state has validated=true
self.validated_lists.push(list)
} else {
self.pending_key_count += list.len();
self.pending_lists.push(list);
}
if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
match self.flush().await {
Ok(()) => {}
Err(DeletionQueueError::ShuttingDown) => {
// If we are shutting down, then auto-flush can safely be skipped
}
}
}
}
ValidatorQueueMessage::Flush(op) => {
match self.flush().await {
Ok(()) => {
op.notify();
}
Err(DeletionQueueError::ShuttingDown) => {
// If we fail due to shutting down, we will just drop `op` to propagate that status.
}
}
}
}
}
}
}

View File

@@ -1093,9 +1093,6 @@ components:
remote_consistent_lsn:
type: string
format: hex
remote_consistent_lsn_visible:
type: string
format: hex
ancestor_timeline_id:
type: string
format: hex

View File

@@ -5,7 +5,6 @@ use std::collections::HashMap;
use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
use futures::TryFutureExt;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
@@ -25,7 +24,6 @@ use super::models::{
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
@@ -36,7 +34,7 @@ use crate::tenant::mgr::{
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::timeline::Timeline;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant};
use utils::{
@@ -63,7 +61,6 @@ pub struct State {
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
}
impl State {
@@ -73,7 +70,6 @@ impl State {
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
) -> anyhow::Result<Self> {
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
.iter()
@@ -86,17 +82,8 @@ impl State {
remote_storage,
broker_client,
disk_usage_eviction_state,
deletion_queue_client,
})
}
fn tenant_resources(&self) -> TenantSharedResources {
TenantSharedResources {
broker_client: self.broker_client.clone(),
remote_storage: self.remote_storage.clone(),
deletion_queue_client: self.deletion_queue_client.clone(),
}
}
}
#[inline(always)]
@@ -296,12 +283,7 @@ async fn build_timeline_info_common(
};
let current_physical_size = Some(timeline.layer_size_sum().await);
let state = timeline.current_state();
let remote_consistent_lsn_projected = timeline
.get_remote_consistent_lsn_projected()
.unwrap_or(Lsn(0));
let remote_consistent_lsn_visible = timeline
.get_remote_consistent_lsn_visible()
.unwrap_or(Lsn(0));
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
let walreceiver_status = timeline.walreceiver_status();
@@ -311,8 +293,7 @@ async fn build_timeline_info_common(
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
remote_consistent_lsn: remote_consistent_lsn_projected,
remote_consistent_lsn_visible,
remote_consistent_lsn,
last_record_lsn,
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -511,23 +492,24 @@ async fn tenant_attach_handler(
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
if state.remote_storage.is_none() {
if let Some(remote_storage) = &state.remote_storage {
mgr::attach_tenant(
state.conf,
tenant_id,
generation,
tenant_conf,
state.broker_client.clone(),
remote_storage.clone(),
&ctx,
)
.instrument(info_span!("tenant_attach", %tenant_id))
.await?;
} else {
return Err(ApiError::BadRequest(anyhow!(
"attach_tenant is not possible because pageserver was configured without remote storage"
)));
}
mgr::attach_tenant(
state.conf,
tenant_id,
generation,
tenant_conf,
state.tenant_resources(),
&ctx,
)
.instrument(info_span!("tenant_attach", %tenant_id))
.await?;
json_response(StatusCode::ACCEPTED, ())
}
@@ -588,7 +570,6 @@ async fn tenant_load_handler(
generation,
state.broker_client.clone(),
state.remote_storage.clone(),
state.deletion_queue_client.clone(),
&ctx,
)
.instrument(info_span!("load", %tenant_id))
@@ -930,7 +911,8 @@ async fn tenant_create_handler(
tenant_conf,
target_tenant_id,
generation,
state.tenant_resources(),
state.broker_client.clone(),
state.remote_storage.clone(),
&ctx,
)
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
@@ -1147,39 +1129,6 @@ async fn timeline_download_remote_layers_handler_get(
json_response(StatusCode::OK, info)
}
async fn deletion_queue_flush(
r: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&r);
if state.remote_storage.is_none() {
// Nothing to do if remote storage is disabled.
return json_response(StatusCode::OK, ());
}
let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
let flush = async {
if execute {
state.deletion_queue_client.flush_execute().await
} else {
state.deletion_queue_client.flush().await
}
}
// DeletionQueueError's only case is shutting down.
.map_err(|_| ApiError::ShuttingDown);
tokio::select! {
res = flush => {
res.map(|()| json_response(StatusCode::OK, ()))?
}
_ = cancel.cancelled() => {
Err(ApiError::ShuttingDown)
}
}
}
async fn active_timeline_of_active_tenant(
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -1514,9 +1463,6 @@ pub fn make_router(
.put("/v1/disk_usage_eviction/run", |r| {
api_handler(r, disk_usage_eviction_run)
})
.put("/v1/deletion_queue/flush", |r| {
api_handler(r, deletion_queue_flush)
})
.put("/v1/tenant/:tenant_id/break", |r| {
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
})

View File

@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
{
pg_control = Some(control_file);
}
modification.flush(ctx).await?;
modification.flush().await?;
}
}
// We're done importing all the data files.
modification.commit(ctx).await?;
modification.commit().await?;
// We expect the Postgres server to be shut down cleanly.
let pg_control = pg_control.context("pg_control file not found")?;
@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
// We found the pg_control file.
pg_control = Some(res);
}
modification.flush(ctx).await?;
modification.flush().await?;
}
tokio_tar::EntryType::Directory => {
debug!("directory {:?}", file_path);
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
// sanity check: ensure that pg_control is loaded
let _pg_control = pg_control.context("pg_control file not found")?;
modification.commit(ctx).await?;
modification.commit().await?;
Ok(())
}

View File

@@ -3,8 +3,7 @@ pub mod basebackup;
pub mod config;
pub mod consumption_metrics;
pub mod context;
pub mod control_plane_client;
pub mod deletion_queue;
mod control_plane_client;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
@@ -28,7 +27,6 @@ pub mod failpoint_support;
use std::path::Path;
use crate::task_mgr::TaskKind;
use deletion_queue::DeletionQueue;
use tracing::info;
/// Current storage format version
@@ -50,8 +48,8 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
pub use crate::metrics::preinitialize_metrics;
#[tracing::instrument(skip_all, fields(%exit_code))]
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
#[tracing::instrument]
pub async fn shutdown_pageserver(exit_code: i32) {
use std::time::Duration;
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
@@ -79,11 +77,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
)
.await;
// Best effort to persist any outstanding deletions, to avoid leaking objects
if let Some(mut deletion_queue) = deletion_queue {
deletion_queue.shutdown(Duration::from_secs(5)).await;
}
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
// FIXME: We should probably stop accepting commands like attach/detach earlier.

View File

@@ -1,4 +1,3 @@
use enum_map::EnumMap;
use metrics::metric_vec_duration::DurationResultObserver;
use metrics::{
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
@@ -128,24 +127,22 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub struct PageCacheMetricsForTaskKind {
pub struct PageCacheMetrics {
pub read_accesses_materialized_page: IntCounter,
pub read_accesses_ephemeral: IntCounter,
pub read_accesses_immutable: IntCounter,
pub read_hits_ephemeral: IntCounter,
pub read_hits_immutable: IntCounter,
pub read_hits_materialized_page_exact: IntCounter,
pub read_hits_materialized_page_older_lsn: IntCounter,
}
pub struct PageCacheMetrics {
map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
}
static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_read_hits_total",
"Number of read accesses to the page cache that hit",
&["task_kind", "key_kind", "content_kind", "hit_kind"]
&["key_kind", "hit_kind"]
)
.expect("failed to define a metric")
});
@@ -154,73 +151,55 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_read_accesses_total",
"Number of read accesses to the page cache",
&["task_kind", "key_kind", "content_kind"]
&["key_kind"]
)
.expect("failed to define a metric")
});
pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
map: EnumMap::from_array(std::array::from_fn(|task_kind| {
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
let task_kind: &'static str = task_kind.into();
EnumMap::from_array(std::array::from_fn(|content_kind| {
let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
let content_kind: &'static str = content_kind.into();
PageCacheMetricsForTaskKind {
read_accesses_materialized_page: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&[
task_kind,
"materialized_page",
content_kind,
])
.unwrap()
},
read_accesses_materialized_page: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&["materialized_page"])
.unwrap()
},
read_accesses_immutable: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&[task_kind, "immutable", content_kind])
.unwrap()
},
read_accesses_ephemeral: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&["ephemeral"])
.unwrap()
},
read_hits_immutable: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
.unwrap()
},
read_accesses_immutable: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&["immutable"])
.unwrap()
},
read_hits_materialized_page_exact: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&[
task_kind,
"materialized_page",
content_kind,
"exact",
])
.unwrap()
},
read_hits_ephemeral: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["ephemeral", "-"])
.unwrap()
},
read_hits_materialized_page_older_lsn: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&[
task_kind,
"materialized_page",
content_kind,
"older_lsn",
])
.unwrap()
},
}
}))
})),
read_hits_immutable: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["immutable", "-"])
.unwrap()
},
read_hits_materialized_page_exact: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["materialized_page", "exact"])
.unwrap()
},
read_hits_materialized_page_older_lsn: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["materialized_page", "older_lsn"])
.unwrap()
},
});
impl PageCacheMetrics {
pub(crate) fn for_ctx(&self, ctx: &RequestContext) -> &PageCacheMetricsForTaskKind {
&self.map[ctx.task_kind()][ctx.page_content_kind()]
}
}
pub struct PageCacheSizeMetrics {
pub max_bytes: UIntGauge,
@@ -887,54 +866,6 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
.expect("failed to define a metric")
});
pub(crate) struct DeletionQueueMetrics {
pub(crate) keys_submitted: IntCounter,
pub(crate) keys_dropped: IntCounter,
pub(crate) keys_executed: IntCounter,
pub(crate) dropped_lsn_updates: IntCounter,
pub(crate) unexpected_errors: IntCounter,
pub(crate) remote_errors: IntCounterVec,
}
pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
DeletionQueueMetrics{
keys_submitted: register_int_counter!(
"pageserver_deletion_queue_submitted_total",
"Number of objects submitted for deletion"
)
.expect("failed to define a metric"),
keys_dropped: register_int_counter!(
"pageserver_deletion_queue_dropped_total",
"Number of object deletions dropped due to stale generation."
)
.expect("failed to define a metric"),
keys_executed: register_int_counter!(
"pageserver_deletion_queue_executed_total",
"Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
)
.expect("failed to define a metric"),
dropped_lsn_updates: register_int_counter!(
"pageserver_deletion_queue_dropped_lsn_updates_total",
"Updates to remote_consistent_lsn dropped due to stale generation number."
)
.expect("failed to define a metric"),
unexpected_errors: register_int_counter!(
"pageserver_deletion_queue_unexpected_errors_total",
"Number of unexpected condiions that may stall the queue: any value above zero is unexpected."
)
.expect("failed to define a metric"),
remote_errors: register_int_counter_vec!(
"pageserver_deletion_queue_remote_errors_total",
"Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects",
&["op_kind"],
)
.expect("failed to define a metric")
}
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,
@@ -1349,9 +1280,6 @@ use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
use std::time::{Duration, Instant};
use crate::context::{PageContentKind, RequestContext};
use crate::task_mgr::TaskKind;
pub struct RemoteTimelineClientMetrics {
tenant_id: String,
timeline_id: String,
@@ -1723,9 +1651,6 @@ pub fn preinitialize_metrics() {
Lazy::force(c);
});
// Deletion queue stats
Lazy::force(&DELETION_QUEUE);
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()

View File

@@ -85,7 +85,7 @@ use utils::{
lsn::Lsn,
};
use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};
use crate::{metrics::PageCacheSizeMetrics, repository::Key};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -346,10 +346,8 @@ impl PageCache {
timeline_id: TimelineId,
key: &Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Option<(Lsn, PageReadGuard)> {
crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_accesses_materialized_page
.inc();
@@ -370,12 +368,10 @@ impl PageCache {
{
if available_lsn == lsn {
crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_hits_materialized_page_exact
.inc();
} else {
crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_hits_materialized_page_older_lsn
.inc();
}
@@ -430,11 +426,10 @@ impl PageCache {
&self,
file_id: FileId,
blkno: u32,
ctx: &RequestContext,
) -> anyhow::Result<ReadBufResult> {
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
self.lock_for_read(&mut cache_key, ctx).await
self.lock_for_read(&mut cache_key).await
}
//
@@ -502,20 +497,14 @@ impl PageCache {
/// }
/// ```
///
async fn lock_for_read(
&self,
cache_key: &mut CacheKey,
ctx: &RequestContext,
) -> anyhow::Result<ReadBufResult> {
async fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
let (read_access, hit) = match cache_key {
CacheKey::MaterializedPage { .. } => {
unreachable!("Materialized pages use lookup_materialized_page")
}
CacheKey::ImmutableFilePage { .. } => (
&crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_accesses_immutable,
&crate::metrics::PAGE_CACHE.for_ctx(ctx).read_hits_immutable,
&crate::metrics::PAGE_CACHE.read_accesses_immutable,
&crate::metrics::PAGE_CACHE.read_hits_immutable,
),
};
read_access.inc();

View File

@@ -1138,7 +1138,7 @@ impl<'a> DatadirModification<'a> {
/// retains all the metadata, but data pages are flushed. That's again OK
/// for bulk import, where you are just loading data pages and won't try to
/// modify the same pages twice.
pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
pub async fn flush(&mut self) -> anyhow::Result<()> {
// Unless we have accumulated a decent amount of changes, it's not worth it
// to scan through the pending_updates list.
let pending_nblocks = self.pending_nblocks;
@@ -1154,7 +1154,7 @@ impl<'a> DatadirModification<'a> {
if is_rel_block_key(key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, self.lsn, &value, ctx).await?;
writer.put(key, self.lsn, &value).await?;
} else {
retained_pending_updates.insert(key, value);
}
@@ -1174,14 +1174,14 @@ impl<'a> DatadirModification<'a> {
/// underlying timeline.
/// All the modifications in this atomic update are stamped by the specified LSN.
///
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
pub async fn commit(&mut self) -> anyhow::Result<()> {
let writer = self.tline.writer().await;
let lsn = self.lsn;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
for (key, value) in self.pending_updates.drain() {
writer.put(key, lsn, &value, ctx).await?;
writer.put(key, lsn, &value).await?;
}
for key_range in self.pending_deletions.drain(..) {
writer.delete(key_range, lsn).await?;

View File

@@ -37,7 +37,7 @@ impl Key {
| self.field6 as i128
}
pub const fn from_i128(x: i128) -> Self {
pub fn from_i128(x: i128) -> Self {
Key {
field1: ((x >> 120) & 0xf) as u8,
field2: ((x >> 104) & 0xFFFF) as u32,

View File

@@ -187,7 +187,6 @@ task_local! {
Debug,
// NB: enumset::EnumSetType derives PartialEq, Eq, Clone, Copy
enumset::EnumSetType,
enum_map::Enum,
serde::Serialize,
serde::Deserialize,
strum_macros::IntoStaticStr,
@@ -456,7 +455,7 @@ async fn task_finish(
}
if shutdown_process {
shutdown_pageserver(None, 1).await;
shutdown_pageserver(1).await;
}
}

View File

@@ -57,7 +57,6 @@ use self::timeline::EvictionTaskTenantState;
use self::timeline::TimelineResources;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT_ACTIVATION;
@@ -118,7 +117,7 @@ mod span;
pub mod metadata;
mod par_fsync;
pub mod remote_timeline_client;
mod remote_timeline_client;
pub mod storage_layer;
pub mod config;
@@ -158,7 +157,6 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
pub struct TenantSharedResources {
pub broker_client: storage_broker::BrokerClientChannel,
pub remote_storage: Option<GenericRemoteStorage>,
pub deletion_queue_client: DeletionQueueClient,
}
///
@@ -199,9 +197,6 @@ pub struct Tenant {
// provides access to timeline data sitting in the remote storage
pub(crate) remote_storage: Option<GenericRemoteStorage>,
// Access to global deletion queue for when this tenant wants to schedule a deletion
deletion_queue_client: DeletionQueueClient,
/// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
cached_synthetic_tenant_size: Arc<AtomicU64>,
@@ -528,20 +523,15 @@ impl Tenant {
conf: &'static PageServerConf,
tenant_id: TenantId,
generation: Generation,
resources: TenantSharedResources,
broker_client: storage_broker::BrokerClientChannel,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
remote_storage: GenericRemoteStorage,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
// TODO dedup with spawn_load
let tenant_conf =
Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?;
let TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client,
} = resources;
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
let tenant = Arc::new(Tenant::new(
TenantState::Attaching,
@@ -550,8 +540,7 @@ impl Tenant {
wal_redo_manager,
tenant_id,
generation,
remote_storage.clone(),
deletion_queue_client,
Some(remote_storage.clone()),
));
// Do all the hard work in the background
@@ -582,7 +571,7 @@ impl Tenant {
let pending_deletion = {
match DeleteTenantFlow::should_resume_deletion(
conf,
remote_storage.as_ref(),
Some(&remote_storage),
&tenant_clone,
)
.await
@@ -671,7 +660,6 @@ impl Tenant {
for timeline_id in remote_timeline_ids {
let client = RemoteTimelineClient::new(
remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_id,
timeline_id,
@@ -738,7 +726,6 @@ impl Tenant {
remote_metadata,
TimelineResources {
remote_client: Some(remote_client),
deletion_queue_client: self.deletion_queue_client.clone(),
},
ctx,
)
@@ -763,7 +750,6 @@ impl Tenant {
timeline_id,
&index_part.metadata,
Some(remote_timeline_client),
self.deletion_queue_client.clone(),
None,
)
.await
@@ -865,7 +851,6 @@ impl Tenant {
tenant_id,
Generation::broken(),
None,
DeletionQueueClient::broken(),
))
}
@@ -910,7 +895,6 @@ impl Tenant {
tenant_id,
generation,
remote_storage.clone(),
resources.deletion_queue_client.clone(),
);
let tenant = Arc::new(tenant);
@@ -1318,7 +1302,6 @@ impl Tenant {
timeline_id,
&local_metadata,
Some(remote_client),
self.deletion_queue_client.clone(),
init_order,
)
.await
@@ -1368,7 +1351,6 @@ impl Tenant {
timeline_id,
&local_metadata,
None,
self.deletion_queue_client.clone(),
init_order,
)
.await
@@ -1522,7 +1504,7 @@ impl Tenant {
.init_empty_test_timeline()
.context("init_empty_test_timeline")?;
modification
.commit(ctx)
.commit()
.await
.context("commit init_empty_test_timeline modification")?;
@@ -2260,9 +2242,6 @@ impl Tenant {
Ok(timeline)
}
// Allow too_many_arguments because a constructor's argument list naturally grows with the
// number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
#[allow(clippy::too_many_arguments)]
fn new(
state: TenantState,
conf: &'static PageServerConf,
@@ -2271,7 +2250,6 @@ impl Tenant {
tenant_id: TenantId,
generation: Generation,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: DeletionQueueClient,
) -> Tenant {
let (state, mut rx) = watch::channel(state);
@@ -2339,7 +2317,6 @@ impl Tenant {
gc_cs: tokio::sync::Mutex::new(()),
walredo_mgr,
remote_storage,
deletion_queue_client,
state,
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
@@ -2879,7 +2856,6 @@ impl Tenant {
let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
let remote_client = RemoteTimelineClient::new(
remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_id,
timeline_id,
@@ -2890,10 +2866,7 @@ impl Tenant {
None
};
TimelineResources {
remote_client,
deletion_queue_client: self.deletion_queue_client.clone(),
}
TimelineResources { remote_client }
}
/// Creates intermediate timeline structure and its files.
@@ -3349,7 +3322,6 @@ pub mod harness {
use utils::logging;
use utils::lsn::Lsn;
use crate::deletion_queue::mock::MockDeletionQueue;
use crate::{
config::PageServerConf,
repository::Key,
@@ -3411,7 +3383,6 @@ pub mod harness {
pub generation: Generation,
pub remote_storage: GenericRemoteStorage,
pub remote_fs_dir: PathBuf,
pub deletion_queue: MockDeletionQueue,
}
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
@@ -3460,7 +3431,6 @@ pub mod harness {
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
Ok(Self {
conf,
@@ -3469,7 +3439,6 @@ pub mod harness {
generation: Generation::new(0xdeadbeef),
remote_storage,
remote_fs_dir,
deletion_queue,
})
}
@@ -3494,7 +3463,6 @@ pub mod harness {
self.tenant_id,
self.generation,
Some(self.remote_storage.clone()),
self.deletion_queue.new_client(),
));
tenant
.load(None, ctx)
@@ -3570,24 +3538,14 @@ mod tests {
let writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x10),
&Value::Image(TEST_IMG("foo at 0x10")),
&ctx,
)
.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))
.await?;
writer.finish_write(Lsn(0x10));
drop(writer);
let writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x20),
&Value::Image(TEST_IMG("foo at 0x20")),
&ctx,
)
.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))
.await?;
writer.finish_write(Lsn(0x20));
drop(writer);
@@ -3661,19 +3619,19 @@ mod tests {
// Insert a value on the timeline
writer
.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"), &ctx)
.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))
.await?;
writer
.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"), &ctx)
.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))
.await?;
writer.finish_write(Lsn(0x20));
writer
.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"), &ctx)
.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))
.await?;
writer.finish_write(Lsn(0x30));
writer
.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"), &ctx)
.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))
.await?;
writer.finish_write(Lsn(0x40));
@@ -3688,7 +3646,7 @@ mod tests {
.expect("Should have a local timeline");
let new_writer = newtline.writer().await;
new_writer
.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))
.await?;
new_writer.finish_write(Lsn(0x40));
@@ -3711,11 +3669,7 @@ mod tests {
Ok(())
}
async fn make_some_layers(
tline: &Timeline,
start_lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<()> {
async fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
let mut lsn = start_lsn;
#[allow(non_snake_case)]
{
@@ -3726,7 +3680,6 @@ mod tests {
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
ctx,
)
.await?;
writer.finish_write(lsn);
@@ -3736,7 +3689,6 @@ mod tests {
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
ctx,
)
.await?;
writer.finish_write(lsn);
@@ -3750,7 +3702,6 @@ mod tests {
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
ctx,
)
.await?;
writer.finish_write(lsn);
@@ -3760,7 +3711,6 @@ mod tests {
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
ctx,
)
.await?;
writer.finish_write(lsn);
@@ -3777,7 +3727,7 @@ mod tests {
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
// FIXME: this doesn't actually remove any layer currently, given how the flushing
@@ -3851,7 +3801,7 @@ mod tests {
.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
@@ -3873,7 +3823,7 @@ mod tests {
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3882,7 +3832,7 @@ mod tests {
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
tline.set_broken("test".to_owned());
@@ -3923,7 +3873,7 @@ mod tests {
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3948,7 +3898,7 @@ mod tests {
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3957,7 +3907,7 @@ mod tests {
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
// run gc on parent
tenant
@@ -3982,7 +3932,7 @@ mod tests {
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), true)
@@ -4011,7 +3961,7 @@ mod tests {
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
let child_tline = tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -4022,7 +3972,7 @@ mod tests {
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
@@ -4054,7 +4004,7 @@ mod tests {
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
let layer_map = tline.layers.read().await;
let level0_deltas = layer_map.layer_map().get_level0_deltas()?;
@@ -4137,12 +4087,7 @@ mod tests {
let writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x10),
&Value::Image(TEST_IMG("foo at 0x10")),
&ctx,
)
.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))
.await?;
writer.finish_write(Lsn(0x10));
drop(writer);
@@ -4152,12 +4097,7 @@ mod tests {
let writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x20),
&Value::Image(TEST_IMG("foo at 0x20")),
&ctx,
)
.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))
.await?;
writer.finish_write(Lsn(0x20));
drop(writer);
@@ -4167,12 +4107,7 @@ mod tests {
let writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x30),
&Value::Image(TEST_IMG("foo at 0x30")),
&ctx,
)
.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))
.await?;
writer.finish_write(Lsn(0x30));
drop(writer);
@@ -4182,12 +4117,7 @@ mod tests {
let writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x40),
&Value::Image(TEST_IMG("foo at 0x40")),
&ctx,
)
.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))
.await?;
writer.finish_write(Lsn(0x40));
drop(writer);
@@ -4225,8 +4155,7 @@ mod tests {
//
#[tokio::test]
async fn test_bulk_insert() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_bulk_insert")?;
let (tenant, ctx) = harness.load().await;
let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4246,7 +4175,6 @@ mod tests {
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
@@ -4273,8 +4201,7 @@ mod tests {
#[tokio::test]
async fn test_random_updates() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_random_updates")?;
let (tenant, ctx) = harness.load().await;
let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4300,7 +4227,6 @@ mod tests {
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
@@ -4321,7 +4247,6 @@ mod tests {
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
@@ -4381,7 +4306,6 @@ mod tests {
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
@@ -4410,7 +4334,6 @@ mod tests {
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
println!("updating {} at {}", blknum, lsn);
@@ -4479,7 +4402,6 @@ mod tests {
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
&ctx,
)
.await?;
println!("updating [{}][{}] at {}", idx, blknum, lsn);
@@ -4552,7 +4474,7 @@ mod tests {
.init_empty_test_timeline()
.context("init_empty_test_timeline")?;
modification
.commit(&ctx)
.commit()
.await
.context("commit init_empty_test_timeline modification")?;

View File

@@ -11,7 +11,6 @@
//! len < 128: 0XXXXXXX
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
//!
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::tenant::block_io::BlockCursor;
use crate::virtual_file::VirtualFile;
@@ -20,13 +19,9 @@ use std::io::{Error, ErrorKind};
impl<'a> BlockCursor<'a> {
/// Read a blob into a new buffer.
pub async fn read_blob(
&self,
offset: u64,
ctx: &RequestContext,
) -> Result<Vec<u8>, std::io::Error> {
pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
let mut buf = Vec::new();
self.read_blob_into_buf(offset, &mut buf, ctx).await?;
self.read_blob_into_buf(offset, &mut buf).await?;
Ok(buf)
}
/// Read blob into the given buffer. Any previous contents in the buffer
@@ -35,12 +30,11 @@ impl<'a> BlockCursor<'a> {
&self,
offset: u64,
dstbuf: &mut Vec<u8>,
ctx: &RequestContext,
) -> Result<(), std::io::Error> {
let mut blknum = (offset / PAGE_SZ as u64) as u32;
let mut off = (offset % PAGE_SZ as u64) as usize;
let mut buf = self.read_blk(blknum, ctx).await?;
let mut buf = self.read_blk(blknum).await?;
// peek at the first byte, to determine if it's a 1- or 4-byte length
let first_len_byte = buf[off];
@@ -56,7 +50,7 @@ impl<'a> BlockCursor<'a> {
// it is split across two pages
len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
blknum += 1;
buf = self.read_blk(blknum, ctx).await?;
buf = self.read_blk(blknum).await?;
len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
off = 4 - thislen;
} else {
@@ -77,7 +71,7 @@ impl<'a> BlockCursor<'a> {
if page_remain == 0 {
// continue on next page
blknum += 1;
buf = self.read_blk(blknum, ctx).await?;
buf = self.read_blk(blknum).await?;
off = 0;
page_remain = PAGE_SZ;
}
@@ -234,13 +228,12 @@ impl BlobWriter<false> {
#[cfg(test)]
mod tests {
use super::*;
use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
use crate::tenant::block_io::BlockReaderRef;
use rand::{Rng, SeedableRng};
async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
let temp_dir = tempfile::tempdir()?;
let path = temp_dir.path().join("file");
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
// Write part (in block to drop the file)
let mut offsets = Vec::new();
@@ -262,7 +255,7 @@ mod tests {
let rdr = BlockReaderRef::VirtualFile(&file);
let rdr = BlockCursor::new(rdr);
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
let blob_read = rdr.read_blob(*offset, &ctx).await?;
let blob_read = rdr.read_blob(*offset).await?;
assert_eq!(
blob, &blob_read,
"mismatch for idx={idx} at offset={offset}"

View File

@@ -4,7 +4,6 @@
use super::ephemeral_file::EphemeralFile;
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::context::RequestContext;
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
use crate::virtual_file::VirtualFile;
use bytes::Bytes;
@@ -83,16 +82,12 @@ pub(crate) enum BlockReaderRef<'a> {
impl<'a> BlockReaderRef<'a> {
#[inline(always)]
async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, std::io::Error> {
async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
use BlockReaderRef::*;
match self {
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
EphemeralFile(r) => r.read_blk(blknum, ctx).await,
Adapter(r) => r.read_blk(blknum, ctx).await,
FileBlockReader(r) => r.read_blk(blknum).await,
EphemeralFile(r) => r.read_blk(blknum).await,
Adapter(r) => r.read_blk(blknum).await,
#[cfg(test)]
TestDisk(r) => r.read_blk(blknum),
#[cfg(test)]
@@ -110,13 +105,11 @@ impl<'a> BlockReaderRef<'a> {
///
/// ```no_run
/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
/// # use pageserver::context::RequestContext;
/// # let reader: FileBlockReader = unimplemented!("stub");
/// # let ctx: RequestContext = unimplemented!("stub");
/// let cursor = reader.block_cursor();
/// let buf = cursor.read_blk(1, &ctx);
/// let buf = cursor.read_blk(1);
/// // do stuff with 'buf'
/// let buf = cursor.read_blk(2, &ctx);
/// let buf = cursor.read_blk(2);
/// // do stuff with 'buf'
/// ```
///
@@ -141,12 +134,8 @@ impl<'a> BlockCursor<'a> {
/// access to the contents of the page. (For the page cache, the
/// lease object represents a lock on the buffer.)
#[inline(always)]
pub async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, std::io::Error> {
self.reader.read_blk(blknum, ctx).await
pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.reader.read_blk(blknum).await
}
}
@@ -180,15 +169,11 @@ impl FileBlockReader {
/// Returns a "lease" object that can be used to
/// access to the contents of the page. (For the page cache, the
/// lease object represents a lock on the buffer.)
pub async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, std::io::Error> {
pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
let cache = page_cache::get();
loop {
match cache
.read_immutable_buf(self.file_id, blknum, ctx)
.read_immutable_buf(self.file_id, blknum)
.await
.map_err(|e| {
std::io::Error::new(

View File

@@ -26,11 +26,7 @@ use std::{cmp::Ordering, io, result};
use thiserror::Error;
use tracing::error;
use crate::{
context::{DownloadBehavior, RequestContext},
task_mgr::TaskKind,
tenant::block_io::{BlockReader, BlockWriter},
};
use crate::tenant::block_io::{BlockReader, BlockWriter};
// The maximum size of a value stored in the B-tree. 5 bytes is enough currently.
pub const VALUE_SZ: usize = 5;
@@ -235,19 +231,14 @@ where
///
/// Read the value for given key. Returns the value, or None if it doesn't exist.
///
pub async fn get(&self, search_key: &[u8; L], ctx: &RequestContext) -> Result<Option<u64>> {
pub async fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
let mut result: Option<u64> = None;
self.visit(
search_key,
VisitDirection::Forwards,
|key, value| {
if key == search_key {
result = Some(value);
}
false
},
ctx,
)
self.visit(search_key, VisitDirection::Forwards, |key, value| {
if key == search_key {
result = Some(value);
}
false
})
.await?;
Ok(result)
}
@@ -262,7 +253,6 @@ where
search_key: &[u8; L],
dir: VisitDirection,
mut visitor: V,
ctx: &RequestContext,
) -> Result<bool>
where
V: FnMut(&[u8], u64) -> bool,
@@ -272,9 +262,7 @@ where
let block_cursor = self.reader.block_cursor();
while let Some((node_blknum, opt_iter)) = stack.pop() {
// Locate the node.
let node_buf = block_cursor
.read_blk(self.start_blk + node_blknum, ctx)
.await?;
let node_buf = block_cursor.read_blk(self.start_blk + node_blknum).await?;
let node = OnDiskNode::deparse(node_buf.as_ref())?;
let prefix_len = node.prefix_len as usize;
@@ -363,14 +351,13 @@ where
#[allow(dead_code)]
pub async fn dump(&self) -> Result<()> {
let mut stack = Vec::new();
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
stack.push((self.root_blk, String::new(), 0, 0, 0));
let block_cursor = self.reader.block_cursor();
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
let blk = block_cursor.read_blk(self.start_blk + blknum, &ctx).await?;
let blk = block_cursor.read_blk(self.start_blk + blknum).await?;
let buf: &[u8] = blk.as_ref();
let node = OnDiskNode::<L>::deparse(buf)?;
@@ -701,8 +688,6 @@ impl<const L: usize> BuildNode<L> {
#[cfg(test)]
pub(crate) mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
use rand::Rng;
use std::collections::BTreeMap;
@@ -740,8 +725,6 @@ pub(crate) mod tests {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let all_keys: Vec<&[u8; 6]> = vec![
b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb",
];
@@ -762,12 +745,12 @@ pub(crate) mod tests {
// Test the `get` function on all the keys.
for (key, val) in all_data.iter() {
assert_eq!(reader.get(key, &ctx).await?, Some(*val));
assert_eq!(reader.get(key).await?, Some(*val));
}
// And on some keys that don't exist
assert_eq!(reader.get(b"aaaaaa", &ctx).await?, None);
assert_eq!(reader.get(b"zzzzzz", &ctx).await?, None);
assert_eq!(reader.get(b"xaaabx", &ctx).await?, None);
assert_eq!(reader.get(b"aaaaaa").await?, None);
assert_eq!(reader.get(b"zzzzzz").await?, None);
assert_eq!(reader.get(b"xaaabx").await?, None);
// Test search with `visit` function
let search_key = b"xabaaa";
@@ -779,15 +762,10 @@ pub(crate) mod tests {
let mut data = Vec::new();
reader
.visit(
search_key,
VisitDirection::Forwards,
|key, value| {
data.push((key.to_vec(), value));
true
},
&ctx,
)
.visit(search_key, VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})
.await?;
assert_eq!(data, expected);
@@ -800,28 +778,18 @@ pub(crate) mod tests {
expected.reverse();
let mut data = Vec::new();
reader
.visit(
search_key,
VisitDirection::Backwards,
|key, value| {
data.push((key.to_vec(), value));
true
},
&ctx,
)
.visit(search_key, VisitDirection::Backwards, |key, value| {
data.push((key.to_vec(), value));
true
})
.await?;
assert_eq!(data, expected);
// Backward scan where nothing matches
reader
.visit(
b"aaaaaa",
VisitDirection::Backwards,
|key, value| {
panic!("found unexpected key {}: {}", hex::encode(key), value);
},
&ctx,
)
.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
panic!("found unexpected key {}: {}", hex::encode(key), value);
})
.await?;
// Full scan
@@ -831,15 +799,10 @@ pub(crate) mod tests {
.collect();
let mut data = Vec::new();
reader
.visit(
&[0u8; 6],
VisitDirection::Forwards,
|key, value| {
data.push((key.to_vec(), value));
true
},
&ctx,
)
.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})
.await?;
assert_eq!(data, expected);
@@ -850,7 +813,6 @@ pub(crate) mod tests {
async fn lots_of_keys() -> Result<()> {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
const NUM_KEYS: u64 = 1000;
@@ -889,14 +851,14 @@ pub(crate) mod tests {
for search_key_int in 0..(NUM_KEYS * 2 + 10) {
let search_key = u64::to_be_bytes(search_key_int);
assert_eq!(
reader.get(&search_key, &ctx).await?,
reader.get(&search_key).await?,
all_data.get(&search_key_int).cloned()
);
// Test a forward scan starting with this key
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Forwards, take_ten, &ctx)
.visit(&search_key, VisitDirection::Forwards, take_ten)
.await?;
let expected = all_data
.range(search_key_int..)
@@ -908,7 +870,7 @@ pub(crate) mod tests {
// And a backwards scan
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Backwards, take_ten, &ctx)
.visit(&search_key, VisitDirection::Backwards, take_ten)
.await?;
let expected = all_data
.range(..=search_key_int)
@@ -924,7 +886,7 @@ pub(crate) mod tests {
limit.store(usize::MAX, Ordering::Relaxed);
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Forwards, take_ten, &ctx)
.visit(&search_key, VisitDirection::Forwards, take_ten)
.await?;
let expected = all_data
.iter()
@@ -937,7 +899,7 @@ pub(crate) mod tests {
limit.store(usize::MAX, Ordering::Relaxed);
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Backwards, take_ten, &ctx)
.visit(&search_key, VisitDirection::Backwards, take_ten)
.await?;
let expected = all_data
.iter()
@@ -951,8 +913,6 @@ pub(crate) mod tests {
#[tokio::test]
async fn random_data() -> Result<()> {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
// Generate random keys with exponential distribution, to
// exercise the prefix compression
const NUM_KEYS: usize = 100000;
@@ -979,24 +939,22 @@ pub(crate) mod tests {
// Test get() operation on all the keys
for (&key, &val) in all_data.iter() {
let search_key = u128::to_be_bytes(key);
assert_eq!(reader.get(&search_key, &ctx).await?, Some(val));
assert_eq!(reader.get(&search_key).await?, Some(val));
}
// Test get() operations on random keys, most of which will not exist
for _ in 0..100000 {
let key_int = rand::thread_rng().gen::<u128>();
let search_key = u128::to_be_bytes(key_int);
assert!(reader.get(&search_key, &ctx).await? == all_data.get(&key_int).cloned());
assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned());
}
// Test boundary cases
assert!(
reader.get(&u128::to_be_bytes(u128::MIN), &ctx).await?
== all_data.get(&u128::MIN).cloned()
reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned()
);
assert!(
reader.get(&u128::to_be_bytes(u128::MAX), &ctx).await?
== all_data.get(&u128::MAX).cloned()
reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned()
);
Ok(())
@@ -1027,7 +985,6 @@ pub(crate) mod tests {
// Build a tree from it
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
for (key, val) in disk_btree_test_data::TEST_DATA {
writer.append(&key, val)?;
@@ -1040,21 +997,16 @@ pub(crate) mod tests {
// Test get() operation on all the keys
for (key, val) in disk_btree_test_data::TEST_DATA {
assert_eq!(reader.get(&key, &ctx).await?, Some(val));
assert_eq!(reader.get(&key).await?, Some(val));
}
// Test full scan
let mut count = 0;
reader
.visit(
&[0u8; 26],
VisitDirection::Forwards,
|_key, _value| {
count += 1;
true
},
&ctx,
)
.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
count += 1;
true
})
.await?;
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

View File

@@ -2,7 +2,6 @@
//! used to keep in-memory layers spilled on disk.
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::VirtualFile;
@@ -62,17 +61,13 @@ impl EphemeralFile {
self.len
}
pub(crate) async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
if flushed_blknums.contains(&(blknum as u64)) {
let cache = page_cache::get();
loop {
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.read_immutable_buf(self.page_cache_file_id, blknum)
.await
.map_err(|e| {
std::io::Error::new(
@@ -108,11 +103,7 @@ impl EphemeralFile {
}
}
pub(crate) async fn write_blob(
&mut self,
srcbuf: &[u8],
ctx: &RequestContext,
) -> Result<u64, io::Error> {
pub(crate) async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
struct Writer<'a> {
ephemeral_file: &'a mut EphemeralFile,
/// The block to which the next [`push_bytes`] will write.
@@ -129,11 +120,7 @@ impl EphemeralFile {
})
}
#[inline(always)]
async fn push_bytes(
&mut self,
src: &[u8],
ctx: &RequestContext,
) -> Result<(), io::Error> {
async fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
let mut src_remaining = src;
while !src_remaining.is_empty() {
let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
@@ -159,7 +146,6 @@ impl EphemeralFile {
.read_immutable_buf(
self.ephemeral_file.page_cache_file_id,
self.blknum,
ctx,
)
.await
{
@@ -213,15 +199,15 @@ impl EphemeralFile {
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
writer.push_bytes(&len_buf, ctx).await?;
writer.push_bytes(&len_buf).await?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
writer.push_bytes(&len_buf, ctx).await?;
writer.push_bytes(&len_buf).await?;
}
// Write the payload
writer.push_bytes(srcbuf, ctx).await?;
writer.push_bytes(srcbuf).await?;
if srcbuf.len() < 0x80 {
self.len += 1;
@@ -275,8 +261,6 @@ impl BlockReader for EphemeralFile {
#[cfg(test)]
mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
use rand::{thread_rng, RngCore};
use std::fs;
@@ -284,15 +268,7 @@ mod tests {
fn harness(
test_name: &str,
) -> Result<
(
&'static PageServerConf,
TenantId,
TimelineId,
RequestContext,
),
io::Error,
> {
) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
let conf = PageServerConf::dummy_conf(repo_dir);
@@ -304,57 +280,46 @@ mod tests {
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?;
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
Ok((conf, tenant_id, timeline_id, ctx))
Ok((conf, tenant_id, timeline_id))
}
#[tokio::test]
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
let pos_foo = file.write_blob(b"foo", &ctx).await?;
let pos_foo = file.write_blob(b"foo").await?;
assert_eq!(
b"foo",
file.block_cursor()
.read_blob(pos_foo, &ctx)
.await?
.as_slice()
file.block_cursor().read_blob(pos_foo).await?.as_slice()
);
let pos_bar = file.write_blob(b"bar", &ctx).await?;
let pos_bar = file.write_blob(b"bar").await?;
assert_eq!(
b"foo",
file.block_cursor()
.read_blob(pos_foo, &ctx)
.await?
.as_slice()
file.block_cursor().read_blob(pos_foo).await?.as_slice()
);
assert_eq!(
b"bar",
file.block_cursor()
.read_blob(pos_bar, &ctx)
.await?
.as_slice()
file.block_cursor().read_blob(pos_bar).await?.as_slice()
);
let mut blobs = Vec::new();
for i in 0..10000 {
let data = Vec::from(format!("blob{}", i).as_bytes());
let pos = file.write_blob(&data, &ctx).await?;
let pos = file.write_blob(&data).await?;
blobs.push((pos, data));
}
// also test with a large blobs
for i in 0..100 {
let data = format!("blob{}", i).as_bytes().repeat(100);
let pos = file.write_blob(&data, &ctx).await?;
let pos = file.write_blob(&data).await?;
blobs.push((pos, data));
}
let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
for (pos, expected) in blobs {
let actual = cursor.read_blob(pos, &ctx).await?;
let actual = cursor.read_blob(pos).await?;
assert_eq!(actual, expected);
}
@@ -362,8 +327,8 @@ mod tests {
let mut large_data = Vec::new();
large_data.resize(20000, 0);
thread_rng().fill_bytes(&mut large_data);
let pos_large = file.write_blob(&large_data, &ctx).await?;
let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
let pos_large = file.write_blob(&large_data).await?;
let result = file.block_cursor().read_blob(pos_large).await?;
assert_eq!(result, large_data);
Ok(())

View File

@@ -20,10 +20,7 @@ use utils::crashsafe;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::control_plane_client::{
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::control_plane_client::ControlPlaneClient;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::delete::DeleteTenantFlow;
@@ -119,23 +116,7 @@ pub async fn init_tenant_mgr(
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
let result = match client.re_attach().await {
Ok(tenants) => tenants,
Err(RetryForeverError::ShuttingDown) => {
anyhow::bail!("Shut down while waiting for control plane re-attach response")
}
};
// The deletion queue needs to know about the startup attachment state to decide which (if any) stored
// deletion list entries may still be valid. We provide that by pushing a recovery operation into
// the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
// are processed, even though we don't block on recovery completing here.
resources
.deletion_queue_client
.recover(result.clone())
.await?;
Some(result)
Some(client.re_attach().await?)
} else {
info!("Control plane API not configured, tenant generations are disabled");
None
@@ -304,21 +285,29 @@ pub(crate) fn schedule_local_tenant_processing(
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
if resources.remote_storage.is_none() {
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
Tenant::create_broken_tenant(
if let Some(remote_storage) = resources.remote_storage {
match Tenant::spawn_attach(
conf,
tenant_id,
"attaching mark file present but no remote storage configured".to_string(),
)
} else {
match Tenant::spawn_attach(conf, tenant_id, generation, resources, tenants, ctx) {
generation,
resources.broker_client,
tenants,
remote_storage,
ctx,
) {
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
}
}
} else {
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
Tenant::create_broken_tenant(
conf,
tenant_id,
"attaching mark file present but no remote storage configured".to_string(),
)
}
} else {
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
@@ -449,7 +438,8 @@ pub async fn create_tenant(
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
generation: Generation,
resources: TenantSharedResources,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
@@ -460,9 +450,13 @@ pub async fn create_tenant(
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
let tenant_resources = TenantSharedResources {
broker_client,
remote_storage,
};
let created_tenant =
schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
generation, resources, None, &TENANTS, ctx)?;
generation, tenant_resources, None, &TENANTS, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
@@ -628,7 +622,6 @@ pub async fn load_tenant(
generation: Generation,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: DeletionQueueClient,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
@@ -642,7 +635,6 @@ pub async fn load_tenant(
let resources = TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client
};
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx)
.with_context(|| {
@@ -710,7 +702,8 @@ pub async fn attach_tenant(
tenant_id: TenantId,
generation: Generation,
tenant_conf: TenantConfOpt,
resources: TenantSharedResources,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: GenericRemoteStorage,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
@@ -725,7 +718,10 @@ pub async fn attach_tenant(
.context("check for attach marker file existence")?;
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
let resources = TenantSharedResources {
broker_client,
remote_storage: Some(remote_storage),
};
let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233

View File

@@ -116,12 +116,8 @@
//! # Completion
//!
//! Once an operation has completed, we update
//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
//! and submit a request through the DeletionQueue to update
//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
//! validated that our generation is not stale. It is this visible value
//! that is advertized to safekeepers as a signal that that they can
//! delete the WAL up to that LSN.
//! [`UploadQueueInitialized::last_uploaded_consistent_lsn`] which indicates
//! to safekeepers that they can delete the WAL up to that LSN.
//!
//! The [`RemoteTimelineClient::wait_completion`] method can be used to wait
//! for all pending operations to complete. It does not prevent more
@@ -204,6 +200,7 @@
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
mod delete;
mod download;
pub mod index;
mod upload;
@@ -229,7 +226,6 @@ use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -328,8 +324,6 @@ pub struct RemoteTimelineClient {
metrics: Arc<RemoteTimelineClientMetrics>,
storage_impl: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
}
impl RemoteTimelineClient {
@@ -341,7 +335,6 @@ impl RemoteTimelineClient {
///
pub fn new(
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
conf: &'static PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -359,7 +352,6 @@ impl RemoteTimelineClient {
timeline_id,
generation,
storage_impl: remote_storage,
deletion_queue_client,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
}
@@ -421,24 +413,13 @@ impl RemoteTimelineClient {
Ok(())
}
pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
match &mut *self.upload_queue.lock().unwrap() {
pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
match &*self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
UploadQueue::Stopped(q) => q
.upload_queue_for_deletion
.get_last_remote_consistent_lsn_projected(),
}
}
pub fn remote_consistent_lsn_visible(&self) -> Option<Lsn> {
match &mut *self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
UploadQueue::Stopped(q) => Some(
q.upload_queue_for_deletion
.get_last_remote_consistent_lsn_visible(),
),
UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
UploadQueue::Stopped(q) => {
Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
}
}
}
@@ -662,7 +643,7 @@ impl RemoteTimelineClient {
/// successfully.
pub fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: Vec<LayerFileName>,
names: &[LayerFileName],
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -682,10 +663,10 @@ impl RemoteTimelineClient {
// Decorate our list of names with each name's generation, dropping
// makes that are unexpectedly missing from our metadata.
let with_generations: Vec<_> = names
.into_iter()
.iter()
.filter_map(|name| {
// Remove from latest_files, learning the file's remote generation in the process
let meta = upload_queue.latest_files.remove(&name);
let meta = upload_queue.latest_files.remove(name);
if let Some(meta) = meta {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -707,16 +688,18 @@ impl RemoteTimelineClient {
self.schedule_index_upload(upload_queue, metadata);
}
for (name, gen) in &with_generations {
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
}
// schedule the actual deletions
let op = UploadOp::Delete(Delete {
layers: with_generations,
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
for (name, generation) in with_generations {
let op = UploadOp::Delete(Delete {
file_kind: RemoteOpFileKind::Layer,
layer_file_name: name.clone(),
scheduled_from_timeline_delete: false,
generation,
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!("scheduled layer file deletion {name}");
}
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
@@ -850,7 +833,9 @@ impl RemoteTimelineClient {
pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_and_timeline_id();
let layers: Vec<RemotePath> = {
let (mut receiver, deletions_queued) = {
let mut deletions_queued = 0;
let mut locked = self.upload_queue.lock().unwrap();
let stopped = locked.stopped_mut()?;
@@ -862,30 +847,42 @@ impl RemoteTimelineClient {
stopped
.upload_queue_for_deletion
.latest_files
.drain()
.map(|(file_name, meta)| {
remote_layer_path(
&self.tenant_id,
&self.timeline_id,
&file_name,
meta.generation,
)
})
.collect()
.queued_operations
.reserve(stopped.upload_queue_for_deletion.latest_files.len());
// schedule the actual deletions
for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
let op = UploadOp::Delete(Delete {
file_kind: RemoteOpFileKind::Layer,
layer_file_name: name.clone(),
scheduled_from_timeline_delete: true,
generation: meta.generation,
});
self.calls_unfinished_metric_begin(&op);
stopped
.upload_queue_for_deletion
.queued_operations
.push_back(op);
info!("scheduled layer file deletion {name}");
deletions_queued += 1;
}
self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
(
self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
deletions_queued,
)
};
let layer_deletion_count = layers.len();
self.deletion_queue_client.push_immediate(layers).await?;
receiver.changed().await.context("upload queue shut down")?;
// Do not delete index part yet, it is needed for possible retry. If we remove it first
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
// taking the burden of listing all the layers that we already know we should delete.
self.deletion_queue_client.flush_immediate().await?;
let remaining = backoff::retry(
|| async {
self.storage_impl
@@ -913,9 +910,17 @@ impl RemoteTimelineClient {
})
.collect();
let not_referenced_count = remaining.len();
if !remaining.is_empty() {
self.deletion_queue_client.push_immediate(remaining).await?;
backoff::retry(
|| async { self.storage_impl.delete_objects(&remaining).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_objects",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
)
.await
.context("delete_objects")?;
}
fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -926,14 +931,18 @@ impl RemoteTimelineClient {
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
debug!("enqueuing index part deletion");
self.deletion_queue_client
.push_immediate([index_file_path].to_vec())
.await?;
debug!("deleting index part");
// Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
self.deletion_queue_client.flush_immediate().await?;
backoff::retry(
|| async { self.storage_impl.delete(&index_file_path).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_index",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
)
.await
.context("delete_index")?;
fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(
@@ -941,7 +950,7 @@ impl RemoteTimelineClient {
))?
});
info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
Ok(())
}
@@ -1131,16 +1140,21 @@ impl RemoteTimelineClient {
}
res
}
UploadOp::Delete(delete) => self
.deletion_queue_client
.push_layers(
self.tenant_id,
self.timeline_id,
self.generation,
delete.layers.clone(),
)
.await
.map_err(|e| anyhow::anyhow!(e)),
UploadOp::Delete(delete) => {
let path = &self
.conf
.timeline_path(&self.tenant_id, &self.timeline_id)
.join(delete.layer_file_name.file_name());
delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
delete.file_kind,
RemoteOpKind::Delete,
Arc::clone(&self.metrics),
)
.await
}
UploadOp::Barrier(_) => {
// unreachable. Barrier operations are handled synchronously in
// launch_queued_tasks
@@ -1161,11 +1175,7 @@ impl RemoteTimelineClient {
// at info level at first, and only WARN if the operation fails repeatedly.
//
// (See similar logic for downloads in `download::download_retry`)
let is_simulated = cfg!(feature = "testing")
&& e.root_cause().is::<remote_storage::SimulatedError>();
if retries < FAILED_UPLOAD_WARN_THRESHOLD || is_simulated {
if retries < FAILED_UPLOAD_WARN_THRESHOLD {
info!(
"failed to perform remote task {}, will retry (attempt {}): {:#}",
task.op, retries, e
@@ -1200,12 +1210,18 @@ impl RemoteTimelineClient {
}
// The task has completed successfully. Remove it from the in-progress list.
let lsn_update = {
{
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
let upload_queue = match upload_queue_guard.deref_mut() {
UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
UploadQueue::Stopped(_stopped) => {
None
UploadQueue::Stopped(stopped) => {
// Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
// then stop() took care of it so we just return.
// For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
match &task.op {
UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
_ => None
}
},
UploadQueue::Initialized(qi) => { Some(qi) }
};
@@ -1220,51 +1236,23 @@ impl RemoteTimelineClient {
upload_queue.inprogress_tasks.remove(&task.task_id);
let lsn_update = match task.op {
match task.op {
UploadOp::UploadLayer(_, _) => {
upload_queue.num_inprogress_layer_uploads -= 1;
None
}
UploadOp::UploadMetadata(_, lsn) => {
upload_queue.num_inprogress_metadata_uploads -= 1;
// XXX monotonicity check?
upload_queue.projected_remote_consistent_lsn = Some(lsn);
if self.generation.is_none() {
// Legacy mode: skip validating generation
upload_queue.visible_remote_consistent_lsn.store(lsn);
None
} else {
Some((lsn, upload_queue.visible_remote_consistent_lsn.clone()))
}
upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
}
UploadOp::Delete(_) => {
upload_queue.num_inprogress_deletions -= 1;
None
}
UploadOp::Barrier(_) => unreachable!(),
};
// Launch any queued tasks that were unblocked by this one.
self.launch_queued_tasks(upload_queue);
lsn_update
};
if let Some((lsn, slot)) = lsn_update {
// Updates to the remote_consistent_lsn we advertise to pageservers
// are all routed through the DeletionQueue, to enforce important
// data safety guarantees (see docs/rfcs/025-generation-numbers.md)
self.deletion_queue_client
.update_remote_consistent_lsn(
self.tenant_id,
self.timeline_id,
self.generation,
lsn,
slot,
)
.await;
}
self.calls_unfinished_metric_end(&task.op);
}
@@ -1290,8 +1278,8 @@ impl RemoteTimelineClient {
reason: "metadata uploads are tiny",
},
),
UploadOp::Delete(_delete) => (
RemoteOpFileKind::Layer,
UploadOp::Delete(delete) => (
delete.file_kind,
RemoteOpKind::Delete,
DontTrackSize {
reason: "should we track deletes? positive or negative sign?",
@@ -1353,10 +1341,7 @@ impl RemoteTimelineClient {
latest_files: initialized.latest_files.clone(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: initialized.latest_metadata.clone(),
projected_remote_consistent_lsn: None,
visible_remote_consistent_lsn: initialized
.visible_remote_consistent_lsn
.clone(),
last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
@@ -1420,13 +1405,13 @@ pub fn remote_layer_path(
tenant_id: &TenantId,
timeline_id: &TimelineId,
layer_file_name: &LayerFileName,
generation: Generation,
layer_meta: &LayerFileMetadata,
) -> RemotePath {
// Generation-aware key format
let path = format!(
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
layer_file_name.file_name(),
generation.get_suffix()
layer_meta.generation.get_suffix()
);
RemotePath::from_string(&path).expect("Failed to construct path")
@@ -1569,6 +1554,7 @@ mod tests {
impl TestSetup {
async fn new(test_name: &str) -> anyhow::Result<Self> {
// Use a current-thread runtime in the test
let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
let harness = TenantHarness::create(test_name)?;
let (tenant, ctx) = harness.load().await;
@@ -1594,7 +1580,6 @@ mod tests {
timeline_id: TIMELINE_ID,
generation,
storage_impl: self.harness.remote_storage.clone(),
deletion_queue_client: self.harness.deletion_queue.new_client(),
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&self.harness.tenant_id,
@@ -1764,7 +1749,7 @@ mod tests {
)
.unwrap();
client
.schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
.schedule_layer_file_deletion(&[layer_file_name_1.clone()])
.unwrap();
{
let mut guard = client.upload_queue.lock().unwrap();
@@ -1790,7 +1775,6 @@ mod tests {
// Finish them
client.wait_completion().await.unwrap();
harness.deletion_queue.pump().await;
assert_remote_files(
&[

View File

@@ -0,0 +1,34 @@
//! Helper functions to delete files from remote storage with a RemoteStorage
use anyhow::Context;
use std::path::Path;
use tracing::debug;
use remote_storage::GenericRemoteStorage;
use crate::{
config::PageServerConf,
tenant::{remote_timeline_client::remote_path, Generation},
};
pub(super) async fn delete_layer<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
local_layer_path: &'a Path,
generation: Generation,
) -> anyhow::Result<()> {
fail::fail_point!("before-delete-layer", |_| {
anyhow::bail!("failpoint before-delete-layer")
});
debug!("Deleting layer from remote storage: {local_layer_path:?}",);
let path_to_delete = remote_path(conf, local_layer_path, generation)?;
// We don't want to print an error if the delete failed if the file has
// already been deleted. Thankfully, in this situation S3 already
// does not yield an error. While OS-provided local file system APIs do yield
// errors, we avoid them in the `LocalFs` wrapper.
storage
.delete(&path_to_delete)
.await
.with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
}

View File

@@ -50,12 +50,7 @@ pub async fn download_layer_file<'a>(
.timeline_path(&tenant_id, &timeline_id)
.join(layer_file_name.file_name());
let remote_path = remote_layer_path(
&tenant_id,
&timeline_id,
layer_file_name,
layer_metadata.generation,
);
let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:

View File

@@ -28,7 +28,7 @@
//! "values" part.
//!
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
@@ -317,11 +317,11 @@ impl DeltaLayer {
tree_reader.dump().await?;
let keys = DeltaLayerInner::load_keys(&inner, ctx).await?;
let keys = DeltaLayerInner::load_keys(&inner).await?;
// A subroutine to dump a single blob
async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> Result<String> {
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
async fn dump_blob(val: ValueRef<'_>) -> Result<String> {
let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
let val = Value::des(&buf)?;
let desc = match val {
Value::Image(img) => {
@@ -342,7 +342,7 @@ impl DeltaLayer {
for entry in keys {
let DeltaEntry { key, lsn, val, .. } = entry;
let desc = match dump_blob(val, ctx).await {
let desc = match dump_blob(val).await {
Ok(desc) => desc,
Err(err) => {
let err: anyhow::Error = err;
@@ -370,7 +370,7 @@ impl DeltaLayer {
.load(LayerAccessKind::GetValueReconstructData, ctx)
.await?;
inner
.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
.get_value_reconstruct_data(key, lsn_range, reconstruct_state)
.await
}
@@ -453,12 +453,12 @@ impl DeltaLayer {
self.access_stats.record_access(access_kind, ctx);
// Quick exit if already loaded
self.inner
.get_or_try_init(|| self.load_inner(ctx))
.get_or_try_init(|| self.load_inner())
.await
.with_context(|| format!("Failed to load delta layer {}", self.path().display()))
}
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
let path = self.path();
let summary = match &self.path_or_conf {
@@ -466,7 +466,7 @@ impl DeltaLayer {
PathOrConf::Path(_) => None,
};
let loaded = DeltaLayerInner::load(&path, summary, ctx).await?;
let loaded = DeltaLayerInner::load(&path, summary).await?;
if let PathOrConf::Path(ref path) = self.path_or_conf {
// not production code
@@ -554,7 +554,7 @@ impl DeltaLayer {
.load(LayerAccessKind::KeyIter, ctx)
.await
.context("load delta layer keys")?;
DeltaLayerInner::load_keys(inner, ctx)
DeltaLayerInner::load_keys(inner)
.await
.context("Layer index is corrupted")
}
@@ -849,14 +849,13 @@ impl DeltaLayerInner {
pub(super) async fn load(
path: &std::path::Path,
summary: Option<Summary>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.await
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0, ctx).await?;
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
if let Some(mut expected_summary) = summary {
@@ -884,7 +883,6 @@ impl DeltaLayerInner {
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
let mut need_image = true;
// Scan the page versions backwards, starting from `lsn`.
@@ -899,38 +897,27 @@ impl DeltaLayerInner {
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
tree_reader
.visit(
&search_key.0,
VisitDirection::Backwards,
|key, value| {
let blob_ref = BlobRef(value);
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
return false;
}
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
if entry_lsn < lsn_range.start {
return false;
}
offsets.push((entry_lsn, blob_ref.pos()));
.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
let blob_ref = BlobRef(value);
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
return false;
}
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
if entry_lsn < lsn_range.start {
return false;
}
offsets.push((entry_lsn, blob_ref.pos()));
!blob_ref.will_init()
},
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
.build(),
)
!blob_ref.will_init()
})
.await?;
let ctx = &RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerValue)
.build();
// Ok, 'offsets' now contains the offsets of all the entries we need to read
let cursor = file.block_cursor();
let mut buf = Vec::new();
for (entry_lsn, pos) in offsets {
cursor
.read_blob_into_buf(pos, &mut buf, ctx)
.read_blob_into_buf(pos, &mut buf)
.await
.with_context(|| {
format!(
@@ -971,10 +958,9 @@ impl DeltaLayerInner {
}
}
pub(super) async fn load_keys<'a, 'b, T: AsRef<DeltaLayerInner> + Clone>(
this: &'a T,
ctx: &'b RequestContext,
) -> Result<Vec<DeltaEntry<'a>>> {
pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
this: &T,
) -> Result<Vec<DeltaEntry<'_>>> {
let dl = this.as_ref();
let file = &dl.file;
@@ -1011,9 +997,6 @@ impl DeltaLayerInner {
all_keys.push(entry);
true
},
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
.build(),
)
.await?;
if let Some(last) = all_keys.last_mut() {
@@ -1043,9 +1026,9 @@ pub struct ValueRef<'a> {
impl<'a> ValueRef<'a> {
/// Loads the value from disk
pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
pub async fn load(&self) -> Result<Value> {
// theoretically we *could* record an access time for each, but it does not really matter
let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
let buf = self.reader.read_blob(self.blob_ref.pos()).await?;
let val = Value::des(&buf)?;
Ok(val)
}
@@ -1054,11 +1037,7 @@ impl<'a> ValueRef<'a> {
pub(crate) struct Adapter<T>(T);
impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
pub(crate) async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, std::io::Error> {
self.0.as_ref().file.read_blk(blknum, ctx).await
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.0.as_ref().file.read_blk(blknum).await
}
}

View File

@@ -24,7 +24,7 @@
//! mapping from Key to an offset in the "values" part. The
//! actual page images are stored in the "values" part.
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
@@ -237,15 +237,10 @@ impl ImageLayer {
tree_reader.dump().await?;
tree_reader
.visit(
&[0u8; KEY_SIZE],
VisitDirection::Forwards,
|key, value| {
println!("key: {} offset {}", hex::encode(key), value);
true
},
ctx,
)
.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
println!("key: {} offset {}", hex::encode(key), value);
true
})
.await?;
Ok(())
@@ -266,7 +261,7 @@ impl ImageLayer {
.load(LayerAccessKind::GetValueReconstructData, ctx)
.await?;
inner
.get_value_reconstruct_data(key, reconstruct_state, ctx)
.get_value_reconstruct_data(key, reconstruct_state)
.await
// FIXME: makes no sense to dump paths
.with_context(|| format!("read {}", self.path().display()))
@@ -340,12 +335,12 @@ impl ImageLayer {
) -> Result<&ImageLayerInner> {
self.access_stats.record_access(access_kind, ctx);
self.inner
.get_or_try_init(|| self.load_inner(ctx))
.get_or_try_init(|| self.load_inner())
.await
.with_context(|| format!("Failed to load image layer {}", self.path().display()))
}
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
async fn load_inner(&self) -> Result<ImageLayerInner> {
let path = self.path();
let expected_summary = match &self.path_or_conf {
@@ -354,8 +349,7 @@ impl ImageLayer {
};
let loaded =
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx)
.await?;
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary).await?;
if let PathOrConf::Path(ref path) = self.path_or_conf {
// not production code
@@ -442,13 +436,12 @@ impl ImageLayerInner {
path: &std::path::Path,
lsn: Lsn,
summary: Option<Summary>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.await
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0, ctx).await?;
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
if let Some(mut expected_summary) = summary {
@@ -477,30 +470,16 @@ impl ImageLayerInner {
&self,
key: Key,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
let file = &self.file;
let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
if let Some(offset) = tree_reader
.get(
&keybuf,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
.build(),
)
.await?
{
if let Some(offset) = tree_reader.get(&keybuf).await? {
let blob = file
.block_cursor()
.read_blob(
offset,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerValue)
.build(),
)
.read_blob(offset)
.await
.with_context(|| format!("failed to read value from offset {}", offset))?;
let value = Bytes::from(blob);

View File

@@ -5,7 +5,7 @@
//! its position in the file, is kept in memory, though.
//!
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::context::RequestContext;
use crate::repository::{Key, Value};
use crate::tenant::block_io::BlockReader;
use crate::tenant::ephemeral_file::EphemeralFile;
@@ -106,7 +106,7 @@ impl InMemoryLayer {
/// debugging function to print out the contents of the layer
///
/// this is likely completly unused
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
pub async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
let inner = self.inner.read().await;
let end_str = self.end_lsn_or_max();
@@ -125,7 +125,7 @@ impl InMemoryLayer {
for (key, vec_map) in inner.index.iter() {
for (lsn, pos) in vec_map.as_slice() {
let mut desc = String::new();
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
cursor.read_blob_into_buf(*pos, &mut buf).await?;
let val = Value::des(&buf);
match val {
Ok(Value::Image(img)) => {
@@ -158,15 +158,11 @@ impl InMemoryLayer {
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
_ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start >= self.start_lsn);
let mut need_image = true;
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
let inner = self.inner.read().await;
let reader = inner.file.block_cursor();
@@ -175,7 +171,7 @@ impl InMemoryLayer {
if let Some(vec_map) = inner.index.get(&key) {
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
let buf = reader.read_blob(*pos, &ctx).await?;
let buf = reader.read_blob(*pos).await?;
let value = Value::des(&buf)?;
match value {
Value::Image(img) => {
@@ -267,13 +263,7 @@ impl InMemoryLayer {
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub async fn put_value(
&self,
key: Key,
lsn: Lsn,
val: &Value,
ctx: &RequestContext,
) -> Result<()> {
pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
let inner: &mut _ = &mut *self.inner.write().await;
self.assert_writable();
@@ -285,15 +275,7 @@ impl InMemoryLayer {
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
buf.clear();
val.ser_into(&mut buf)?;
inner
.file
.write_blob(
&buf,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build(),
)
.await?
inner.file.write_blob(&buf).await?
};
let vec_map = inner.index.entry(key).or_default();
@@ -331,7 +313,7 @@ impl InMemoryLayer {
/// Write this frozen in-memory layer to disk.
///
/// Returns a new delta layer with all the same data as this in-memory layer
pub(crate) async fn write_to_disk(&self, ctx: &RequestContext) -> Result<DeltaLayer> {
pub(crate) async fn write_to_disk(&self) -> Result<DeltaLayer> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
@@ -361,14 +343,11 @@ impl InMemoryLayer {
let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
keys.sort_by_key(|k| k.0);
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
for (key, vec_map) in keys.iter() {
let key = **key;
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
cursor.read_blob_into_buf(*pos, &mut buf).await?;
let will_init = Value::des(&buf)?.will_init();
delta_layer_writer
.put_value_bytes(key, *lsn, &buf, will_init)

View File

@@ -38,7 +38,6 @@ use std::time::{Duration, Instant, SystemTime};
use crate::context::{
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
use crate::tenant::storage_layer::{
@@ -144,7 +143,6 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: Option<RemoteTimelineClient>,
pub deletion_queue_client: DeletionQueueClient,
}
pub struct Timeline {
@@ -473,7 +471,7 @@ impl Timeline {
// The cached image can be returned directly if there is no WAL between the cached image
// and requested LSN. The cached image can also be used to reduce the amount of WAL needed
// for redo.
let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await {
let cached_page_img = match self.lookup_cached_page(&key, lsn).await {
Some((cached_lsn, cached_img)) => {
match cached_lsn.cmp(&lsn) {
Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
@@ -523,23 +521,9 @@ impl Timeline {
self.disk_consistent_lsn.load()
}
/// remote_consistent_lsn from the perspective of the tenant's current generation,
/// not validated with control plane yet.
/// See [`Self::get_remote_consistent_lsn_visible`].
pub fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
pub fn get_remote_consistent_lsn(&self) -> Option<Lsn> {
if let Some(remote_client) = &self.remote_client {
remote_client.remote_consistent_lsn_projected()
} else {
None
}
}
/// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
/// i.e. a value of remote_consistent_lsn_projected which has undergone
/// generation validation in the deletion queue.
pub fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
if let Some(remote_client) = &self.remote_client {
remote_client.remote_consistent_lsn_visible()
remote_client.last_uploaded_consistent_lsn()
} else {
None
}
@@ -1836,7 +1820,7 @@ impl Timeline {
for (layer, m) in needs_upload {
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
}
rtc.schedule_layer_file_deletion(needs_cleanup)?;
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
rtc.schedule_index_upload_for_file_changes()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
@@ -2534,18 +2518,13 @@ impl Timeline {
}
}
async fn lookup_cached_page(
&self,
key: &Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Option<(Lsn, Bytes)> {
async fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> {
let cache = page_cache::get();
// FIXME: It's pointless to check the cache for things that are not 8kB pages.
// We should look at the key to determine if it's a cacheable object
let (lsn, read_guard) = cache
.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn, ctx)
.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)
.await?;
let img = Bytes::from(read_guard.to_vec());
Some((lsn, img))
@@ -2579,16 +2558,10 @@ impl Timeline {
Ok(layer)
}
async fn put_value(
&self,
key: Key,
lsn: Lsn,
val: &Value,
ctx: &RequestContext,
) -> anyhow::Result<()> {
async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
//info!("PUT: key {} at {}", key, lsn);
let layer = self.get_layer_for_write(lsn).await?;
layer.put_value(key, lsn, val, ctx).await?;
layer.put_value(key, lsn, val).await?;
Ok(())
}
@@ -2760,7 +2733,7 @@ impl Timeline {
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
let layer = self.create_delta_layer(&frozen_layer).await?;
(
HashMap::from([(
layer.filename(),
@@ -2883,21 +2856,19 @@ impl Timeline {
async fn create_delta_layer(
self: &Arc<Self>,
frozen_layer: &Arc<InMemoryLayer>,
ctx: &RequestContext,
) -> anyhow::Result<DeltaLayer> {
let span = tracing::info_span!("blocking");
let new_delta: DeltaLayer = tokio::task::spawn_blocking({
let _g = span.entered();
let self_clone = Arc::clone(self);
let frozen_layer = Arc::clone(frozen_layer);
let ctx = ctx.attached_child();
move || {
// Write it out
// Keep this inside `spawn_blocking` and `Handle::current`
// as long as the write path is still sync and the read impl
// is still not fully async. Otherwise executor threads would
// be blocked.
let new_delta = Handle::current().block_on(frozen_layer.write_to_disk(&ctx))?;
let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?;
let new_delta_path = new_delta.path();
// Sync it to disk.
@@ -3603,7 +3574,7 @@ impl Timeline {
key, lsn, ref val, ..
} in all_values_iter
{
let value = val.load(ctx).await?;
let value = val.load().await?;
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
// We need to check key boundaries once we reach next key or end of layer with the same key
if !same_key || lsn == dup_end_lsn {
@@ -3891,7 +3862,7 @@ impl Timeline {
// Also schedule the deletions in remote storage
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
}
Ok(())
@@ -4226,7 +4197,7 @@ impl Timeline {
}
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
}
apply.flush();
@@ -4728,14 +4699,8 @@ impl<'a> TimelineWriter<'a> {
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
pub async fn put(
&self,
key: Key,
lsn: Lsn,
value: &Value,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.tl.put_value(key, lsn, value, ctx).await
pub async fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> {
self.tl.put_value(key, lsn, value).await
}
pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {

View File

@@ -14,7 +14,6 @@ use utils::{
use crate::{
config::PageServerConf,
deletion_queue::DeletionQueueClient,
task_mgr::{self, TaskKind},
tenant::{
metadata::TimelineMetadata,
@@ -408,7 +407,6 @@ impl DeleteTimelineFlow {
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: Option<RemoteTimelineClient>,
deletion_queue_client: DeletionQueueClient,
init_order: Option<&InitializationOrder>,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -418,10 +416,7 @@ impl DeleteTimelineFlow {
timeline_id,
local_metadata,
None, // Ancestor is not needed for deletion.
TimelineResources {
remote_client,
deletion_queue_client,
},
TimelineResources { remote_client },
init_order,
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.

View File

@@ -370,9 +370,8 @@ pub(super) async fn handle_walreceiver_connection(
})?;
if let Some(last_lsn) = status_update {
let timeline_remote_consistent_lsn = timeline
.get_remote_consistent_lsn_visible()
.unwrap_or(Lsn(0));
let timeline_remote_consistent_lsn =
timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
let last_received_lsn = last_lsn;

View File

@@ -1,3 +1,5 @@
use crate::metrics::RemoteOpFileKind;
use super::storage_layer::LayerFileName;
use super::Generation;
use crate::tenant::metadata::TimelineMetadata;
@@ -9,7 +11,6 @@ use std::fmt::Debug;
use chrono::NaiveDateTime;
use std::sync::Arc;
use tracing::info;
use utils::lsn::AtomicLsn;
use std::sync::atomic::AtomicU32;
use utils::lsn::Lsn;
@@ -57,12 +58,7 @@ pub(crate) struct UploadQueueInitialized {
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
/// Safekeeper can rely on it to make decisions for WAL storage.
///
/// visible_remote_consistent_lsn is only updated after our generation has been validated with
/// the control plane (unlesss a timeline's generation is None, in which case
/// we skip validation)
pub(crate) projected_remote_consistent_lsn: Option<Lsn>,
pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,
pub(crate) last_uploaded_consistent_lsn: Lsn,
// Breakdown of different kinds of tasks currently in-progress
pub(crate) num_inprogress_layer_uploads: usize,
@@ -85,14 +81,6 @@ impl UploadQueueInitialized {
pub(super) fn no_pending_work(&self) -> bool {
self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
}
pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn {
self.visible_remote_consistent_lsn.load()
}
pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
self.projected_remote_consistent_lsn
}
}
#[derive(Clone, Copy)]
@@ -126,8 +114,9 @@ impl UploadQueue {
latest_files: HashMap::new(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: metadata.clone(),
projected_remote_consistent_lsn: None,
visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
// We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
// safekeepers from garbage-collecting anything.
last_uploaded_consistent_lsn: Lsn(0),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,
@@ -169,10 +158,7 @@ impl UploadQueue {
latest_files: files,
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: index_part.metadata.clone(),
projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
visible_remote_consistent_lsn: Arc::new(
index_part.metadata.disk_consistent_lsn().into(),
),
last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,
@@ -215,11 +201,12 @@ pub(crate) struct UploadTask {
pub(crate) op: UploadOp,
}
/// A deletion of some layers within the lifetime of a timeline. This is not used
/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
#[derive(Debug)]
pub(crate) struct Delete {
pub(crate) layers: Vec<(LayerFileName, Generation)>,
pub(crate) file_kind: RemoteOpFileKind,
pub(crate) layer_file_name: LayerFileName,
pub(crate) scheduled_from_timeline_delete: bool,
pub(crate) generation: Generation,
}
#[derive(Debug)]
@@ -230,7 +217,7 @@ pub(crate) enum UploadOp {
/// Upload the metadata file
UploadMetadata(IndexPart, Lsn),
/// Delete layer files
/// Delete a layer file
Delete(Delete),
/// Barrier. When the barrier operation is reached,
@@ -252,9 +239,13 @@ impl std::fmt::Display for UploadOp {
UploadOp::UploadMetadata(_, lsn) => {
write!(f, "UploadMetadata(lsn: {})", lsn)
}
UploadOp::Delete(delete) => {
write!(f, "Delete({} layers)", delete.layers.len(),)
}
UploadOp::Delete(delete) => write!(
f,
"Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})",
delete.layer_file_name.file_name(),
delete.scheduled_from_timeline_delete,
delete.generation
),
UploadOp::Barrier(_) => write!(f, "Barrier"),
}
}

View File

@@ -650,12 +650,6 @@ mod tests {
File(File),
}
impl From<VirtualFile> for MaybeVirtualFile {
fn from(vf: VirtualFile) -> Self {
MaybeVirtualFile::VirtualFile(vf)
}
}
impl MaybeVirtualFile {
async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
match self {
@@ -893,54 +887,4 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_atomic_overwrite_basic() {
let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
std::fs::create_dir_all(&testdir).unwrap();
let path = testdir.join("myfile");
let tmp_path = testdir.join("myfile.tmp");
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
.await
.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
let post = file.read_string().await.unwrap();
assert_eq!(post, "foo");
assert!(!tmp_path.exists());
drop(file);
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar")
.await
.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
let post = file.read_string().await.unwrap();
assert_eq!(post, "bar");
assert!(!tmp_path.exists());
drop(file);
}
#[tokio::test]
async fn test_atomic_overwrite_preexisting_tmp() {
let testdir =
crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
std::fs::create_dir_all(&testdir).unwrap();
let path = testdir.join("myfile");
let tmp_path = testdir.join("myfile.tmp");
std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
assert!(tmp_path.exists());
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
.await
.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
let post = file.read_string().await.unwrap();
assert_eq!(post, "foo");
assert!(!tmp_path.exists());
drop(file);
}
}

View File

@@ -151,56 +151,50 @@ impl<'a> WalIngest<'a> {
}
}
} else if self.timeline.pg_version == 15 {
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG
{
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY
{
// The XLOG record was renamed between v14 and v15,
// but the record format is the same.
// So we can reuse XlCreateDatabase here.
debug!("XLOG_DBASE_CREATE_FILE_COPY");
let createdb = XlCreateDatabase::decode(&mut buf);
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
.await?;
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v15::bindings::XLOG_DBASE_DROP
{
let dropdb = XlDropDatabase::decode(&mut buf);
for tablespace_id in dropdb.tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
modification
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
match decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK {
postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG
| postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY => {
// The XLOG record was renamed between v14 and v15,
// but the record format is the same.
// So we can reuse XlCreateDatabase here.
debug!("XLOG_DBASE_CREATE_WAL_LOG");
let createdb = XlCreateDatabase::decode(&mut buf);
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
.await?;
}
postgres_ffi::v15::bindings::XLOG_DBASE_DROP => {
let dropdb = XlDropDatabase::decode(&mut buf);
for tablespace_id in dropdb.tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
modification
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
.await?;
}
}
_ => {}
}
} else if self.timeline.pg_version == 16 {
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG
{
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY
{
// The XLOG record was renamed between v14 and v15,
// but the record format is the same.
// So we can reuse XlCreateDatabase here.
debug!("XLOG_DBASE_CREATE_FILE_COPY");
let createdb = XlCreateDatabase::decode(&mut buf);
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
.await?;
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v16::bindings::XLOG_DBASE_DROP
{
let dropdb = XlDropDatabase::decode(&mut buf);
for tablespace_id in dropdb.tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
modification
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
match decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK {
postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG
| postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY => {
// The XLOG record was renamed between v14 and v15,
// but the record format is the same.
// So we can reuse XlCreateDatabase here.
debug!("XLOG_DBASE_CREATE_WAL_LOG");
let createdb = XlCreateDatabase::decode(&mut buf);
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
.await?;
}
postgres_ffi::v16::bindings::XLOG_DBASE_DROP => {
let dropdb = XlDropDatabase::decode(&mut buf);
for tablespace_id in dropdb.tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
modification
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
.await?;
}
}
_ => {}
}
}
} else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID {
@@ -363,7 +357,7 @@ impl<'a> WalIngest<'a> {
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
modification.commit(ctx).await?;
modification.commit().await?;
Ok(())
}
@@ -1561,7 +1555,7 @@ mod tests {
let mut m = tline.begin_modification(Lsn(0x10));
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
m.commit(ctx).await?;
m.commit().await?;
let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;
Ok(walingest)
@@ -1580,22 +1574,22 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
let mut m = tline.begin_modification(Lsn(0x30));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
let mut m = tline.begin_modification(Lsn(0x40));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
let mut m = tline.begin_modification(Lsn(0x50));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_current_logical_size(&tline, Lsn(0x50));
@@ -1681,7 +1675,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_current_logical_size(&tline, Lsn(0x60));
// Check reported size and contents after truncation
@@ -1723,7 +1717,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
@@ -1736,7 +1730,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
@@ -1761,7 +1755,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
@@ -1800,7 +1794,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
// Check that rel exists and size is correct
assert_eq!(
@@ -1819,7 +1813,7 @@ mod tests {
// Drop rel
let mut m = tline.begin_modification(Lsn(0x30));
walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
m.commit(&ctx).await?;
m.commit().await?;
// Check that rel is not visible anymore
assert_eq!(
@@ -1837,7 +1831,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
// Check that rel exists and size is correct
assert_eq!(
@@ -1876,7 +1870,7 @@ mod tests {
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
.await?;
}
m.commit(&ctx).await?;
m.commit().await?;
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(
@@ -1921,7 +1915,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
// Check reported size and contents after truncation
assert_eq!(
@@ -1970,7 +1964,7 @@ mod tests {
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
.await?;
}
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline
@@ -2017,7 +2011,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
}
assert_current_logical_size(&tline, Lsn(lsn));
@@ -2033,7 +2027,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE
@@ -2046,7 +2040,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE - 1
@@ -2062,7 +2056,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
size as BlockNumber

View File

@@ -2,3 +2,4 @@ comment = '** Deprecated ** Please use pg_embedding instead'
default_version = '0.1.0'
module_pathname = '$libdir/hnsw'
relocatable = true
trusted = true

View File

@@ -222,9 +222,8 @@ lfc_change_limit_hook(int newval, void *extra)
/*
* Stats collector detach shared memory, so we should not try to access shared memory here.
* Parallel workers first assign default value (0), so not perform truncation in parallel workers.
* The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
*/
if (!lfc_ctl || !MyProc || !UsedShmemSegAddr || IsParallelWorker())
if (!lfc_ctl || !UsedShmemSegAddr || IsParallelWorker())
return;
/* Open cache file if not done yet */

50
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -887,34 +887,34 @@ files = [
[[package]]
name = "cryptography"
version = "41.0.4"
version = "41.0.3"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
optional = false
python-versions = ">=3.7"
files = [
{file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:80907d3faa55dc5434a16579952ac6da800935cd98d14dbd62f6f042c7f5e839"},
{file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:35c00f637cd0b9d5b6c6bd11b6c3359194a8eba9c46d4e875a3660e3b400005f"},
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cecfefa17042941f94ab54f769c8ce0fe14beff2694e9ac684176a2535bf9714"},
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e40211b4923ba5a6dc9769eab704bdb3fbb58d56c5b336d30996c24fcf12aadb"},
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:23a25c09dfd0d9f28da2352503b23e086f8e78096b9fd585d1d14eca01613e13"},
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2ed09183922d66c4ec5fdaa59b4d14e105c084dd0febd27452de8f6f74704143"},
{file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5a0f09cefded00e648a127048119f77bc2b2ec61e736660b5789e638f43cc397"},
{file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:9eeb77214afae972a00dee47382d2591abe77bdae166bda672fb1e24702a3860"},
{file = "cryptography-41.0.4-cp37-abi3-win32.whl", hash = "sha256:3b224890962a2d7b57cf5eeb16ccaafba6083f7b811829f00476309bce2fe0fd"},
{file = "cryptography-41.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c880eba5175f4307129784eca96f4e70b88e57aa3f680aeba3bab0e980b0f37d"},
{file = "cryptography-41.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:004b6ccc95943f6a9ad3142cfabcc769d7ee38a3f60fb0dddbfb431f818c3a67"},
{file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:86defa8d248c3fa029da68ce61fe735432b047e32179883bdb1e79ed9bb8195e"},
{file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:37480760ae08065437e6573d14be973112c9e6dcaf5f11d00147ee74f37a3829"},
{file = "cryptography-41.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b5f4dfe950ff0479f1f00eda09c18798d4f49b98f4e2006d644b3301682ebdca"},
{file = "cryptography-41.0.4-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7e53db173370dea832190870e975a1e09c86a879b613948f09eb49324218c14d"},
{file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5b72205a360f3b6176485a333256b9bcd48700fc755fef51c8e7e67c4b63e3ac"},
{file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:93530900d14c37a46ce3d6c9e6fd35dbe5f5601bf6b3a5c325c7bffc030344d9"},
{file = "cryptography-41.0.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:efc8ad4e6fc4f1752ebfb58aefece8b4e3c4cae940b0994d43649bdfce8d0d4f"},
{file = "cryptography-41.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c3391bd8e6de35f6f1140e50aaeb3e2b3d6a9012536ca23ab0d9c35ec18c8a91"},
{file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:0d9409894f495d465fe6fda92cb70e8323e9648af912d5b9141d616df40a87b8"},
{file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8ac4f9ead4bbd0bc8ab2d318f97d85147167a488be0e08814a37eb2f439d5cf6"},
{file = "cryptography-41.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:047c4603aeb4bbd8db2756e38f5b8bd7e94318c047cfe4efeb5d715e08b49311"},
{file = "cryptography-41.0.4.tar.gz", hash = "sha256:7febc3094125fc126a7f6fb1f420d0da639f3f32cb15c8ff0dc3997c4549f51a"},
{file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"},
{file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"},
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"},
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"},
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"},
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"},
{file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"},
{file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"},
{file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"},
{file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"},
{file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"},
{file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"},
{file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"},
{file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"},
{file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"},
{file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"},
{file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"},
{file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"},
{file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"},
{file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"},
{file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"},
{file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"},
{file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"},
]
[package.dependencies]

View File

@@ -28,11 +28,10 @@ use tracing::{error, info, info_span, warn, Instrument};
use utils::measured_stream::MeasuredStream;
/// Number of times we should retry the `/proxy_wake_compute` http request.
/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
pub const NUM_RETRIES_CONNECT: u32 = 16;
/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
pub const NUM_RETRIES_CONNECT: u32 = 10;
const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";
@@ -554,7 +553,8 @@ impl ShouldRetry for compute::ConnectionError {
}
pub fn retry_after(num_retries: u32) -> time::Duration {
BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
// 1.5 seems to be an ok growth factor heuristic
BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
}
/// Finish client connection initialization: confirm auth success, send params, etc.

View File

@@ -303,7 +303,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
#[test]
fn connect_compute_total_wait() {
let mut total_wait = tokio::time::Duration::ZERO;
for num_retries in 1..NUM_RETRIES_CONNECT {
for num_retries in 1..10 {
total_wait += retry_after(num_retries);
}
assert!(total_wait < tokio::time::Duration::from_secs(12));
@@ -494,11 +494,11 @@ async fn connect_to_compute_non_retry_2() {
/// Retry for at most `NUM_RETRIES_CONNECT` times.
#[tokio::test]
async fn connect_to_compute_non_retry_3() {
assert_eq!(NUM_RETRIES_CONNECT, 16);
assert_eq!(NUM_RETRIES_CONNECT, 10);
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![
Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
Retry, Retry, Retry, Retry, /* the 17th time */ Retry,
Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
/* the 11th time */ Retry,
]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds)

View File

@@ -1,5 +1,5 @@
[toolchain]
channel = "1.72.1"
channel = "1.72.0"
profile = "default"
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
# https://rust-lang.github.io/rustup/concepts/profiles.html

View File

@@ -1,62 +0,0 @@
#!/usr/bin/env python3
#
# Script to download the basebackup from a pageserver to a tar file.
#
# This can be useful in disaster recovery.
#
import argparse
import psycopg2
from psycopg2.extensions import connection as PgConnection
def main(args: argparse.Namespace):
pageserver_connstr = args.pageserver_connstr
tenant_id = args.tenant
timeline_id = args.timeline
lsn = args.lsn
output_path = args.output_path
psconn: PgConnection = psycopg2.connect(pageserver_connstr)
psconn.autocommit = True
output = open(output_path, "wb")
with psconn.cursor() as pscur:
pscur.copy_expert(f"basebackup {tenant_id} {timeline_id} {lsn}", output)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tenant-id",
dest="tenant",
required=True,
help="Id of the tenant",
)
parser.add_argument(
"--timeline-id",
dest="timeline",
required=True,
help="Id of the timeline",
)
parser.add_argument(
"--lsn",
dest="lsn",
required=True,
help="LSN to take the basebackup at",
)
parser.add_argument(
"--pageserver-connstr",
dest="pageserver_connstr",
required=True,
help="libpq connection string of the pageserver",
)
parser.add_argument(
"--output",
dest="output_path",
required=True,
help="output path to write the basebackup to",
)
args = parser.parse_args()
main(args)

View File

@@ -1481,16 +1481,6 @@ class NeonAttachmentService:
self.running = False
return self
def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int:
response = requests.post(
f"{self.env.control_plane_api}/attach_hook",
json={"tenant_id": str(tenant_id), "pageserver_id": pageserver_id},
)
response.raise_for_status()
gen = response.json()["gen"]
assert isinstance(gen, int)
return gen
def __enter__(self) -> "NeonAttachmentService":
return self
@@ -1699,7 +1689,12 @@ class NeonPageserver(PgProtocol):
to call into the pageserver HTTP client.
"""
if self.env.attachment_service is not None:
generation = self.env.attachment_service.attach_hook(tenant_id, self.id)
response = requests.post(
f"{self.env.control_plane_api}/attach_hook",
json={"tenant_id": str(tenant_id), "pageserver_id": self.id},
)
response.raise_for_status()
generation = response.json()["gen"]
else:
generation = None

View File

@@ -620,8 +620,3 @@ class PageserverHttpClient(requests.Session):
},
)
self.verbose_error(res)
def deletion_queue_flush(self, execute: bool = False):
self.put(
f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
).raise_for_status()

View File

@@ -267,7 +267,7 @@ def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional
def list_prefix(
neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/"
neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None
) -> ListObjectsV2OutputTypeDef:
"""
Note that this function takes into account prefix_in_bucket.
@@ -287,7 +287,7 @@ def list_prefix(
# Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
response = remote.client.list_objects_v2(
Delimiter=delimiter,
Delimiter="/",
Bucket=remote.bucket_name,
Prefix=prefix,
)

View File

@@ -1,44 +0,0 @@
import threading
import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, PgBin
#
# Test branching, when a transaction is in prepared state
#
@pytest.mark.timeout(600)
def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
env = neon_simple_env
env.neon_cli.create_branch("test_lfc_resize", "empty")
endpoint = env.endpoints.create_start(
"test_lfc_resize",
config_lines=[
"neon.file_cache_path='file.cache'",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
],
)
n_resize = 10
scale = 10
log.info("postgres is running on 'test_lfc_resize' branch")
def run_pgbench(connstr: str):
log.info(f"Start a pgbench workload on pg {connstr}")
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
pg_bin.run_capture(["pgbench", "-c4", f"-T{n_resize}", "-Mprepared", connstr])
thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
thread.start()
conn = endpoint.connect()
cur = conn.cursor()
for i in range(n_resize):
cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'")
cur.execute("select pg_reload_conf()")
time.sleep(1)
thread.join()

View File

@@ -1,352 +0,0 @@
"""
Tests in this module exercise the pageserver's behavior around generation numbers,
as defined in docs/rfcs/025-generation-numbers.md. Briefly, the behaviors we require
of the pageserver are:
- Do not start a tenant without a generation number if control_plane_api is set
- Remote objects must be suffixed with generation
- Deletions may only be executed after validating generation
- Updates to remote_consistent_lsn may only be made visible after validating generation
"""
import re
import time
from typing import Optional
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import list_prefix
from fixtures.remote_storage import (
RemoteStorageKind,
)
from fixtures.types import TenantId, TimelineId
from fixtures.utils import print_gc_result, wait_until
# A tenant configuration that is convenient for generating uploads and deletions
# without a large amount of postgres traffic.
TENANT_CONF = {
# small checkpointing and compaction targets to ensure we generate many upload operations
"checkpoint_distance": f"{128 * 1024}",
"compaction_threshold": "1",
"compaction_target_size": f"{128 * 1024}",
# no PITR horizon, we specify the horizon when we request on-demand GC
"pitr_interval": "0s",
# disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# create image layers eagerly, so that GC can remove some layers
"image_creation_threshold": "1",
}
def generate_uploads_and_deletions(
env: NeonEnv,
*,
init: bool = True,
tenant_id: Optional[TenantId] = None,
timeline_id: Optional[TimelineId] = None,
data: Optional[str] = None,
):
"""
Using the environment's default tenant + timeline, generate a load pattern
that results in some uploads and some deletions to remote storage.
"""
if tenant_id is None:
tenant_id = env.initial_tenant
assert tenant_id is not None
if timeline_id is None:
timeline_id = env.initial_timeline
assert timeline_id is not None
ps_http = env.pageserver.http_client()
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
if init:
endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
def churn(data):
endpoint.safe_psql_many(
[
f"""
INSERT INTO foo (id, val)
SELECT g, '{data}'
FROM generate_series(1, 20000) g
ON CONFLICT (id) DO UPDATE
SET val = EXCLUDED.val
""",
# to ensure that GC can actually remove some layers
"VACUUM foo",
]
)
assert tenant_id is not None
assert timeline_id is not None
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
ps_http.timeline_checkpoint(tenant_id, timeline_id)
# Compaction should generate some GC-elegible layers
for i in range(0, 2):
churn(f"{i if data is None else data}")
gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
print_gc_result(gc_result)
assert gc_result["layers_removed"] > 0
def get_metric_or_0(ps_http, metric: str) -> int:
v = ps_http.get_metric_value(metric)
return 0 if v is None else int(v)
def get_deletion_queue_executed(ps_http) -> int:
return get_metric_or_0(ps_http, "pageserver_deletion_queue_executed_total")
def get_deletion_queue_submitted(ps_http) -> int:
return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total")
def get_deletion_queue_dropped(ps_http) -> int:
return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total")
def get_deletion_queue_unexpected_errors(ps_http) -> int:
return get_metric_or_0(ps_http, "pageserver_deletion_queue_unexpected_errors_total")
def get_deletion_queue_dropped_lsn_updates(ps_http) -> int:
return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_lsn_updates_total")
def get_deletion_queue_depth(ps_http) -> int:
"""
Queue depth if at least one deletion has been submitted, else None
"""
submitted = get_deletion_queue_submitted(ps_http)
executed = get_deletion_queue_executed(ps_http)
dropped = get_deletion_queue_dropped(ps_http)
depth = submitted - executed - dropped
log.info(f"get_deletion_queue_depth: {depth} ({submitted} - {executed} - {dropped})")
assert depth >= 0
return int(depth)
def assert_deletion_queue(ps_http, size_fn) -> None:
v = get_deletion_queue_depth(ps_http)
assert v is not None
assert size_fn(v) is True
def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
"""
Validate behavior when a pageserver is run without generation support enabled,
then started again after activating it:
- Before upgrade, no objects should have generation suffixes
- After upgrade, the bucket should contain a mixture.
- In both cases, postgres I/O should work.
"""
neon_env_builder.enable_generations = True
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_configs()
env.broker.try_start()
for sk in env.safekeepers:
sk.start()
assert env.attachment_service is not None
env.attachment_service.start()
env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
env.neon_cli.create_tenant(
tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
)
generate_uploads_and_deletions(env)
def parse_generation_suffix(key):
m = re.match(".+-([0-9a-zA-Z]{8})$", key)
if m is None:
return None
else:
log.info(f"match: {m}")
log.info(f"group: {m.group(1)}")
return int(m.group(1), 16)
pre_upgrade_keys = list(
[o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
)
for key in pre_upgrade_keys:
assert parse_generation_suffix(key) is None
env.pageserver.stop()
# Starting without the override that disabled control_plane_api
env.pageserver.start()
generate_uploads_and_deletions(env, init=False)
legacy_objects: list[str] = []
suffixed_objects = []
post_upgrade_keys = list(
[o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
)
for key in post_upgrade_keys:
log.info(f"post-upgrade key: {key}")
if parse_generation_suffix(key) is not None:
suffixed_objects.append(key)
else:
legacy_objects.append(key)
# Bucket now contains a mixture of suffixed and non-suffixed objects
assert len(suffixed_objects) > 0
assert len(legacy_objects) > 0
assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0
def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_generations = True
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
some_other_pageserver = 1234
ps_http = env.pageserver.http_client()
generate_uploads_and_deletions(env)
# Flush: pending deletions should all complete
assert_deletion_queue(ps_http, lambda n: n > 0)
ps_http.deletion_queue_flush(execute=True)
assert_deletion_queue(ps_http, lambda n: n == 0)
assert get_deletion_queue_dropped(ps_http) == 0
# Our visible remote_consistent_lsn should match projected
timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"]
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
env.pageserver.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)
# Now advance the generation in the control plane: subsequent validations
# from the running pageserver will fail. No more deletions should happen.
env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
generate_uploads_and_deletions(env, init=False)
assert_deletion_queue(ps_http, lambda n: n > 0)
queue_depth_before = get_deletion_queue_depth(ps_http)
executed_before = get_deletion_queue_executed(ps_http)
ps_http.deletion_queue_flush(execute=True)
# Queue drains to zero because we dropped deletions
assert_deletion_queue(ps_http, lambda n: n == 0)
# The executed counter has not incremented
assert get_deletion_queue_executed(ps_http) == executed_before
# The dropped counter has incremented to consume all of the deletions that were previously enqueued
assert get_deletion_queue_dropped(ps_http) == queue_depth_before
# Flush to S3 and see that remote_consistent_lsn does not advance: it cannot
# because generation validation fails.
timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
assert timeline["remote_consistent_lsn"] != timeline["remote_consistent_lsn_visible"]
assert get_deletion_queue_dropped_lsn_updates(ps_http) > 0
# TODO: list bucket and confirm all objects have a generation suffix.
assert get_deletion_queue_unexpected_errors(ps_http) == 0
@pytest.mark.parametrize("keep_attachment", [True, False])
def test_deletion_queue_recovery(
neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool
):
"""
:param keep_attachment: If true, we re-attach after restart. Else, we act as if some other
node took the attachment while we were restarting.
"""
neon_env_builder.enable_generations = True
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
ps_http = env.pageserver.http_client()
# Prevent deletion lists from being executed, to build up some backlog of deletions
ps_http.configure_failpoints(
[
("deletion-queue-before-execute", "return"),
]
)
generate_uploads_and_deletions(env)
# There should be entries in the deletion queue
assert_deletion_queue(ps_http, lambda n: n > 0)
ps_http.deletion_queue_flush()
before_restart_depth = get_deletion_queue_depth(ps_http)
assert get_deletion_queue_unexpected_errors(ps_http) == 0
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
env.pageserver.stop(immediate=True)
if not keep_attachment:
some_other_pageserver = 101010
assert env.attachment_service is not None
env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
env.pageserver.start()
def assert_deletions_submitted(n: int):
assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
# After restart, issue a flush to kick the deletion frontend to do recovery.
# It should recover all the operations we submitted before the restart.
ps_http.deletion_queue_flush(execute=False)
wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth))
# The queue should drain through completely if we flush it
ps_http.deletion_queue_flush(execute=True)
wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
if keep_attachment:
# If we kept the attachment, then our pre-restart deletions should have executed
# successfully
assert get_deletion_queue_executed(ps_http) == before_restart_depth
else:
# If we lost the attachment, we should have dropped our pre-restart deletions.
assert get_deletion_queue_dropped(ps_http) == before_restart_depth
env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
assert get_deletion_queue_unexpected_errors(ps_http) == 0
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
# Restart again
env.pageserver.stop(immediate=True)
env.pageserver.start()
# No deletion lists should be recovered: this demonstrates that deletion lists
# were cleaned up after being executed or dropped in the previous process lifetime.
time.sleep(1)
assert_deletion_queue(ps_http, lambda n: n == 0)
assert get_deletion_queue_unexpected_errors(ps_http) == 0
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0

View File

@@ -43,12 +43,6 @@ def test_tenant_delete_smoke(
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(
[
# The deletion queue will complain when it encounters simulated S3 errors
".*deletion executor: DeleteObjects request failed.*",
]
)
# lucky race with stopping from flushing a layer we fail to schedule any uploads
env.pageserver.allowed_errors.append(
@@ -201,14 +195,6 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
]
)
if simulate_failures:
env.pageserver.allowed_errors.extend(
[
# The deletion queue will complain when it encounters simulated S3 errors
".*deletion executor: DeleteObjects request failed.*",
]
)
ps_http = env.pageserver.http_client()
timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
@@ -397,7 +383,6 @@ def test_tenant_delete_is_resumed_on_attach(
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():
ps_http.deletion_queue_flush(execute=True)
assert_prefix_empty(
neon_env_builder,
prefix="/".join(

View File

@@ -807,8 +807,6 @@ def test_delete_orphaned_objects(
reason = timeline_info["state"]["Broken"]["reason"]
assert reason.endswith(f"failpoint: {failpoint}"), reason
ps_http.deletion_queue_flush(execute=True)
for orphan in orphans:
assert not orphan.exists()
assert env.pageserver.log_contains(

View File

@@ -1,5 +1,5 @@
{
"postgres-v16": "389ce36b4b3da7aa654a25e1b3f10b641319a87f",
"postgres-v15": "74cfe3e681836747a31fdbd47bdd14b3d81b0772",
"postgres-v16": "7a50f139c6269454ab9260c7a9752874b9089943",
"postgres-v15": "026d6b093d49e25cec44dd04598152329ceac027",
"postgres-v14": "5d5cfee12783f0989a9c9fe13bb40b5585812568"
}

View File

@@ -64,7 +64,6 @@ toml_edit = { version = "0.19", features = ["serde"] }
tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }
tracing = { version = "0.1", features = ["log"] }
tracing-core = { version = "0.1" }
tungstenite = { version = "0.20" }
url = { version = "2", features = ["serde"] }
uuid = { version = "1", features = ["serde", "v4"] }