Compare commits

...

9 Commits

Author SHA1 Message Date
Konstantin Knizhnik
93b0f44a06 Set lfc_size_limit as Min(lfc_size_limit, lfc_max_mem) 2023-08-24 17:54:22 +03:00
Konstantin Knizhnik
fc206e60d2 Add lfc_max_mem GUC 2023-08-24 17:54:22 +03:00
Konstantin Knizhnik
7b8ccee8db Restore neon.lfs_file_parth GUC 2023-08-24 17:54:22 +03:00
Konstantin Knizhnik
d8d38f2c42 Monitor avaiable memory size in local file cache and shrink cache if watermark is reached 2023-08-24 17:54:22 +03:00
Chengpeng Yan
fa74d5649e rename EphmeralFile::size to EphemeralFile::len (#5076)
## Problem
close https://github.com/neondatabase/neon/issues/5034

## Summary of changes
Based on the
[comment](https://github.com/neondatabase/neon/pull/4994#discussion_r1297277922).
Just rename the `EphmeralFile::size` to `EphemeralFile::len`.
2023-08-24 16:41:57 +02:00
Joonas Koivunen
f70871dfd0 internal-devx: pageserver future layers (#5092)
I've personally forgotten why/how can we have future layers during
reconciliation. Adds `#[cfg(feature = "testing")]` logging when we
upload such index_part.json, with a cross reference to where the cleanup
happens.

Latest private slack thread:
https://neondb.slack.com/archives/C033RQ5SPDH/p1692879032573809?thread_ts=1692792276.173979&cid=C033RQ5SPDH

Builds upon #5074. Should had been considered on #4837.
2023-08-24 17:22:36 +03:00
Alek Westover
99a1be6c4e remove upload step from neon, it is in private repo now (#5085) 2023-08-24 17:14:40 +03:00
Joonas Koivunen
76aa01c90f refactor: single phase Timeline::load_layer_map (#5074)
Current implementation first calls `load_layer_map`, which loads all
local layers, cleans up files, leave cleaning up stuff to "second
function". Then the "second function" is finally called, it does not do
the cleanup and some of the first functions setup can torn down. "Second
function" is actually both `reconcile_with_remote` and
`create_remote_layers`.

This change makes it a bit more verbose but in one phase with the
following sub-steps:
1. scan the timeline directory
2. delete extra files
    - now including on-demand download files
    - fixes #3660
3. recoincile the two sources of layers (directory, index_part)
4. rename_to_backup future layers, short layers
5. create the remaining as layers

Needed by #4938.

It was also noticed that this is blocking code in an `async fn` so just
do it in a `spawn_blocking`, which should be healthy for our startup
times. Other effects includes hopefully halving of `stat` calls; extra
calls which were not done previously are now done for the future layers.

Co-authored-by: Christian Schwarz <christian@neon.tech>
Co-authored-by: John Spray <john@neon.tech>
2023-08-24 16:07:40 +03:00
John Spray
3e2f0ffb11 libs: make backoff::retry() take a cancellation token (#5065)
## Problem

Currently, anything that uses backoff::retry will delay the join of its
task by however long its backoff sleep is, multiplied by its max
retries.

Whenever we call a function that sleeps, we should be passing in a
CancellationToken.

## Summary of changes

- Add a `Cancel` type to backoff::retry that wraps a CancellationToken
and an error `Fn` to generate an error if the cancellation token fires.
- In call sites that already run in a `task_mgr` task, use
`shutdown_token()` to provide the token. In other locations, use a dead
`CancellationToken` to satisfy the interface, and leave a TODO to fix it
up when we broaden the use of explicit cancellation tokens.
2023-08-24 14:54:46 +03:00
21 changed files with 634 additions and 723 deletions

View File

@@ -737,34 +737,6 @@ jobs:
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--cleanup
# Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
# During the transition period we need to have extensions in both places (in S3 and in compute-node image),
# so we won't build extension twice, but extract them from compute-node.
#
# For now we use extensions image only for new custom extensitons
- name: Kaniko build extensions only
run: |
# Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't.
# Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
# it still fails with error:
# error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
#
# Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--context . \
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
--build-arg PG_VERSION=${{ matrix.version }} \
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
--dockerfile Dockerfile.compute-node \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--cleanup \
--target postgres-extensions
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
@@ -973,57 +945,10 @@ jobs:
}
}"
upload-postgres-extensions-to-s3:
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
needs: [ tag, promote-images ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]
env:
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ github.ref_name == 'release' && 'latest' || needs.tag.outputs.build-tag }}
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
steps:
- name: Pull postgres-extensions image
run: |
docker pull ${EXTENSIONS_IMAGE}
- name: Create postgres-extensions container
id: create-container
run: |
EID=$(docker create ${EXTENSIONS_IMAGE} true)
echo "EID=${EID}" >> $GITHUB_OUTPUT
- name: Extract postgres-extensions from container
run: |
rm -rf ./extensions-to-upload # Just in case
mkdir -p extensions-to-upload
docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
- name: Upload postgres-extensions to S3
run: |
for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
done
- name: Cleanup
if: ${{ always() && steps.create-container.outputs.EID }}
run: |
docker rm ${{ steps.create-container.outputs.EID }} || true
deploy:
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
needs: [ promote-images, tag, regress-tests ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
steps:
- name: Fix git ownership

1
Cargo.lock generated
View File

@@ -4915,6 +4915,7 @@ dependencies = [
"thiserror",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-error",
"tracing-subscriber",

View File

@@ -764,29 +764,6 @@ RUN rm -r /usr/local/pgsql/include
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Extenstion only
#
#########################################################################################
FROM python:3.9-slim-bullseye AS generate-ext-index
ARG PG_VERSION
ARG BUILD_TAG
RUN apt update && apt install -y zstd
# copy the control files here
COPY --from=kq-imcx-pg-build /extensions/ /extensions/
COPY --from=pg-anon-pg-build /extensions/ /extensions/
COPY --from=postgis-build /extensions/ /extensions/
COPY scripts/combine_control_files.py ./combine_control_files.py
RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
FROM scratch AS postgres-extensions
# After the transition this layer will include all extensitons.
# As for now, it's only a couple for testing purposses
COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
COPY --from=generate-ext-index /ext_index.json /ext_index.json
#########################################################################################
#
# Final layer

View File

@@ -26,6 +26,7 @@ serde_json.workspace = true
signal-hook.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber = { workspace = true, features = ["json", "registry"] }

View File

@@ -1,18 +1,31 @@
use std::fmt::{Debug, Display};
use futures::Future;
use tokio_util::sync::CancellationToken;
pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
pub async fn exponential_backoff(
n: u32,
base_increment: f64,
max_seconds: f64,
cancel: &CancellationToken,
) {
let backoff_duration_seconds =
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
if backoff_duration_seconds > 0.0 {
tracing::info!(
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
);
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
drop(
tokio::time::timeout(
std::time::Duration::from_secs_f64(backoff_duration_seconds),
cancel.cancelled(),
)
.await,
)
}
}
@@ -24,28 +37,57 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
}
}
/// Configure cancellation for a retried operation: when to cancel (the token), and
/// what kind of error to return on cancellation
pub struct Cancel<E, CF>
where
E: Display + Debug + 'static,
CF: Fn() -> E,
{
token: CancellationToken,
on_cancel: CF,
}
impl<E, CF> Cancel<E, CF>
where
E: Display + Debug + 'static,
CF: Fn() -> E,
{
pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
Self { token, on_cancel }
}
}
/// retries passed operation until one of the following conditions are met:
/// Encountered error is considered as permanent (non-retryable)
/// Retries have been exhausted.
/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
/// When attempts cross `warn_threshold` function starts to emit log warnings.
/// `description` argument is added to log messages. Its value should identify the `op` is doing
pub async fn retry<T, O, F, E>(
/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken
/// to drop out promptly on shutdown.
pub async fn retry<T, O, F, E, CF>(
mut op: O,
is_permanent: impl Fn(&E) -> bool,
warn_threshold: u32,
max_retries: u32,
description: &str,
cancel: Cancel<E, CF>,
) -> Result<T, E>
where
// Not std::error::Error because anyhow::Error doesnt implement it.
// For context see https://github.com/dtolnay/anyhow/issues/63
E: Display + Debug,
E: Display + Debug + 'static,
O: FnMut() -> F,
F: Future<Output = Result<T, E>>,
CF: Fn() -> E,
{
let mut attempts = 0;
loop {
if cancel.token.is_cancelled() {
return Err((cancel.on_cancel)());
}
let result = op().await;
match result {
Ok(_) => {
@@ -80,6 +122,7 @@ where
attempts,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
&cancel.token,
)
.await;
attempts += 1;
@@ -132,6 +175,7 @@ mod tests {
1,
1,
"work",
Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
)
.await;
@@ -157,6 +201,7 @@ mod tests {
2,
2,
"work",
Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
)
.await
.unwrap();
@@ -179,6 +224,7 @@ mod tests {
2,
2,
"work",
Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
)
.await
.unwrap_err();

View File

@@ -422,13 +422,53 @@ impl Tenant {
init_order,
CreateTimelineCause::Load,
)?;
let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn();
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
anyhow::ensure!(
new_disk_consistent_lsn.is_valid(),
disk_consistent_lsn.is_valid(),
"Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
);
assert_eq!(
disk_consistent_lsn,
up_to_date_metadata.disk_consistent_lsn(),
"these are used interchangeably"
);
// Save the metadata file to local disk.
if !picked_local {
save_metadata(
self.conf,
&tenant_id,
&timeline_id,
up_to_date_metadata,
first_save,
)
.context("save_metadata")?;
}
let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
if let Some(index_part) = index_part {
timeline
.remote_client
.as_ref()
.unwrap()
.init_upload_queue(index_part)?;
} else if self.remote_storage.is_some() {
// No data on the remote storage, but we have local metadata file. We can end up
// here with timeline_create being interrupted before finishing index part upload.
// By doing what we do here, the index part upload is retried.
// If control plane retries timeline creation in the meantime, the mgmt API handler
// for timeline creation will coalesce on the upload we queue here.
let rtc = timeline.remote_client.as_ref().unwrap();
rtc.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
rtc.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
}
timeline
.load_layer_map(new_disk_consistent_lsn)
.load_layer_map(
disk_consistent_lsn,
remote_startup_data.map(|x| x.index_part),
)
.await
.with_context(|| {
format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
@@ -452,19 +492,6 @@ impl Tenant {
}
};
if self.remote_storage.is_some() {
// Reconcile local state with remote storage, downloading anything that's
// missing locally, and scheduling uploads for anything that's missing
// in remote storage.
timeline
.reconcile_with_remote(
up_to_date_metadata,
remote_startup_data.as_ref().map(|r| &r.index_part),
)
.await
.context("failed to reconcile with remote")?
}
// Sanity check: a timeline should have some content.
anyhow::ensure!(
ancestor.is_some()
@@ -479,18 +506,6 @@ impl Tenant {
"Timeline has no ancestor and no layer files"
);
// Save the metadata file to local disk.
if !picked_local {
save_metadata(
self.conf,
&tenant_id,
&timeline_id,
up_to_date_metadata,
first_save,
)
.context("save_metadata")?;
}
Ok(())
}

View File

@@ -7,6 +7,7 @@ use anyhow::Context;
use pageserver_api::models::TenantState;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use tokio::sync::OwnedMutexGuard;
use tokio_util::sync::CancellationToken;
use tracing::{error, info, instrument, warn, Instrument, Span};
use utils::{
@@ -82,6 +83,8 @@ async fn create_remote_delete_mark(
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"mark_upload",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await
.context("mark_upload")?;
@@ -171,6 +174,8 @@ async fn remove_tenant_remote_delete_mark(
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_tenant_remote_delete_mark",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await
.context("remove_tenant_remote_delete_mark")?;
@@ -252,6 +257,8 @@ pub(crate) async fn remote_delete_mark_exists(
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
"fetch_tenant_deletion_mark",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await;

View File

@@ -22,7 +22,7 @@ pub struct EphemeralFile {
_tenant_id: TenantId,
_timeline_id: TimelineId,
file: VirtualFile,
size: u64,
len: u64,
/// An ephemeral file is append-only.
/// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
/// The other pages, which can no longer be modified, are accessed through the page cache.
@@ -53,13 +53,13 @@ impl EphemeralFile {
_tenant_id: tenant_id,
_timeline_id: timeline_id,
file,
size: 0,
len: 0,
mutable_tail: [0u8; PAGE_SZ],
})
}
pub(crate) fn size(&self) -> u64 {
self.size
pub(crate) fn len(&self) -> u64 {
self.len
}
}
@@ -84,8 +84,8 @@ impl BlobWriter for EphemeralFile {
impl<'a> Writer<'a> {
fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
Ok(Writer {
blknum: (ephemeral_file.size / PAGE_SZ as u64) as u32,
off: (ephemeral_file.size % PAGE_SZ as u64) as usize,
blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
ephemeral_file,
})
}
@@ -154,7 +154,7 @@ impl BlobWriter for EphemeralFile {
}
}
let pos = self.size;
let pos = self.len;
let mut writer = Writer::new(self)?;
// Write the length field
@@ -172,11 +172,11 @@ impl BlobWriter for EphemeralFile {
writer.push_bytes(srcbuf)?;
if srcbuf.len() < 0x80 {
self.size += 1;
self.len += 1;
} else {
self.size += 4;
self.len += 4;
}
self.size += srcbuf.len() as u64;
self.len += srcbuf.len() as u64;
Ok(pos)
}
@@ -208,7 +208,7 @@ impl Drop for EphemeralFile {
impl BlockReader for EphemeralFile {
fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
let flushed_blknums = 0..self.size / PAGE_SZ as u64;
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
if flushed_blknums.contains(&(blknum as u64)) {
let cache = page_cache::get();
loop {
@@ -242,7 +242,7 @@ impl BlockReader for EphemeralFile {
};
}
} else {
debug_assert_eq!(blknum as u64, self.size / PAGE_SZ as u64);
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
}
}

View File

@@ -135,7 +135,7 @@
//! - Initiate upload queue with that [`IndexPart`].
//! - Reschedule all lost operations by comparing the local filesystem state
//! and remote state as per [`IndexPart`]. This is done in
//! [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
//! [`Tenant::timeline_init_and_sync`].
//!
//! Note that if we crash during file deletion between the index update
//! that removes the file from the list of files, and deleting the remote file,
@@ -172,7 +172,6 @@
//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
//!
//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
//! We keep track of the fact that a client is in `Attaching` state in a marker
//! file on the local disk. This is critical because, when we restart the pageserver,
//! we do not want to do the `List timelines` step for each tenant that has already
@@ -192,14 +191,14 @@
//! not created and the uploads are skipped.
//! Theoretically, it should be ok to remove and re-add remote storage configuration to
//! the pageserver config at any time, since it doesn't make a difference to
//! `reconcile_with_remote`.
//! [`Timeline::load_layer_map`].
//! Of course, the remote timeline dir must not change while we have de-configured
//! remote storage, i.e., the pageserver must remain the owner of the given prefix
//! in remote storage.
//! But note that we don't test any of this right now.
//!
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
mod delete;
mod download;
@@ -211,6 +210,7 @@ use chrono::{NaiveDateTime, Utc};
// re-export these
pub use download::{is_temp_download_file, list_remote_timelines};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use utils::backoff::{
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
@@ -231,6 +231,7 @@ use crate::metrics::{
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
};
use crate::task_mgr::shutdown_token;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::upload_queue::Delete;
@@ -353,6 +354,10 @@ impl RemoteTimelineClient {
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part)?;
self.update_remote_physical_size_gauge(Some(index_part));
info!(
"initialized upload queue from remote index with {} layer files",
index_part.layer_metadata.len()
);
Ok(())
}
@@ -365,6 +370,7 @@ impl RemoteTimelineClient {
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_empty_remote(local_metadata)?;
self.update_remote_physical_size_gauge(None);
info!("initialized upload queue as empty");
Ok(())
}
@@ -754,7 +760,7 @@ impl RemoteTimelineClient {
pausable_failpoint!("persist_deleted_index_part");
backoff::retry(
|| async {
|| {
upload::upload_index_part(
self.conf,
&self.storage_impl,
@@ -762,7 +768,6 @@ impl RemoteTimelineClient {
&self.timeline_id,
&index_part_with_deleted_at,
)
.await
},
|_e| false,
1,
@@ -771,6 +776,8 @@ impl RemoteTimelineClient {
// when executed as part of tenant deletion this happens in the background
2,
"persist_index_part_with_deleted_flag",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await?;
@@ -857,6 +864,7 @@ impl RemoteTimelineClient {
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"list_prefixes",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
)
.await
.context("list prefixes")?;
@@ -880,6 +888,7 @@ impl RemoteTimelineClient {
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_objects",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
)
.await
.context("delete_objects")?;
@@ -901,6 +910,7 @@ impl RemoteTimelineClient {
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_index",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
)
.await
.context("delete_index")?;
@@ -1066,6 +1076,15 @@ impl RemoteTimelineClient {
.await
}
UploadOp::UploadMetadata(ref index_part, _lsn) => {
let mention_having_future_layers = if cfg!(feature = "testing") {
index_part
.layer_metadata
.keys()
.any(|x| x.is_in_future(*_lsn))
} else {
false
};
let res = upload::upload_index_part(
self.conf,
&self.storage_impl,
@@ -1083,6 +1102,10 @@ impl RemoteTimelineClient {
.await;
if res.is_ok() {
self.update_remote_physical_size_gauge(Some(index_part));
if mention_having_future_layers {
// find rationale near crate::tenant::timeline::init::cleanup_future_layer
tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
}
}
res
}
@@ -1134,14 +1157,13 @@ impl RemoteTimelineClient {
}
// sleep until it's time to retry, or we're cancelled
tokio::select! {
_ = task_mgr::shutdown_watcher() => { },
_ = exponential_backoff(
retries,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
) => { },
};
exponential_backoff(
retries,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
&shutdown_token(),
)
.await;
}
}
}

View File

@@ -11,6 +11,7 @@ use std::time::Duration;
use anyhow::{anyhow, Context};
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tokio_util::sync::CancellationToken;
use utils::{backoff, crashsafe};
use crate::config::PageServerConf;
@@ -280,6 +281,10 @@ where
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
description,
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || -> DownloadError {
unreachable!()
}),
)
.await
}

View File

@@ -41,8 +41,6 @@ pub use inmemory_layer::InMemoryLayer;
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub use remote_layer::RemoteLayer;
use super::timeline::layer_manager::LayerManager;
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
where
T: PartialOrd<T>,
@@ -175,16 +173,9 @@ impl LayerAccessStats {
///
/// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
/// [`record_residence_event`]: Self::record_residence_event
pub(crate) fn for_loading_layer(
layer_map_lock_held_witness: &LayerManager,
status: LayerResidenceStatus,
) -> Self {
pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
new.record_residence_event(
layer_map_lock_held_witness,
status,
LayerResidenceEventReason::LayerLoad,
);
new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
new
}
@@ -197,7 +188,6 @@ impl LayerAccessStats {
/// [`record_residence_event`]: Self::record_residence_event
pub(crate) fn clone_for_residence_change(
&self,
layer_map_lock_held_witness: &LayerManager,
new_status: LayerResidenceStatus,
) -> LayerAccessStats {
let clone = {
@@ -205,11 +195,7 @@ impl LayerAccessStats {
inner.clone()
};
let new = LayerAccessStats(Mutex::new(clone));
new.record_residence_event(
layer_map_lock_held_witness,
new_status,
LayerResidenceEventReason::ResidenceChange,
);
new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
new
}
@@ -229,7 +215,6 @@ impl LayerAccessStats {
///
pub(crate) fn record_residence_event(
&self,
_layer_map_lock_held_witness: &LayerManager,
status: LayerResidenceStatus,
reason: LayerResidenceEventReason,
) {

View File

@@ -212,9 +212,20 @@ pub enum LayerFileName {
}
impl LayerFileName {
pub fn file_name(&self) -> String {
pub(crate) fn file_name(&self) -> String {
self.to_string()
}
/// Determines if this layer file is considered to be in future meaning we will discard these
/// layers during timeline initialization from the given disk_consistent_lsn.
pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool {
use LayerFileName::*;
match self {
Image(file_name) if file_name.lsn > disk_consistent_lsn => true,
Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true,
_ => false,
}
}
}
impl fmt::Display for LayerFileName {
@@ -263,8 +274,8 @@ impl serde::Serialize for LayerFileName {
S: serde::Serializer,
{
match self {
Self::Image(fname) => serializer.serialize_str(&fname.to_string()),
Self::Delta(fname) => serializer.serialize_str(&fname.to_string()),
Self::Image(fname) => serializer.collect_str(fname),
Self::Delta(fname) => serializer.collect_str(fname),
}
}
}

View File

@@ -238,7 +238,7 @@ impl InMemoryLayer {
///
pub async fn size(&self) -> Result<u64> {
let inner = self.inner.read().await;
Ok(inner.file.size())
Ok(inner.file.len())
}
///

View File

@@ -185,7 +185,7 @@ impl RemoteLayer {
/// Create a Layer struct representing this layer, after it has been downloaded.
pub(crate) fn create_downloaded_layer(
&self,
layer_map_lock_held_witness: &LayerManager,
_layer_map_lock_held_witness: &LayerManager,
conf: &'static PageServerConf,
file_size: u64,
) -> Arc<dyn PersistentLayer> {
@@ -197,10 +197,8 @@ impl RemoteLayer {
self.desc.tenant_id,
&fname,
file_size,
self.access_stats.clone_for_residence_change(
layer_map_lock_held_witness,
LayerResidenceStatus::Resident,
),
self.access_stats
.clone_for_residence_change(LayerResidenceStatus::Resident),
))
} else {
let fname = self.desc.image_file_name();
@@ -210,10 +208,8 @@ impl RemoteLayer {
self.desc.tenant_id,
&fname,
file_size,
self.access_stats.clone_for_residence_change(
layer_map_lock_held_witness,
LayerResidenceStatus::Resident,
),
self.access_stats
.clone_for_residence_change(LayerResidenceStatus::Resident),
))
}
}

View File

@@ -1,5 +1,6 @@
pub mod delete;
mod eviction_task;
mod init;
pub mod layer_manager;
mod logical_size;
pub mod span;
@@ -27,7 +28,6 @@ use utils::id::TenantTimelineId;
use std::cmp::{max, min, Ordering};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::fs;
use std::ops::{Deref, Range};
use std::path::{Path, PathBuf};
use std::pin::pin;
@@ -38,15 +38,13 @@ use std::time::{Duration, Instant, SystemTime};
use crate::context::{
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
};
use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
use crate::tenant::storage_layer::{
DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
LayerAccessStats, LayerFileName, RemoteLayer,
DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer,
};
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
ephemeral_file::is_ephemeral_file,
layer_map::{LayerMap, SearchResult},
metadata::{save_metadata, TimelineMetadata},
par_fsync,
@@ -78,11 +76,10 @@ use utils::{
use crate::page_cache;
use crate::repository::GcResult;
use crate::repository::{Key, Value};
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::walredo::WalRedoManager;
use crate::METADATA_FILE_NAME;
use crate::ZERO_PAGE;
use crate::{is_temporary, task_mgr};
use self::delete::DeleteTimelineFlow;
pub(super) use self::eviction_task::EvictionTaskTenantState;
@@ -1211,7 +1208,7 @@ impl Timeline {
&layer_metadata,
local_layer
.access_stats()
.clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
.clone_for_residence_change(LayerResidenceStatus::Evicted),
),
LayerFileName::Delta(delta_name) => RemoteLayer::new_delta(
self.tenant_id,
@@ -1220,7 +1217,7 @@ impl Timeline {
&layer_metadata,
local_layer
.access_stats()
.clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
.clone_for_residence_change(LayerResidenceStatus::Evicted),
),
});
@@ -1518,7 +1515,7 @@ impl Timeline {
let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
let self_clone = Arc::clone(self);
info!("spawning flush loop");
debug!("spawning flush loop");
*flush_loop_state = FlushLoopState::Running {
#[cfg(test)]
expect_initdb_optimization: false,
@@ -1589,9 +1586,7 @@ impl Timeline {
));
}
///
/// Initialize with an empty layer map. Used when creating a new timeline.
///
pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
let mut layers = self.layers.try_write().expect(
"in the context where we call this function, no other task has access to the object",
@@ -1599,10 +1594,16 @@ impl Timeline {
layers.initialize_empty(Lsn(start_lsn.0));
}
///
/// Scan the timeline directory to populate the layer map.
///
pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
/// Scan the timeline directory, cleanup, populate the layer map, and schedule uploads for local-only
/// files.
pub(super) async fn load_layer_map(
&self,
disk_consistent_lsn: Lsn,
index_part: Option<IndexPart>,
) -> anyhow::Result<()> {
use init::{Decision::*, Discovered, FutureLayer};
use LayerFileName::*;
let mut guard = self.layers.write().await;
let timer = self.metrics.load_layer_map_histo.start_timer();
@@ -1610,102 +1611,150 @@ impl Timeline {
// Scan timeline directory and create ImageFileName and DeltaFilename
// structs representing all files on disk
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
// total size of layer files in the current timeline directory
let mut total_physical_size = 0;
let (conf, tenant_id, timeline_id) = (self.conf, self.tenant_id, self.timeline_id);
let span = tracing::Span::current();
let mut loaded_layers = Vec::<Arc<dyn PersistentLayer>>::new();
let (loaded_layers, needs_upload, total_physical_size) = tokio::task::spawn_blocking({
move || {
let _g = span.entered();
let discovered = init::scan_timeline_dir(&timeline_path)?;
let mut discovered_layers = Vec::with_capacity(discovered.len());
let mut unrecognized_files = Vec::new();
for direntry in fs::read_dir(timeline_path)? {
let direntry = direntry?;
let direntry_path = direntry.path();
let fname = direntry.file_name();
let fname = fname.to_string_lossy();
let mut path = timeline_path;
if let Some(filename) = ImageFileName::parse_str(&fname) {
// create an ImageLayer struct for each image file.
if filename.lsn > disk_consistent_lsn {
info!(
"found future image layer {} on timeline {} disk_consistent_lsn is {}",
filename, self.timeline_id, disk_consistent_lsn
);
rename_to_backup(&direntry_path)?;
continue;
for discovered in discovered {
let (name, kind) = match discovered {
Discovered::Layer(file_name, file_size) => {
discovered_layers.push((file_name, file_size));
continue;
}
Discovered::Metadata | Discovered::IgnoredBackup => {
continue;
}
Discovered::Unknown(file_name) => {
// we will later error if there are any
unrecognized_files.push(file_name);
continue;
}
Discovered::Ephemeral(name) => (name, "old ephemeral file"),
Discovered::Temporary(name) => (name, "temporary timeline file"),
Discovered::TemporaryDownload(name) => (name, "temporary download"),
};
path.push(name);
init::cleanup(&path, kind)?;
path.pop();
}
let file_size = direntry_path.metadata()?.len();
let stats =
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
let layer = ImageLayer::new(
self.conf,
self.timeline_id,
self.tenant_id,
&filename,
file_size,
stats,
);
total_physical_size += file_size;
loaded_layers.push(Arc::new(layer));
} else if let Some(filename) = DeltaFileName::parse_str(&fname) {
// Create a DeltaLayer struct for each delta file.
// The end-LSN is exclusive, while disk_consistent_lsn is
// inclusive. For example, if disk_consistent_lsn is 100, it is
// OK for a delta layer to have end LSN 101, but if the end LSN
// is 102, then it might not have been fully flushed to disk
// before crash.
if filename.lsn_range.end > disk_consistent_lsn + 1 {
info!(
"found future delta layer {} on timeline {} disk_consistent_lsn is {}",
filename, self.timeline_id, disk_consistent_lsn
if !unrecognized_files.is_empty() {
// assume that if there are any there are many many.
let n = unrecognized_files.len();
let first = &unrecognized_files[..n.min(10)];
anyhow::bail!(
"unrecognized files in timeline dir (total {n}), first 10: {first:?}"
);
rename_to_backup(&direntry_path)?;
continue;
}
let file_size = direntry_path.metadata()?.len();
let stats =
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
let decided =
init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn);
let layer = DeltaLayer::new(
self.conf,
self.timeline_id,
self.tenant_id,
&filename,
file_size,
stats,
);
let mut loaded_layers = Vec::new();
let mut needs_upload = Vec::new();
let mut total_physical_size = 0;
total_physical_size += file_size;
loaded_layers.push(Arc::new(layer));
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
// ignore these
} else if remote_timeline_client::is_temp_download_file(&direntry_path) {
info!(
"skipping temp download file, reconcile_with_remote will resume / clean up: {}",
fname
);
} else if is_ephemeral_file(&fname) {
// Delete any old ephemeral files
trace!("deleting old ephemeral file in timeline dir: {}", fname);
fs::remove_file(&direntry_path)?;
} else if is_temporary(&direntry_path) {
info!("removing temp timeline file at {}", direntry_path.display());
fs::remove_file(&direntry_path).with_context(|| {
format!(
"failed to remove temp download file at {}",
direntry_path.display()
)
})?;
} else {
warn!("unrecognized filename in timeline dir: {}", fname);
for (name, decision) in decided {
let decision = match decision {
Ok(UseRemote { local, remote }) => {
path.push(name.file_name());
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
path.pop();
UseRemote { local, remote }
}
Ok(decision) => decision,
Err(FutureLayer { local }) => {
if local.is_some() {
path.push(name.file_name());
init::cleanup_future_layer(&path, name, disk_consistent_lsn)?;
path.pop();
} else {
// we cannot do anything for remote layers, but not continuing to
// process it will leave it out index_part.json as well.
}
//
// we do not currently schedule deletions for these.
continue;
}
};
match &name {
Delta(d) => assert!(d.lsn_range.end <= disk_consistent_lsn + 1),
Image(i) => assert!(i.lsn <= disk_consistent_lsn),
}
let status = match &decision {
UseLocal(_) | NeedsUpload(_) => LayerResidenceStatus::Resident,
Evicted(_) | UseRemote { .. } => LayerResidenceStatus::Evicted,
};
let stats = LayerAccessStats::for_loading_layer(status);
let layer: Arc<dyn PersistentLayer> = match (name, &decision) {
(Delta(d), UseLocal(m) | NeedsUpload(m)) => {
total_physical_size += m.file_size();
Arc::new(DeltaLayer::new(
conf,
timeline_id,
tenant_id,
&d,
m.file_size(),
stats,
))
}
(Image(i), UseLocal(m) | NeedsUpload(m)) => {
total_physical_size += m.file_size();
Arc::new(ImageLayer::new(
conf,
timeline_id,
tenant_id,
&i,
m.file_size(),
stats,
))
}
(Delta(d), Evicted(remote) | UseRemote { remote, .. }) => Arc::new(
RemoteLayer::new_delta(tenant_id, timeline_id, &d, remote, stats),
),
(Image(i), Evicted(remote) | UseRemote { remote, .. }) => Arc::new(
RemoteLayer::new_img(tenant_id, timeline_id, &i, remote, stats),
),
};
if let NeedsUpload(m) = decision {
needs_upload.push((layer.clone(), m));
}
loaded_layers.push(layer);
}
Ok((loaded_layers, needs_upload, total_physical_size))
}
}
})
.await
.map_err(anyhow::Error::new)
.and_then(|x| x)?;
let num_layers = loaded_layers.len();
guard.initialize_local_layers(loaded_layers, Lsn(disk_consistent_lsn.0) + 1);
guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
if let Some(rtc) = self.remote_client.as_ref() {
for (layer, m) in needs_upload {
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
}
rtc.schedule_index_upload_for_file_changes()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
}
info!(
"loaded layer map with {} layers at {}, total physical size: {}",
@@ -1716,236 +1765,6 @@ impl Timeline {
.set(total_physical_size);
timer.stop_and_record();
Ok(())
}
async fn create_remote_layers(
&self,
index_part: &IndexPart,
local_layers: HashMap<LayerFileName, Arc<dyn PersistentLayer>>,
up_to_date_disk_consistent_lsn: Lsn,
) -> anyhow::Result<HashMap<LayerFileName, Arc<dyn PersistentLayer>>> {
// Are we missing some files that are present in remote storage?
// Create RemoteLayer instances for them.
let mut local_only_layers = local_layers;
// We're holding a layer map lock for a while but this
// method is only called during init so it's fine.
let mut guard = self.layers.write().await;
let mut corrupted_local_layers = Vec::new();
let mut added_remote_layers = Vec::new();
for remote_layer_name in index_part.layer_metadata.keys() {
let local_layer = local_only_layers.remove(remote_layer_name);
let remote_layer_metadata = index_part
.layer_metadata
.get(remote_layer_name)
.map(LayerFileMetadata::from)
.with_context(|| {
format!(
"No remote layer metadata found for layer {}",
remote_layer_name.file_name()
)
})?;
// Is the local layer's size different from the size stored in the
// remote index file?
// If so, rename_to_backup those files & replace their local layer with
// a RemoteLayer in the layer map so that we re-download them on-demand.
if let Some(local_layer) = local_layer {
let local_layer_path = local_layer
.local_path()
.expect("caller must ensure that local_layers only contains local layers");
ensure!(
local_layer_path.exists(),
"every layer from local_layers must exist on disk: {}",
local_layer_path.display()
);
let remote_size = remote_layer_metadata.file_size();
let metadata = local_layer_path.metadata().with_context(|| {
format!(
"get file size of local layer {}",
local_layer_path.display()
)
})?;
let local_size = metadata.len();
if local_size != remote_size {
warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
if let Err(err) = rename_to_backup(&local_layer_path) {
assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
} else {
self.metrics.resident_physical_size_gauge.sub(local_size);
corrupted_local_layers.push(local_layer);
// fall-through to adding the remote layer
}
} else {
debug!(
"layer is present locally and file size matches remote, using it: {}",
local_layer_path.display()
);
continue;
}
}
info!(
"remote layer does not exist locally, creating remote layer: {}",
remote_layer_name.file_name()
);
match remote_layer_name {
LayerFileName::Image(imgfilename) => {
if imgfilename.lsn > up_to_date_disk_consistent_lsn {
info!(
"found future image layer {} on timeline {} remote_consistent_lsn is {}",
imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
);
continue;
}
let stats =
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
let remote_layer = RemoteLayer::new_img(
self.tenant_id,
self.timeline_id,
imgfilename,
&remote_layer_metadata,
stats,
);
let remote_layer = Arc::new(remote_layer);
added_remote_layers.push(remote_layer);
}
LayerFileName::Delta(deltafilename) => {
// Create a RemoteLayer for the delta file.
// The end-LSN is exclusive, while disk_consistent_lsn is
// inclusive. For example, if disk_consistent_lsn is 100, it is
// OK for a delta layer to have end LSN 101, but if the end LSN
// is 102, then it might not have been fully flushed to disk
// before crash.
if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
info!(
"found future delta layer {} on timeline {} remote_consistent_lsn is {}",
deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
);
continue;
}
let stats =
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
let remote_layer = RemoteLayer::new_delta(
self.tenant_id,
self.timeline_id,
deltafilename,
&remote_layer_metadata,
stats,
);
let remote_layer = Arc::new(remote_layer);
added_remote_layers.push(remote_layer);
}
}
}
guard.initialize_remote_layers(corrupted_local_layers, added_remote_layers);
Ok(local_only_layers)
}
/// This function will synchronize local state with what we have in remote storage.
///
/// Steps taken:
/// 1. Initialize upload queue based on `index_part`.
/// 2. Create `RemoteLayer` instances for layers that exist only on the remote.
/// The list of layers on the remote comes from `index_part`.
/// The list of local layers is given by the layer map's `iter_historic_layers()`.
/// So, the layer map must have been loaded already.
/// 3. Schedule upload of local-only layer files (which will then also update the remote
/// IndexPart to include the new layer files).
///
/// Refer to the [`remote_timeline_client`] module comment for more context.
///
/// # TODO
/// May be a bit cleaner to do things based on populated remote client,
/// and then do things based on its upload_queue.latest_files.
#[instrument(skip(self, index_part, up_to_date_metadata))]
pub async fn reconcile_with_remote(
&self,
up_to_date_metadata: &TimelineMetadata,
index_part: Option<&IndexPart>,
) -> anyhow::Result<()> {
info!("starting");
let remote_client = self
.remote_client
.as_ref()
.ok_or_else(|| anyhow!("cannot download without remote storage"))?;
let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn();
let local_layers = {
let guard = self.layers.read().await;
let layers = guard.layer_map();
layers
.iter_historic_layers()
.map(|l| (l.filename(), guard.get_from_desc(&l)))
.collect::<HashMap<_, _>>()
};
// If no writes happen, new branches do not have any layers, only the metadata file.
let has_local_layers = !local_layers.is_empty();
let local_only_layers = match index_part {
Some(index_part) => {
info!(
"initializing upload queue from remote index with {} layer files",
index_part.layer_metadata.len()
);
remote_client.init_upload_queue(index_part)?;
self.create_remote_layers(index_part, local_layers, disk_consistent_lsn)
.await?
}
None => {
info!("initializing upload queue as empty");
remote_client.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
local_layers
}
};
if has_local_layers {
// Are there local files that don't exist remotely? Schedule uploads for them.
// Local timeline metadata will get uploaded to remove along witht he layers.
for (layer_name, layer) in &local_only_layers {
// XXX solve this in the type system
let layer_path = layer
.local_path()
.expect("local_only_layers only contains local layers");
let layer_size = layer_path
.metadata()
.with_context(|| format!("failed to get file {layer_path:?} metadata"))?
.len();
info!("scheduling {layer_path:?} for upload");
remote_client
.schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
}
remote_client.schedule_index_upload_for_file_changes()?;
} else if index_part.is_none() {
// No data on the remote storage, no local layers, local metadata file.
//
// TODO https://github.com/neondatabase/neon/issues/3865
// Currently, console does not wait for the timeline data upload to the remote storage
// and considers the timeline created, expecting other pageserver nodes to work with it.
// Branch metadata upload could get interrupted (e.g pageserver got killed),
// hence any locally existing branch metadata with no remote counterpart should be uploaded,
// otherwise any other pageserver won't see the branch on `attach`.
//
// After the issue gets implemented, pageserver should rather remove the branch,
// since absence on S3 means we did not acknowledge the branch creation and console will have to retry,
// no need to keep the old files.
remote_client.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
} else {
// Local timeline has a metadata file, remote one too, both have no layers to sync.
}
info!("Done");
Ok(())
}
@@ -2852,7 +2671,6 @@ impl Timeline {
if let Some(ref l) = delta_layer_to_add {
// TODO: move access stats, metrics update, etc. into layer manager.
l.access_stats().record_residence_event(
&guard,
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
@@ -3241,7 +3059,6 @@ impl Timeline {
.add(metadata.len());
let l = Arc::new(l);
l.access_stats().record_residence_event(
&guard,
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
@@ -3921,7 +3738,6 @@ impl Timeline {
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
l.access_stats().record_residence_event(
&guard,
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
@@ -4840,7 +4656,8 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
for i in 0u32.. {
new_path.set_file_name(format!("{filename}.{i}.old"));
if !new_path.exists() {
std::fs::rename(path, &new_path)?;
std::fs::rename(path, &new_path)
.with_context(|| format!("rename {path:?} to {new_path:?}"))?;
return Ok(());
}
}

View File

@@ -0,0 +1,199 @@
use crate::{
is_temporary,
tenant::{
ephemeral_file::is_ephemeral_file,
remote_timeline_client::{
self,
index::{IndexPart, LayerFileMetadata},
},
storage_layer::LayerFileName,
},
METADATA_FILE_NAME,
};
use anyhow::Context;
use std::{collections::HashMap, ffi::OsString, path::Path, str::FromStr};
use utils::lsn::Lsn;
/// Identified files in the timeline directory.
pub(super) enum Discovered {
/// The only one we care about
Layer(LayerFileName, u64),
/// Old ephmeral files from previous launches, should be removed
Ephemeral(OsString),
/// Old temporary timeline files, unsure what these really are, should be removed
Temporary(OsString),
/// Temporary on-demand download files, should be removed
TemporaryDownload(OsString),
/// "metadata" file we persist locally and include in `index_part.json`
Metadata,
/// Backup file from previously future layers
IgnoredBackup,
/// Unrecognized, warn about these
Unknown(OsString),
}
/// Scans the timeline directory for interesting files.
pub(super) fn scan_timeline_dir(path: &Path) -> anyhow::Result<Vec<Discovered>> {
let mut ret = Vec::new();
for direntry in std::fs::read_dir(path)? {
let direntry = direntry?;
let direntry_path = direntry.path();
let file_name = direntry.file_name();
let fname = file_name.to_string_lossy();
let discovered = match LayerFileName::from_str(&fname) {
Ok(file_name) => {
let file_size = direntry.metadata()?.len();
Discovered::Layer(file_name, file_size)
}
Err(_) => {
if fname == METADATA_FILE_NAME {
Discovered::Metadata
} else if fname.ends_with(".old") {
// ignore these
Discovered::IgnoredBackup
} else if remote_timeline_client::is_temp_download_file(&direntry_path) {
Discovered::TemporaryDownload(file_name)
} else if is_ephemeral_file(&fname) {
Discovered::Ephemeral(file_name)
} else if is_temporary(&direntry_path) {
Discovered::Temporary(file_name)
} else {
Discovered::Unknown(file_name)
}
}
};
ret.push(discovered);
}
Ok(ret)
}
/// Decision on what to do with a layer file after considering its local and remote metadata.
#[derive(Clone)]
pub(super) enum Decision {
/// The layer is not present locally.
Evicted(LayerFileMetadata),
/// The layer is present locally, but local metadata does not match remote; we must
/// delete it and treat it as evicted.
UseRemote {
local: LayerFileMetadata,
remote: LayerFileMetadata,
},
/// The layer is present locally, and metadata matches.
UseLocal(LayerFileMetadata),
/// The layer is only known locally, it needs to be uploaded.
NeedsUpload(LayerFileMetadata),
}
/// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded.
#[derive(Debug)]
pub(super) struct FutureLayer {
/// The local metadata. `None` if the layer is only known through [`IndexPart`].
pub(super) local: Option<LayerFileMetadata>,
}
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
///
/// This function should not gain additional reasons to fail than [`FutureLayer`], consider adding
/// the checks earlier to [`scan_timeline_dir`].
pub(super) fn reconcile(
discovered: Vec<(LayerFileName, u64)>,
index_part: Option<&IndexPart>,
disk_consistent_lsn: Lsn,
) -> Vec<(LayerFileName, Result<Decision, FutureLayer>)> {
use Decision::*;
// name => (local, remote)
type Collected = HashMap<LayerFileName, (Option<LayerFileMetadata>, Option<LayerFileMetadata>)>;
let mut discovered = discovered
.into_iter()
.map(|(name, file_size)| (name, (Some(LayerFileMetadata::new(file_size)), None)))
.collect::<Collected>();
// merge any index_part information, when available
index_part
.as_ref()
.map(|ip| ip.layer_metadata.iter())
.into_iter()
.flatten()
.map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
.for_each(|(name, metadata)| {
if let Some(existing) = discovered.get_mut(name) {
existing.1 = Some(metadata);
} else {
discovered.insert(name.to_owned(), (None, Some(metadata)));
}
});
discovered
.into_iter()
.map(|(name, (local, remote))| {
let decision = if name.is_in_future(disk_consistent_lsn) {
Err(FutureLayer { local })
} else {
Ok(match (local, remote) {
(Some(local), Some(remote)) if local != remote => UseRemote { local, remote },
(Some(x), Some(_)) => UseLocal(x),
(None, Some(x)) => Evicted(x),
(Some(x), None) => NeedsUpload(x),
(None, None) => {
unreachable!("there must not be any non-local non-remote files")
}
})
};
(name, decision)
})
.collect::<Vec<_>>()
}
pub(super) fn cleanup(path: &Path, kind: &str) -> anyhow::Result<()> {
let file_name = path.file_name().expect("must be file path");
tracing::debug!(kind, ?file_name, "cleaning up");
std::fs::remove_file(path)
.with_context(|| format!("failed to remove {kind} at {}", path.display()))
}
pub(super) fn cleanup_local_file_for_remote(
path: &Path,
local: &LayerFileMetadata,
remote: &LayerFileMetadata,
) -> anyhow::Result<()> {
let local_size = local.file_size();
let remote_size = remote.file_size();
let file_name = path.file_name().expect("must be file path");
tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
assert!(
path.exists(),
"we would leave the local_layer without a file if this does not hold: {}",
path.display()
);
Err(err)
} else {
Ok(())
}
}
pub(super) fn cleanup_future_layer(
path: &Path,
name: LayerFileName,
disk_consistent_lsn: Lsn,
) -> anyhow::Result<()> {
use LayerFileName::*;
let kind = match name {
Delta(_) => "delta",
Image(_) => "image",
};
// future image layers are allowed to be produced always for not yet flushed to disk
// lsns stored in InMemoryLayer.
tracing::info!("found future {kind} layer {name} disk_consistent_lsn is {disk_consistent_lsn}");
crate::tenant::timeline::rename_to_backup(path)?;
Ok(())
}

View File

@@ -13,7 +13,7 @@ use crate::{
layer_map::{BatchedUpdates, LayerMap},
storage_layer::{
AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, PersistentLayer,
PersistentLayerDesc, PersistentLayerKey, RemoteLayer,
PersistentLayerDesc, PersistentLayerKey,
},
timeline::compare_arced_layers,
},
@@ -85,21 +85,6 @@ impl LayerManager {
self.layer_map.next_open_layer_at = Some(next_open_layer_at);
}
pub(crate) fn initialize_remote_layers(
&mut self,
corrupted_local_layers: Vec<Arc<dyn PersistentLayer>>,
remote_layers: Vec<Arc<RemoteLayer>>,
) {
let mut updates = self.layer_map.batch_update();
for layer in corrupted_local_layers {
Self::remove_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
}
for layer in remote_layers {
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
}
updates.flush();
}
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
/// called within `get_layer_for_write`.
pub(crate) fn get_layer_for_write(
@@ -265,16 +250,6 @@ impl LayerManager {
mapping.insert(layer);
}
/// Helper function to remove a layer into the layer map and file manager
fn remove_historic_layer(
layer: Arc<dyn PersistentLayer>,
updates: &mut BatchedUpdates<'_>,
mapping: &mut LayerFileManager,
) {
updates.remove_historic(layer.layer_desc());
mapping.remove(layer);
}
/// Removes the layer from local FS (if present) and from memory.
/// Remote storage is not affected by this operation.
fn delete_historic_layer(

View File

@@ -17,7 +17,7 @@ use crate::metrics::{
WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
};
use crate::task_mgr::TaskKind;
use crate::task_mgr::{shutdown_token, TaskKind};
use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
@@ -211,11 +211,14 @@ async fn subscribe_for_timeline_updates(
id: TenantTimelineId,
) -> Streaming<SafekeeperTimelineInfo> {
let mut attempt = 0;
let cancel = shutdown_token();
loop {
exponential_backoff(
attempt,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
&cancel,
)
.await;
attempt += 1;

View File

@@ -14,6 +14,7 @@
*/
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/statvfs.h>
#include <unistd.h>
#include <fcntl.h>
@@ -59,12 +60,17 @@
* 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed
*/
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
#define CHUNK_SIZE (BLOCKS_PER_CHUNK * BLCKSZ)
#define MB ((uint64)1024*1024)
#ifndef MADV_REMOVE
#define MADV_REMOVE MADV_FREE /* MacOS doesn't have MADV_REMOVE and at Linux MADV_FREE works only for MAP_PRIVATE */
#endif
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */
#define MAX_DISK_WRITE_RATE 1000 /* MB/sec */
#define MAX_MEM_WRITE_RATE 10000 /* MB/sec */
typedef struct FileCacheEntry
{
@@ -83,21 +89,56 @@ typedef struct FileCacheControl
} FileCacheControl;
static HTAB* lfc_hash;
static int lfc_desc;
static LWLockId lfc_lock;
static int lfc_max_size;
static int lfc_max_mem;
static int lfc_size_limit;
static int lfc_free_space_watermark;
static int lfc_free_memory_watermark;
static char* lfc_base_addr;
static char* lfc_path;
static FileCacheControl* lfc_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
#endif
static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */
static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark or lfc_free_memory_watermak are reached */
void FileCacheMonitorMain(Datum main_arg);
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
static size_t
get_available_memory(void)
{
size_t total;
size_t sizeof_total = sizeof(total);
if (sysctlbyname("hw.memsize", &total, &sizeof_total, NULL, 0) < 0)
elog(ERROR, "Failed to get amount of RAM: %m");
return total;
}
#else
#include <sys/sysinfo.h>
static size_t
get_available_memory(void)
{
struct sysinfo si;
if (sysinfo(&si) < 0)
elog(ERROR, "Failed to get amount of RAM: %m");
return si.totalram*si.mem_unit;
}
#endif
static void
lfc_shmem_startup(void)
{
@@ -111,7 +152,7 @@ lfc_shmem_startup(void)
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl) + lfc_max_size*MB + CHUNK_SIZE, &found);
if (!found)
{
uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
@@ -126,10 +167,10 @@ lfc_shmem_startup(void)
lfc_ctl->size = 0;
lfc_ctl->used = 0;
dlist_init(&lfc_ctl->lru);
/* Remove file cache on restart */
(void)unlink(lfc_path);
}
lfc_base_addr = (char*)TYPEALIGN(CHUNK_SIZE, lfc_ctl+1);
if (!found)
madvise(lfc_base_addr, lfc_max_size*MB, MADV_REMOVE);
LWLockRelease(AddinShmemInitLock);
}
@@ -141,7 +182,9 @@ lfc_shmem_request(void)
prev_shmem_request_hook();
#endif
RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry)));
lfc_max_size = Min(lfc_max_size, lfc_max_mem);
lfc_size_limit = Min(lfc_size_limit, lfc_max_mem);
RequestAddinShmemSpace(sizeof(FileCacheControl) + lfc_max_size*MB + CHUNK_SIZE + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry)));
RequestNamedLWLockTranche("lfc_lock", 1);
}
@@ -167,26 +210,14 @@ lfc_change_limit_hook(int newval, void *extra)
if (!lfc_ctl || !UsedShmemSegAddr || IsParallelWorker())
return;
/* Open cache file if not done yet */
if (lfc_desc == 0)
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
if (lfc_desc < 0) {
elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
lfc_size_limit = 0; /* disable file cache */
return;
}
}
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
{
/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
Assert(victim->access_count == 0);
#ifdef FALLOC_FL_PUNCH_HOLE
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0)
elog(LOG, "Failed to punch hole in file: %m");
#endif
if (madvise(lfc_base_addr + victim->offset*CHUNK_SIZE, CHUNK_SIZE, MADV_REMOVE) < 0)
elog(LOG, "Failed to punch hole in memory: %m");
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
lfc_ctl->used -= 1;
}
@@ -195,10 +226,11 @@ lfc_change_limit_hook(int newval, void *extra)
}
/*
* Local file system state monitor check available free space.
* If it is lower than lfc_free_space_watermark then we shrink size of local cache
* Local file system state monitor check available free space and memory.
* If available disk space is lower than lfc_free_space_watermark or
* available memory is lower than lfc_free_memory_watermark then we shrink size of local cache
* but throwing away least recently accessed chunks.
* First time low space watermark is reached cache size is divided by two,
* First time the watermark is reached cache size is divided by two,
* second time by four,... Finally we remove all chunks from local cache.
*
* Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler.
@@ -213,9 +245,9 @@ FileCacheMonitorMain(Datum main_arg)
{
/*
* Choose file system state monitor interval so that space can not be exosted
* during this period but not longer than MAX_MONITOR_INTERVAL (10 sec)
* during this period but not longer than MAX_MONITOR_INTERVAL (1 sec)
*/
uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE);
uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_MEM_WRITE_RATE);
/* Establish signal handlers. */
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
@@ -226,25 +258,17 @@ FileCacheMonitorMain(Datum main_arg)
/* Periodically dump buffers until terminated. */
while (!ShutdownRequestPending)
{
if (lfc_size_limit != 0)
if (lfc_size_limit != 0 && lfc_free_memory_watermark != 0 )
{
struct statvfs sfs;
if (statvfs(lfc_path, &sfs) < 0)
if (get_available_memory() < lfc_free_memory_watermark*MB)
{
elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
if (lfc_shrinking_factor < 31) {
lfc_shrinking_factor += 1;
}
lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
}
else
{
if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
{
if (lfc_shrinking_factor < 31) {
lfc_shrinking_factor += 1;
}
lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
}
else
lfc_shrinking_factor = 0; /* reset to initial value */
}
lfc_shrinking_factor = 0; /* reset to initial value */
}
pg_usleep(monitor_interval);
}
@@ -278,6 +302,18 @@ lfc_init(void)
if (!process_shared_preload_libraries_in_progress)
elog(ERROR, "Neon module should be loaded via shared_preload_libraries");
/* TODO: left only for compatibility with on-disk cache */
DefineCustomStringVariable("neon.file_cache_path",
"Path to local file cache (can be raw device)",
NULL,
&lfc_path,
"file.cache",
PGC_POSTMASTER,
0,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.max_file_cache_size",
"Maximal size of Neon local file cache",
NULL,
@@ -291,6 +327,19 @@ lfc_init(void)
NULL,
NULL);
DefineCustomIntVariable("neon.max_inmem_cache_size",
"Maximal size used by Neon local file cache in memory",
NULL,
&lfc_max_mem,
128, /* 128Mb */
0,
INT_MAX,
PGC_POSTMASTER,
GUC_UNIT_MB,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.file_cache_size_limit",
"Current limit for size of Neon local file cache",
NULL,
@@ -304,11 +353,11 @@ lfc_init(void)
lfc_change_limit_hook,
NULL);
DefineCustomIntVariable("neon.free_space_watermark",
"Minimal free space in local file system after reaching which local file cache will be truncated",
DefineCustomIntVariable("neon.free_memory_watermark",
"Minimal free memory in system after reaching which local file cache will be truncated",
NULL,
&lfc_free_space_watermark,
1024, /* 1GB */
&lfc_free_memory_watermark,
0, /* disabled by default, because iurt makes sense only when local file cache is located i tmpfs */
0,
INT_MAX,
PGC_SIGHUP,
@@ -317,17 +366,6 @@ lfc_init(void)
NULL,
NULL);
DefineCustomStringVariable("neon.file_cache_path",
"Path to local file cache (can be raw device)",
NULL,
&lfc_path,
"file.cache",
PGC_POSTMASTER,
0,
NULL,
NULL,
NULL);
if (lfc_max_size == 0)
return;
@@ -476,27 +514,7 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
dlist_delete(&entry->lru_node);
LWLockRelease(lfc_lock);
/* Open cache file if not done yet */
if (lfc_desc == 0)
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
if (lfc_desc < 0) {
elog(LOG, "Failed to open file cache %s: %m", lfc_path);
lfc_size_limit = 0; /* disable file cache */
result = false;
}
}
if (lfc_desc > 0)
{
rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
if (rc != BLCKSZ)
{
elog(INFO, "Failed to read file cache: %m");
lfc_size_limit = 0; /* disable file cache */
result = false;
}
}
memcpy(buffer, lfc_base_addr + ((size_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ, BLCKSZ);
/* Place entry to the head of LRU list */
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -569,24 +587,8 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
}
LWLockRelease(lfc_lock);
/* Open cache file if not done yet */
if (lfc_desc == 0)
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
if (lfc_desc < 0) {
elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
lfc_size_limit = 0; /* disable file cache */
}
}
if (lfc_desc > 0)
{
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
if (rc != BLCKSZ)
{
elog(WARNING, "Failed to write file cache: %m, disabling file cache");
lfc_size_limit = 0; /* disable file cache */
}
}
memcpy(lfc_base_addr + ((size_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ, buffer, BLCKSZ);
/* Place entry to the head of LRU list */
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
Assert(entry->access_count > 0);

View File

@@ -1,76 +0,0 @@
#! /usr/bin/env python3
# Script to generate ext_index.json metadata file
# that stores content of the control files and location of extension archives
# for all extensions in extensions subdir.
import argparse
import json
import subprocess
from pathlib import Path
"""
# ext_index.json example:
{
"public_extensions": [
"anon"
],
"library_index": {
"anon": "anon",
// for more complex extensions like postgis
// we might have something like:
// address_standardizer: postgis
// postgis_tiger: postgis
},
"extension_data": {
"anon": {
"control_data": {
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
},
"archive_path": "5648391853/v15/extensions/anon.tar.zst"
}
}
}
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="generate ext_index.json")
parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version")
parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image")
parser.add_argument("--public_extensions", type=str, help="list of public extensions")
args = parser.parse_args()
pg_version = args.pg_version
BUILD_TAG = args.BUILD_TAG
public_ext_list = args.public_extensions.split(",")
ext_index = {}
library_index = {}
EXT_PATH = Path("extensions")
for extension in EXT_PATH.iterdir():
if extension.is_dir():
control_data = {}
for control_file in extension.glob("*.control"):
if control_file.suffix != ".control":
continue
with open(control_file, "r") as f:
control_data[control_file.name] = f.read()
ext_index[extension.name] = {
"control_data": control_data,
"archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst",
}
elif extension.suffix == ".zst":
file_list = (
str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8")
.strip()
.split("\n")
)
for file in file_list:
if file.endswith(".so") and file.startswith("lib/"):
lib_name = file[4:-3]
library_index[lib_name] = extension.name.replace(".tar.zst", "")
all_data = {
"public_extensions": public_ext_list,
"library_index": library_index,
"extension_data": ext_index,
}
with open("ext_index.json", "w") as f:
json.dump(all_data, f)

View File

@@ -369,7 +369,7 @@ def test_download_remote_layers_api(
filled_current_physical = get_api_current_physical_size()
log.info(filled_current_physical)
filled_size = get_resident_physical_size()
log.info(filled_size)
log.info(f"filled_size: {filled_size}")
assert filled_current_physical == filled_size, "we don't yet do layer eviction"
env.pageserver.stop()
@@ -377,7 +377,7 @@ def test_download_remote_layers_api(
# remove all the layer files
# XXX only delete some of the layer files, to show that it really just downloads all the layers
for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer}")
log.info(f"unlinking layer {layer.name}")
layer.unlink()
# Shut down safekeepers before starting the pageserver.
@@ -403,7 +403,7 @@ def test_download_remote_layers_api(
filled_current_physical == get_api_current_physical_size()
), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote"
post_unlink_size = get_resident_physical_size()
log.info(post_unlink_size)
log.info(f"post_unlink_size: {post_unlink_size}")
assert (
post_unlink_size < filled_size
), "we just deleted layers and didn't cause anything to re-download them yet"