Compare commits

..

22 Commits

Author SHA1 Message Date
Alex Chi
19180e167f remove LayerKey usage
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-28 10:00:02 -04:00
Alex Chi
b2cd142836 fix tests
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-28 09:58:31 -04:00
Alex Chi
f2d7baf0ba rename DeleteGuard -> LayerDeletionGuard
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 16:57:11 -04:00
Alex Chi
113a4256d4 rename lcache to layer_cache
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 16:55:56 -04:00
Alex Chi
be4999713a add comments for LayerCache
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 16:54:51 -04:00
Alex Chi
7335f155c3 fmt
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 10:31:15 -04:00
Alex Chi
a1ca70ff35 Merge branch 'skyzh/layermap-ref-2' of https://github.com/neondatabase/neon into skyzh/layermap-as-cache 2023-06-27 10:26:13 -04:00
Alex Chi
ce1e57faea fix merge conflicts
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 10:25:06 -04:00
Alex Chi
6f50bec781 fix merge conflicts
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 10:24:48 -04:00
Alex Chi
b981702ecf Merge branch 'skyzh/layermap-ref-2' of https://github.com/neondatabase/neon into skyzh/layermap-as-cache 2023-06-27 10:23:08 -04:00
Alex Chi
21d30fc43f use layer_desc key for replace cmp
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 10:16:30 -04:00
Alex Chi
137ad83f37 fix merge conflicts
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 10:14:09 -04:00
Alex Chi
22da36bc02 fix merge conflicts
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 10:04:21 -04:00
Alex Chi
900ef3d92b Merge branch 'main' of https://github.com/neondatabase/neon into skyzh/layermap-ref-2 2023-06-27 10:02:57 -04:00
Alex Chi
b7923fa0be use new errmsg
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-23 16:27:49 -04:00
Alex Chi
4c4a531d5e rename LayerMapping -> LayerFileManager
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-23 15:52:09 -04:00
Alex Chi
2b4f96345b resolve comments
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-23 15:32:57 -04:00
Alex Chi
b775ca8a58 resolve conflicts
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-06-20 09:52:28 -04:00
Alex Chi
ddb5862be2 Merge branch 'main' of https://github.com/neondatabase/neon into skyzh/layermap-ref-2 2023-06-20 09:12:15 -04:00
Alex Chi
a2056666ae pgserver: move mapping logic to layer cache
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-06-14 15:07:38 -04:00
Alex Chi
fc190a2a19 resolve merge conflicts
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-06-13 13:56:50 -04:00
Alex Chi
faee3152f3 refactor: use LayerDesc in LayerMap (part 2)
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-06-13 13:54:59 -04:00
35 changed files with 591 additions and 849 deletions

View File

@@ -722,35 +722,6 @@ jobs:
--dockerfile Dockerfile.compute-node
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--cleanup
# Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
# During the transition period we need to have extensions in both places (in S3 and in compute-node image),
# so we won't build extension twice, but extract them from compute-node.
#
# For now we use extensions image only for new custom extensitons
- name: Kaniko build extensions only
run: |
# Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't.
# Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
# it still fails with error:
# error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
#
# Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--context . \
--build-arg GIT_VERSION=${{ github.sha }} \
--build-arg PG_VERSION=${{ matrix.version }} \
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
--dockerfile Dockerfile.compute-node \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--cleanup \
--target postgres-extensions
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
@@ -869,10 +840,8 @@ jobs:
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest
- name: Push images to production ECR
if: |
@@ -883,10 +852,8 @@ jobs:
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest
- name: Configure Docker Hub login
run: |
@@ -908,89 +875,16 @@ jobs:
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
upload-postgres-extensions-to-s3:
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
needs: [ tag, promote-images ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]
env:
# While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
# Later all the extensions will be moved to extensions image.
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
S3_BUCKETS: |
${{ github.ref_name == 'release' &&
'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
steps:
- name: Pull postgres-extensions image
run: |
docker pull ${EXTENSIONS_IMAGE}
docker pull ${COMPUTE_NODE_IMAGE}
- name: Create postgres-extensions container
id: create-container
run: |
EID=$(docker create ${EXTENSIONS_IMAGE} true)
echo "EID=${EID}" >> $GITHUB_OUTPUT
CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
echo "CID=${CID}" >> $GITHUB_OUTPUT
- name: Extract postgres-extensions from container
run: |
rm -rf ./extensions-to-upload ./custom-extensions # Just in case
# In compute image we have a bit different directory layout
mkdir -p extensions-to-upload/share
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib
# Delete Neon extensitons (they always present on compute-node image)
rm -rf ./extensions-to-upload/share/extension/neon*
rm -rf ./extensions-to-upload/lib/neon*
docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
for EXT_NAME in $(ls ./custom-extensions); do
mkdir -p ./extensions-to-upload/${EXT_NAME}/share
mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib
done
- name: Upload postgres-extensions to S3
run: |
for BUCKET in $(echo ${S3_BUCKETS}); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
done
- name: Cleanup
if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
run: |
docker rm ${{ steps.create-container.outputs.CID }} || true
docker rm ${{ steps.create-container.outputs.EID }} || true
deploy:
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
needs: [ promote-images, tag, regress-tests ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
steps:
- name: Fix git ownership

6
Cargo.lock generated
View File

@@ -4223,7 +4223,8 @@ dependencies = [
[[package]]
name = "tokio"
version = "1.28.1"
source = "git+https://github.com/problame/tokio.git?branch=problame/distinguish-core-and-worker-by-thread-name#d88791686cfc7fc7d010889ad7638d09646b3de7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105"
dependencies = [
"autocfg",
"bytes",
@@ -4250,7 +4251,8 @@ dependencies = [
[[package]]
name = "tokio-macros"
version = "2.1.0"
source = "git+https://github.com/problame/tokio.git?branch=problame/distinguish-core-and-worker-by-thread-name#d88791686cfc7fc7d010889ad7638d09646b3de7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
dependencies = [
"proc-macro2",
"quote",

View File

@@ -187,8 +187,6 @@ tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", re
# until async safekeepers patch is merged to the main.
sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }
tokio = { git = "https://github.com/problame/tokio.git", branch="problame/distinguish-core-and-worker-by-thread-name" }
################# Binary contents sections
[profile.release]

View File

@@ -515,26 +515,6 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
#########################################################################################
#
# Layer "pg-anon-pg-build"
# compile anon extension
#
#########################################################################################
FROM build-deps AS pg-anon-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sort > /before.txt && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
find /usr/local/pgsql -type f | sort > /after.txt && \
/bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
#########################################################################################
#
# Layer "rust extensions"
@@ -643,7 +623,6 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -
#
#########################################################################################
FROM build-deps AS neon-pg-ext-build
# Public extensions
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=postgis-build /sfcgal/* /
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -719,22 +698,6 @@ RUN rm -r /usr/local/pgsql/include
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Extenstion only
#
#########################################################################################
FROM scratch AS postgres-extensions
# After the transition this layer will include all extensitons.
# As for now, it's only for new custom ones
#
# # Default extensions
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib
# Custom extensions
COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
#########################################################################################
#
# Final layer

View File

@@ -235,7 +235,7 @@ impl ComputeNode {
// Get basebackup from the libpq connection to pageserver using `connstr` and
// unarchive it to `pgdata` directory overriding all its previous content.
#[instrument(skip_all, fields(%lsn))]
#[instrument(skip(self, compute_state))]
fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let start_time = Utc::now();
@@ -277,7 +277,7 @@ impl ComputeNode {
// Run `postgres` in a special mode with `--sync-safekeepers` argument
// and return the reported LSN back to the caller.
#[instrument(skip_all)]
#[instrument(skip(self, storage_auth_token))]
fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
let start_time = Utc::now();
@@ -322,7 +322,7 @@ impl ComputeNode {
/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
#[instrument(skip_all)]
#[instrument(skip(self, compute_state))]
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = &pspec.spec;
@@ -380,7 +380,7 @@ impl ComputeNode {
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
#[instrument(skip_all)]
#[instrument(skip(self))]
pub fn start_postgres(
&self,
storage_auth_token: Option<String>,
@@ -404,7 +404,7 @@ impl ComputeNode {
}
/// Do initial configuration of the already started Postgres.
#[instrument(skip_all)]
#[instrument(skip(self, compute_state))]
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
// If connection fails,
// it may be the old node with `zenith_admin` superuser.
@@ -458,7 +458,7 @@ impl ComputeNode {
// We could've wrapped this around `pg_ctl reload`, but right now we don't use
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
// have opened connection to Postgres and superuser access.
#[instrument(skip_all)]
#[instrument(skip(self, client))]
fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
client.simple_query("SELECT pg_reload_conf()")?;
Ok(())
@@ -466,7 +466,7 @@ impl ComputeNode {
/// Similar to `apply_config()`, but does a bit different sequence of operations,
/// as it's used to reconfigure a previously started and configured Postgres node.
#[instrument(skip_all)]
#[instrument(skip(self))]
pub fn reconfigure(&self) -> Result<()> {
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
@@ -501,7 +501,7 @@ impl ComputeNode {
Ok(())
}
#[instrument(skip_all)]
#[instrument(skip(self))]
pub fn start_compute(&self) -> Result<std::process::Child> {
let compute_state = self.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");

View File

@@ -8,7 +8,7 @@ use compute_api::responses::ComputeStatus;
use crate::compute::ComputeNode;
#[instrument(skip_all)]
#[instrument(skip(compute))]
fn configurator_main_loop(compute: &Arc<ComputeNode>) {
info!("waiting for reconfiguration requests");
loop {

View File

@@ -215,7 +215,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
/// Wait for Postgres to become ready to accept connections. It's ready to
/// accept connections when the state-field in `pgdata/postmaster.pid` says
/// 'ready'.
#[instrument(skip_all, fields(pgdata = %pgdata.display()))]
#[instrument(skip(pg))]
pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
let pid_path = pgdata.join("postmaster.pid");

View File

@@ -308,8 +308,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let mut env =
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
let force = init_match.get_flag("force");
env.init(pg_version, force)
env.init(pg_version)
.context("Failed to initialize neon repository")?;
// Initialize pageserver, create initial tenant and timeline.
@@ -1014,13 +1013,6 @@ fn cli() -> Command {
.help("If set, the node will be a hot replica on the specified timeline")
.required(false);
let force_arg = Arg::new("force")
.value_parser(value_parser!(bool))
.long("force")
.action(ArgAction::SetTrue)
.help("Force initialization even if the repository is not empty")
.required(false);
Command::new("Neon CLI")
.arg_required_else_help(true)
.version(GIT_VERSION)
@@ -1036,7 +1028,6 @@ fn cli() -> Command {
.value_name("config"),
)
.arg(pg_version_arg.clone())
.arg(force_arg)
)
.subcommand(
Command::new("timeline")

View File

@@ -364,7 +364,7 @@ impl LocalEnv {
//
// Initialize a new Neon repository
//
pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> {
// check if config already exists
let base_path = &self.base_data_dir;
ensure!(
@@ -372,29 +372,11 @@ impl LocalEnv {
"repository base path is missing"
);
if base_path.exists() {
if force {
println!("removing all contents of '{}'", base_path.display());
// instead of directly calling `remove_dir_all`, we keep the original dir but removing
// all contents inside. This helps if the developer symbol links another directory (i.e.,
// S3 local SSD) to the `.neon` base directory.
for entry in std::fs::read_dir(base_path)? {
let entry = entry?;
let path = entry.path();
if path.is_dir() {
fs::remove_dir_all(&path)?;
} else {
fs::remove_file(&path)?;
}
}
} else {
bail!(
"directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
base_path.display()
);
}
}
ensure!(
!base_path.exists(),
"directory '{}' already exists. Perhaps already initialized?",
base_path.display()
);
if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
bail!(
"Can't find postgres binary at {}",
@@ -410,9 +392,7 @@ impl LocalEnv {
}
}
if !base_path.exists() {
fs::create_dir(base_path)?;
}
fs::create_dir(base_path)?;
// Generate keypair for JWT.
//

View File

@@ -23,6 +23,7 @@ use super::models::{
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use crate::context::{DownloadBehavior, RequestContext};
use crate::disk_usage_eviction_task;
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
@@ -34,7 +35,6 @@ use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant};
use utils::{
auth::JwtAuth,
http::{
@@ -328,17 +328,15 @@ async fn timeline_create_handler(
&ctx,
)
.await {
Ok(new_timeline) => {
Ok(Some(new_timeline)) => {
// Created. Construct a TimelineInfo for it.
let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::CREATED, timeline_info)
}
Err(tenant::CreateTimelineError::AlreadyExists) => {
json_response(StatusCode::CONFLICT, ())
}
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
Err(err) => Err(ApiError::InternalServerError(err)),
}
}
.instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))

View File

@@ -130,66 +130,6 @@ pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_read_accesses_total",
"Number of read accesses to the page cache",
&["key_kind"]
)
.expect("failed to define a metric")
});
pub static PAGE_CACHE_READ_ACCESSES_MATERIALIZED_PAGE: Lazy<IntCounter> = Lazy::new(|| {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&["materialized_page"])
.unwrap()
});
pub static PAGE_CACHE_READ_ACCESSES_EPHEMERAL: Lazy<IntCounter> = Lazy::new(|| {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&["ephemeral"])
.unwrap()
});
pub static PAGE_CACHE_READ_ACCESSES_IMMUTABLE: Lazy<IntCounter> = Lazy::new(|| {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&["immutable"])
.unwrap()
});
pub static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_read_hits_total",
"Number of read accesses to the page cache that hit",
&["key_kind", "hit_kind"]
)
.expect("failed to define a metric")
});
pub static PAGE_CACHE_READ_HITS_EPHEMERAL: Lazy<IntCounter> = Lazy::new(|| {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["ephemeral", "-"])
.unwrap()
});
pub static PAGE_CACHE_READ_HITS_IMMUTABLE: Lazy<IntCounter> = Lazy::new(|| {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["immutable", "-"])
.unwrap()
});
pub static PAGE_CACHE_READ_HITS_MATERIALIZED_PAGE_EXACT: Lazy<IntCounter> = Lazy::new(|| {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["materialized_page", "exact"])
.unwrap()
});
pub static PAGE_CACHE_READ_HITS_MATERIALIZED_PAGE_OLDER_LSN: Lazy<IntCounter> = Lazy::new(|| {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["materialized_page", "older_lsn"])
.unwrap()
});
static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_wait_lsn_seconds",
@@ -674,79 +614,6 @@ pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
pub static LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_STARTED_COUNT: Lazy<IntCounter> =
Lazy::new(|| {
register_int_counter!(
"pageserver_layer_get_value_reconstruct_data_spawn_blocking_started_count",
"Number of spawn_blocking calls made in Layer::get_value_reconstruct_data"
)
.expect("failed to define a metric")
});
pub static LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_ACTIVE_GAUGE: Lazy<IntGauge> =
Lazy::new(|| {
register_int_gauge!(
"pageserver_layer_get_value_reconstruct_data_spawn_blocking_active_gauge",
"Number of spawn_blocking calls active in Layer::get_value_reconstruct_data"
)
.expect("failed to define a metric")
});
pub static LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_QUEUE_DELAY: Lazy<Histogram> = Lazy::new(
|| {
register_histogram!(
"pageserver_layer_get_value_reconstruct_data_spawn_blocking_queue_delay_seconds",
"Time a Layer::get_value_reconstruct_data call spends in spawn_blocking queue until the first line of blockign code runs inside spawn_blocking",
vec![
0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000,
1.000_000, 2.000_000, 5.000_000, 10.000_000, 25.000_000, 50.000_000, 100.000_000,
],
)
.expect("failed to define a metric")
},
);
pub static LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_layer_get_value_reconstruct_data_completion_time_seconds",
"Time a Layer::get_value_reconstruct_data call takes to complete",
&["result"],
vec![
0.000_005,
0.000_010,
0.000_025,
0.000_050,
0.000_100,
0.000_250,
0.000_500,
0.001_000,
0.002_500,
0.005_000,
0.010_000,
0.025_000,
0.050_000,
0.100_000,
0.250_000,
0.500_000,
1.000_000,
2.000_000,
5.000_000,
10.000_000,
25.000_000,
50.000_000,
100.000_000,
]
)
.expect("failed to define a metric")
});
pub static LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME_OK: Lazy<Histogram> =
Lazy::new(|| LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME.with_label_values(&["ok"]));
pub static LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME_ERROR: Lazy<Histogram> =
Lazy::new(|| LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME.with_label_values(&["error"]));
// Metrics collected on WAL redo operations
//
// We collect the time spent in actual WAL redo ('redo'), and time waiting

View File

@@ -313,8 +313,6 @@ impl PageCache {
key: &Key,
lsn: Lsn,
) -> Option<(Lsn, PageReadGuard)> {
crate::metrics::PAGE_CACHE_READ_ACCESSES_MATERIALIZED_PAGE.inc();
let mut cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_id,
@@ -325,17 +323,8 @@ impl PageCache {
};
if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
if let CacheKey::MaterializedPage {
hash_key: _,
lsn: available_lsn,
} = cache_key
{
if available_lsn == lsn {
crate::metrics::PAGE_CACHE_READ_HITS_MATERIALIZED_PAGE_EXACT.inc();
} else {
crate::metrics::PAGE_CACHE_READ_HITS_MATERIALIZED_PAGE_OLDER_LSN.inc();
}
Some((available_lsn, guard))
if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key {
Some((lsn, guard))
} else {
panic!("unexpected key type in slot");
}
@@ -510,31 +499,11 @@ impl PageCache {
/// ```
///
fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
let (read_access, hit) = match cache_key {
CacheKey::MaterializedPage { .. } => {
unreachable!("Materialized pages use lookup_materialized_page")
}
CacheKey::EphemeralPage { .. } => (
&crate::metrics::PAGE_CACHE_READ_ACCESSES_EPHEMERAL,
&crate::metrics::PAGE_CACHE_READ_HITS_EPHEMERAL,
),
CacheKey::ImmutableFilePage { .. } => (
&crate::metrics::PAGE_CACHE_READ_ACCESSES_IMMUTABLE,
&crate::metrics::PAGE_CACHE_READ_HITS_IMMUTABLE,
),
};
read_access.inc();
let mut is_first_iteration = true;
loop {
// First check if the key already exists in the cache.
if let Some(read_guard) = self.try_lock_for_read(cache_key) {
if is_first_iteration {
hit.inc();
}
return Ok(ReadBufResult::Found(read_guard));
}
is_first_iteration = false;
// Not found. Find a victim buffer
let (slot_idx, mut inner) =

View File

@@ -904,7 +904,7 @@ where
self.check_permission(Some(tenant_id))?;
let lsn = if params.len() >= 3 {
let lsn = if params.len() == 3 {
Some(
Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,

View File

@@ -887,7 +887,7 @@ impl<'a> DatadirModification<'a> {
ctx: &RequestContext,
) -> Result<(), RelationError> {
if rel.relnode == 0 {
return Err(RelationError::InvalidRelnode);
return Err(RelationError::AlreadyExists);
}
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.

View File

@@ -102,33 +102,11 @@ use crate::shutdown_pageserver;
// It's also good to avoid hogging all threads that would be needed to process
// other operations, if the upload tasks e.g. get blocked on locks. It shouldn't
// happen, but still.
static PAGESERVER_TOKIO_MAX_BLOCKING_THREADS_OVERRIDE: Lazy<Option<usize>> = Lazy::new(|| {
let env_var: String = match std::env::var("PAGESERVER_TOKIO_MAX_BLOCKING_THREADS") {
Ok(v) => v,
Err(std::env::VarError::NotPresent) => {
debug!("env var PAGESERVER_TOKIO_MAX_BLOCKING_THREADS not set, using default");
return None;
}
Err(std::env::VarError::NotUnicode(_)) => {
panic!("env var PAGESERVER_TOKIO_MAX_BLOCKING_THREADS is not valid UTF-8");
}
};
let pool_size = match env_var.parse() {
Ok(v) => v,
Err(e) => {
panic!("Failed to parse PAGESERVER_TOKIO_MAX_BLOCKING_THREADS: {e:?}");
}
};
eprintln!("using spawn_blocking pool size override from env var PAGESERVER_TOKIO_MAX_BLOCKING_THREADS: {pool_size:?}");
Some(pool_size)
});
//
pub static COMPUTE_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("compute request worker")
.enable_all()
.max_blocking_threads((*PAGESERVER_TOKIO_MAX_BLOCKING_THREADS_OVERRIDE).unwrap_or(512))
.build()
.expect("Failed to create compute request runtime")
});
@@ -137,7 +115,6 @@ pub static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("mgmt request worker")
.enable_all()
.max_blocking_threads((*PAGESERVER_TOKIO_MAX_BLOCKING_THREADS_OVERRIDE).unwrap_or(512))
.build()
.expect("Failed to create mgmt request runtime")
});
@@ -146,7 +123,6 @@ pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("walreceiver worker")
.enable_all()
.max_blocking_threads((*PAGESERVER_TOKIO_MAX_BLOCKING_THREADS_OVERRIDE).unwrap_or(512))
.build()
.expect("Failed to create walreceiver runtime")
});
@@ -155,7 +131,6 @@ pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("background op worker")
.enable_all()
.max_blocking_threads((*PAGESERVER_TOKIO_MAX_BLOCKING_THREADS_OVERRIDE).unwrap_or(512))
.build()
.expect("Failed to create background op runtime")
});
@@ -531,17 +506,17 @@ pub async fn shutdown_tasks(
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
}
}
let join_handle = tokio::select! {
let completed = tokio::select! {
biased;
_ = &mut join_handle => { None },
_ = &mut join_handle => { true },
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
// allow some time to elapse before logging to cut down the number of log
// lines.
info!("waiting for {} to shut down", task.name);
Some(join_handle)
false
}
};
if let Some(join_handle) = join_handle {
if !completed {
// we never handled this return value, but:
// - we don't deschedule which would lead to is_cancelled
// - panics are already logged (is_panicked)

View File

@@ -85,6 +85,7 @@ pub mod blob_io;
pub mod block_io;
pub mod disk_btree;
pub(crate) mod ephemeral_file;
pub mod layer_cache;
pub mod layer_map;
pub mod manifest;
@@ -501,14 +502,6 @@ impl DeletionGuard {
}
}
#[derive(thiserror::Error, Debug)]
pub enum CreateTimelineError {
#[error("a timeline with the given ID already exists")]
AlreadyExists,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl Tenant {
/// Yet another helper for timeline initialization.
/// Contains the common part of `load_local_timeline` and `load_remote_timeline`.
@@ -1383,7 +1376,8 @@ impl Tenant {
/// Returns the new timeline ID and reference to its Timeline object.
///
/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
/// the same timeline ID already exists, returns None. If `new_timeline_id` is not given,
/// a new unique ID is generated.
pub async fn create_timeline(
&self,
new_timeline_id: TimelineId,
@@ -1392,12 +1386,11 @@ impl Tenant {
pg_version: u32,
broker_client: storage_broker::BrokerClientChannel,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
if !self.is_active() {
return Err(CreateTimelineError::Other(anyhow::anyhow!(
"Cannot create timelines on inactive tenant"
)));
}
) -> anyhow::Result<Option<Arc<Timeline>>> {
anyhow::ensure!(
self.is_active(),
"Cannot create timelines on inactive tenant"
);
if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
debug!("timeline {new_timeline_id} already exists");
@@ -1417,7 +1410,7 @@ impl Tenant {
.context("wait for timeline uploads to complete")?;
}
return Err(CreateTimelineError::AlreadyExists);
return Ok(None);
}
let loaded_timeline = match ancestor_timeline_id {
@@ -1432,12 +1425,12 @@ impl Tenant {
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
if ancestor_ancestor_lsn > *lsn {
// can we safely just branch from the ancestor instead?
return Err(CreateTimelineError::Other(anyhow::anyhow!(
bail!(
"invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
lsn,
ancestor_timeline_id,
ancestor_ancestor_lsn,
)));
);
}
// Wait for the WAL to arrive and be processed on the parent branch up
@@ -1471,7 +1464,7 @@ impl Tenant {
})?;
}
Ok(loaded_timeline)
Ok(Some(loaded_timeline))
}
/// perform one garbage collection iteration, removing old data files from disk.
@@ -1625,7 +1618,7 @@ impl Tenant {
// No timeout here, GC & Compaction should be responsive to the
// `TimelineState::Stopping` change.
info!("waiting for layer_removal_cs.lock()");
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
let layer_removal_guard = timeline.layer_cache.delete_guard().await;
info!("got layer_removal_cs.lock(), deleting layer files");
// NB: storage_sync upload tasks that reference these layers have been cancelled

View File

@@ -0,0 +1,146 @@
use super::storage_layer::{PersistentLayer, PersistentLayerDesc, PersistentLayerKey, RemoteLayer};
use super::Timeline;
use crate::tenant::layer_map::LayerMap;
use crate::tenant::timeline::compare_arced_layers;
use anyhow::Result;
use std::sync::{Mutex, Weak};
use std::{collections::HashMap, sync::Arc};
/// LayerCache is meant to facilitate mapping to/from whatever `PersistentLayerDesc` to an actual in-memory layer
/// object. In the future, operations that do not modify layer map (i.e., eviction and download) will be implemented
/// here.
pub struct LayerCache {
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
pub layers_removal_lock: Arc<tokio::sync::Mutex<()>>,
/// We need this lock b/c we do not have any way to prevent GC/compaction from removing files in-use.
/// We need to do reference counting on Arc to prevent this from happening, and we can safely remove this lock.
pub layers_operation_lock: Arc<tokio::sync::RwLock<()>>,
/// Will be useful when we move evict / download to layer cache.
#[allow(unused)]
timeline: Weak<Timeline>,
mapping: Mutex<HashMap<PersistentLayerKey, Arc<dyn PersistentLayer>>>,
}
pub struct LayerInUseWrite(tokio::sync::OwnedRwLockWriteGuard<()>);
pub struct LayerInUseRead(tokio::sync::OwnedRwLockReadGuard<()>);
#[derive(Clone)]
pub struct LayerDeletionGuard(Arc<tokio::sync::OwnedMutexGuard<()>>);
impl LayerCache {
pub fn new(timeline: Weak<Timeline>) -> Self {
Self {
layers_operation_lock: Arc::new(tokio::sync::RwLock::new(())),
layers_removal_lock: Arc::new(tokio::sync::Mutex::new(())),
mapping: Mutex::new(HashMap::new()),
timeline,
}
}
pub fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
let guard = self.mapping.lock().unwrap();
guard.get(&desc.key()).expect("not found").clone()
}
/// This function is to mock the original behavior of `layers` lock in `Timeline`. Can be removed after we ensure
/// we won't delete files that are being read.
pub async fn layer_in_use_write(&self) -> LayerInUseWrite {
LayerInUseWrite(self.layers_operation_lock.clone().write_owned().await)
}
/// This function is to mock the original behavior of `layers` lock in `Timeline`. Can be removed after we ensure
/// we won't delete files that are being read.
pub async fn layer_in_use_read(&self) -> LayerInUseRead {
LayerInUseRead(self.layers_operation_lock.clone().read_owned().await)
}
/// Ensures only one of compaction / gc can happen at a time.
pub async fn delete_guard(&self) -> LayerDeletionGuard {
LayerDeletionGuard(Arc::new(
self.layers_removal_lock.clone().lock_owned().await,
))
}
/// Should only be called when initializing the timeline. Bypass checks and layer operation lock.
pub fn remove_local_when_init(&self, layer: Arc<dyn PersistentLayer>) {
let mut guard = self.mapping.lock().unwrap();
guard.remove(&layer.layer_desc().key());
}
/// Should only be called when initializing the timeline. Bypass checks and layer operation lock.
pub fn populate_remote_when_init(&self, layer: Arc<RemoteLayer>) {
let mut guard = self.mapping.lock().unwrap();
guard.insert(layer.layer_desc().key(), layer);
}
/// Should only be called when initializing the timeline. Bypass checks and layer operation lock.
pub fn populate_local_when_init(&self, layer: Arc<dyn PersistentLayer>) {
let mut guard = self.mapping.lock().unwrap();
guard.insert(layer.layer_desc().key(), layer);
}
/// Called within read path.
pub fn replace_and_verify(
&self,
expected: Arc<dyn PersistentLayer>,
new: Arc<dyn PersistentLayer>,
) -> Result<()> {
let mut guard = self.mapping.lock().unwrap();
let key = expected.layer_desc().key();
let other = new.layer_desc().key();
let expected_l0 = LayerMap::is_l0(expected.layer_desc());
let new_l0 = LayerMap::is_l0(new.layer_desc());
fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
"layermap-replace-notfound"
));
anyhow::ensure!(
key == other,
"replacing downloaded layer into layermap failed because two layers have different keys: {key:?} != {other:?}"
);
anyhow::ensure!(
expected_l0 == new_l0,
"replacing downloaded layer into layermap failed because one layer is l0 while the other is not: {expected_l0} != {new_l0}"
);
if let Some(layer) = guard.get_mut(&expected.layer_desc().key()) {
anyhow::ensure!(
compare_arced_layers(&expected, layer),
"replacing downloaded layer into layermap failed because another layer was found instead of expected, expected={expected:?}, new={new:?}",
expected = Arc::as_ptr(&expected),
new = Arc::as_ptr(layer),
);
*layer = new;
Ok(())
} else {
anyhow::bail!(
"replacing downloaded layer into layermap failed because layer was not found"
);
}
}
/// Called within write path. When compaction and image layer creation we will create new layers.
pub fn create_new_layer(&self, layer: Arc<dyn PersistentLayer>) {
let mut guard = self.mapping.lock().unwrap();
guard.insert(layer.layer_desc().key(), layer);
}
/// Called within write path. When GC and compaction we will remove layers and delete them on disk.
/// Will move logic to delete files here later.
pub fn delete_layer(&self, layer: Arc<dyn PersistentLayer>) {
let mut guard = self.mapping.lock().unwrap();
guard.remove(&layer.layer_desc().key());
}
}

View File

@@ -58,7 +58,7 @@ use std::sync::Arc;
use utils::lsn::Lsn;
use historic_layer_coverage::BufferedHistoricLayerCoverage;
pub use historic_layer_coverage::LayerKey;
pub use historic_layer_coverage::{LayerKey, Replacement};
use super::storage_layer::range_eq;
use super::storage_layer::PersistentLayerDesc;
@@ -658,10 +658,7 @@ mod tests {
mod l0_delta_layers_updated {
use crate::tenant::{
storage_layer::{PersistentLayer, PersistentLayerDesc},
timeline::LayerFileManager,
};
use crate::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
use super::*;
@@ -694,31 +691,6 @@ mod tests {
)
}
#[test]
fn replacing_missing_l0_is_notfound() {
// original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
// however only happen for precondition failures.
let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
let layer = LayerFileName::from_str(layer).unwrap();
let layer = LayerDescriptor::from(layer);
// same skeletan construction; see scenario below
let not_found = Arc::new(layer.clone());
let new_version = Arc::new(layer);
// after the immutable storage state refactor, the replace operation
// will not use layer map any more. We keep it here for consistency in test cases
// and can remove it in the future.
let _map = LayerMap::default();
let mut mapping = LayerFileManager::new();
mapping
.replace_and_verify(not_found, new_version)
.unwrap_err();
}
fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
let name = LayerFileName::from_str(layer_name).unwrap();
let skeleton = LayerDescriptor::from(name);
@@ -727,7 +699,6 @@ mod tests {
let downloaded = Arc::new(skeleton);
let mut map = LayerMap::default();
let mut mapping = LayerFileManager::new();
// two disjoint Arcs in different lifecycle phases. even if it seems they must be the
// same layer, we use LayerMap::compare_arced_layers as the identity of layers.
@@ -737,20 +708,11 @@ mod tests {
map.batch_update()
.insert_historic(remote.layer_desc().clone());
mapping.insert(remote.clone());
assert_eq!(
count_layer_in(&map, remote.layer_desc()),
expected_in_counts
);
mapping
.replace_and_verify(remote, downloaded.clone())
.expect("name derived attributes are the same");
assert_eq!(
count_layer_in(&map, downloaded.layer_desc()),
expected_in_counts
);
map.batch_update()
.remove_historic(downloaded.layer_desc().clone());
assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));

View File

@@ -43,6 +43,18 @@ impl Ord for LayerKey {
}
}
impl<'a, L: crate::tenant::storage_layer::Layer + ?Sized> From<&'a L> for LayerKey {
fn from(layer: &'a L) -> Self {
let kr = layer.get_key_range();
let lr = layer.get_lsn_range();
LayerKey {
key: kr.start.to_i128()..kr.end.to_i128(),
lsn: lr.start.0..lr.end.0,
is_image: !layer.is_incremental(),
}
}
}
impl From<&PersistentLayerDesc> for LayerKey {
fn from(layer: &PersistentLayerDesc) -> Self {
let kr = layer.get_key_range();
@@ -456,6 +468,64 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
self.buffer.insert(layer_key, None);
}
/// Replaces a previous layer with a new layer value.
///
/// The replacement is conditional on:
/// - there is an existing `LayerKey` record
/// - there is no buffered removal for the given `LayerKey`
/// - the given closure returns true for the current `Value`
///
/// The closure is used to compare the latest value (buffered insert, or existing layer)
/// against some expectation. This allows to use `Arc::ptr_eq` or similar which would be
/// inaccessible via `PartialEq` trait.
///
/// Returns a `Replacement` value describing the outcome; only the case of
/// `Replacement::Replaced` modifies the map and requires a rebuild.
///
/// This function is unlikely to be used in the future because LayerMap now only records the
/// layer descriptors. Therefore, anything added to the layer map will only be removed or
/// added, and never replaced.
#[cfg(test)]
pub fn replace<F>(
&mut self,
layer_key: &LayerKey,
new: Value,
check_expected: F,
) -> Replacement<Value>
where
F: FnOnce(&Value) -> bool,
{
let (slot, in_buffered) = match self.buffer.get(layer_key) {
Some(inner @ Some(_)) => {
// we compare against the buffered version, because there will be a later
// rebuild before querying
(inner.as_ref(), true)
}
Some(None) => {
// buffer has removal for this key; it will not be equivalent by any check_expected.
return Replacement::RemovalBuffered;
}
None => {
// no pending modification for the key, check layers
(self.layers.get(layer_key), false)
}
};
match slot {
Some(existing) if !check_expected(existing) => {
// unfortunate clone here, but otherwise the nll borrowck grows the region of
// 'a to cover the whole function, and we could not mutate in the other
// Some(existing) branch
Replacement::Unexpected(existing.clone())
}
None => Replacement::NotFound,
Some(_existing) => {
self.insert(layer_key.to_owned(), new);
Replacement::Replaced { in_buffered }
}
}
}
pub fn rebuild(&mut self) {
// Find the first LSN that needs to be rebuilt
let rebuild_since: u64 = match self.buffer.iter().next() {
@@ -524,6 +594,22 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
}
}
/// Outcome of the replace operation.
#[derive(Debug)]
pub enum Replacement<Value> {
/// Previous value was replaced with the new value.
Replaced {
/// Replacement happened for a scheduled insert.
in_buffered: bool,
},
/// Key was not found buffered updates or existing layers.
NotFound,
/// Key has been scheduled for removal, it was not replaced.
RemovalBuffered,
/// Previous value was rejected by the closure.
Unexpected(Value),
}
#[test]
fn test_retroactive_regression_1() {
let mut map = BufferedHistoricLayerCoverage::new();
@@ -632,3 +718,139 @@ fn test_retroactive_simple() {
assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string()));
}
}
#[test]
fn test_retroactive_replacement() {
let mut map = BufferedHistoricLayerCoverage::new();
let keys = [
LayerKey {
key: 0..5,
lsn: 100..101,
is_image: true,
},
LayerKey {
key: 3..9,
lsn: 110..111,
is_image: true,
},
LayerKey {
key: 4..6,
lsn: 120..121,
is_image: true,
},
];
let layers = [
"Image 1".to_string(),
"Image 2".to_string(),
"Image 3".to_string(),
];
for (key, layer) in keys.iter().zip(layers.iter()) {
map.insert(key.to_owned(), layer.to_owned());
}
// rebuild is not necessary here, because replace works for both buffered updates and existing
// layers.
for (key, orig_layer) in keys.iter().zip(layers.iter()) {
let replacement = format!("Remote {orig_layer}");
// evict
let ret = map.replace(key, replacement.clone(), |l| l == orig_layer);
assert!(
matches!(ret, Replacement::Replaced { .. }),
"replace {orig_layer}: {ret:?}"
);
map.rebuild();
let at = key.lsn.end + 1;
let version = map.get().expect("rebuilt").get_version(at).unwrap();
assert_eq!(
version.image_coverage.query(4).as_deref(),
Some(replacement.as_str()),
"query for 4 at version {at} after eviction",
);
// download
let ret = map.replace(key, orig_layer.clone(), |l| l == &replacement);
assert!(
matches!(ret, Replacement::Replaced { .. }),
"replace {orig_layer} back: {ret:?}"
);
map.rebuild();
let version = map.get().expect("rebuilt").get_version(at).unwrap();
assert_eq!(
version.image_coverage.query(4).as_deref(),
Some(orig_layer.as_str()),
"query for 4 at version {at} after download",
);
}
}
#[test]
fn missing_key_is_not_inserted_with_replace() {
let mut map = BufferedHistoricLayerCoverage::new();
let key = LayerKey {
key: 0..5,
lsn: 100..101,
is_image: true,
};
let ret = map.replace(&key, "should not replace", |_| true);
assert!(matches!(ret, Replacement::NotFound), "{ret:?}");
map.rebuild();
assert!(map
.get()
.expect("no changes to rebuild")
.get_version(102)
.is_none());
}
#[test]
fn replacing_buffered_insert_and_remove() {
let mut map = BufferedHistoricLayerCoverage::new();
let key = LayerKey {
key: 0..5,
lsn: 100..101,
is_image: true,
};
map.insert(key.clone(), "Image 1");
let ret = map.replace(&key, "Remote Image 1", |&l| l == "Image 1");
assert!(
matches!(ret, Replacement::Replaced { in_buffered: true }),
"{ret:?}"
);
map.rebuild();
assert_eq!(
map.get()
.expect("rebuilt")
.get_version(102)
.unwrap()
.image_coverage
.query(4),
Some("Remote Image 1")
);
map.remove(key.clone());
let ret = map.replace(&key, "should not replace", |_| true);
assert!(
matches!(ret, Replacement::RemovalBuffered),
"cannot replace after scheduled remove: {ret:?}"
);
map.rebuild();
let ret = map.replace(&key, "should not replace", |_| true);
assert!(
matches!(ret, Replacement::NotFound),
"cannot replace after remove + rebuild: {ret:?}"
);
let at_version = map.get().expect("rebuilt").get_version(102);
assert!(at_version.is_none());
}

View File

@@ -12,7 +12,7 @@ use crate::context::RequestContext;
use crate::repository::{Key, Value};
use crate::task_mgr::TaskKind;
use crate::walrecord::NeonWalRecord;
use anyhow::{Context, Result};
use anyhow::Result;
use bytes::Bytes;
use enum_map::EnumMap;
use enumset::EnumSet;
@@ -24,7 +24,7 @@ use pageserver_api::models::{
use std::ops::Range;
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::warn;
use utils::history_buffer::HistoryBufferWithDropCounter;
use utils::rate_limit::RateLimit;
@@ -335,8 +335,7 @@ impl LayerAccessStats {
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
/// timeline names, because those are known in the context of which the layers
/// are used in (timeline).
#[async_trait::async_trait]
pub trait Layer: std::fmt::Debug + Send + Sync + 'static {
pub trait Layer: std::fmt::Debug + Send + Sync {
/// Range of keys that this layer covers
fn get_key_range(&self) -> Range<Key>;
@@ -366,74 +365,13 @@ pub trait Layer: std::fmt::Debug + Send + Sync + 'static {
/// is available. If this returns ValueReconstructResult::Continue, look up
/// the predecessor layer and call again with the same 'reconstruct_data' to
/// collect more data.
fn get_value_reconstruct_data_blocking(
fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_data: ValueReconstructState,
ctx: RequestContext,
) -> Result<(ValueReconstructState, ValueReconstructResult)>;
/// CANCEL SAFETY: if the returned future is dropped,
/// the wrapped closure still run to completion and the return value discarded.
/// For the case of get_value_reconstruct_data, we expect the closure to not
/// have any side effects, as it only attempts to read a layer (and stuff like
/// page cache isn't considered a real side effect).
/// But, ...
/// TRACING:
/// If the returned future is cancelled, the spawn_blocking span can outlive
/// the caller's span.
/// So, technically, we should be using `parent: None` and `follows_from: current`
/// instead. However, in practice, the advantage of maintaining the span stack
/// in logs outweighs the disadvantage of having a dangling span in a case that
/// is not expected to happen because in pageserver we generally don't drop pending futures.
async fn get_value_reconstruct_data(
self: Arc<Self>,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_data: ValueReconstructState,
ctx: RequestContext,
) -> Result<(ValueReconstructState, ValueReconstructResult)> {
let span = tracing::info_span!("get_value_reconstruct_data_spawn_blocking");
static USE_SPAWN_BLOCKING: Lazy<bool> = Lazy::new(|| {
let val = std::env::var("PAGESERVER_LAYER_GET_RECONSTRUCT_DATA_USE_SPAWN_BLOCKING")
.map(|s| s == "1")
.unwrap_or(false);
tracing::info!("PAGESERVER_LAYER_GET_RECONSTRUCT_DATA_USE_SPAWN_BLOCKING={val}");
val
});
let use_spawn_blocking = *USE_SPAWN_BLOCKING;
let start = Instant::now();
let res = if !use_spawn_blocking {
anyhow::Ok(self.get_value_reconstruct_data_blocking(
key,
lsn_range,
reconstruct_data,
ctx,
))
} else {
crate::metrics::LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_STARTED_COUNT.inc();
crate::metrics::LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_ACTIVE_GAUGE.inc();
let res = tokio::task::spawn_blocking(move || {
crate::metrics::LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_QUEUE_DELAY
.observe(start.elapsed().as_secs_f64());
let _enter = span.enter();
self.get_value_reconstruct_data_blocking(key, lsn_range, reconstruct_data, ctx)
})
.await
.context("spawn_blocking");
crate::metrics::LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_ACTIVE_GAUGE.dec();
res
};
let histo = match &res {
Ok(Ok(_)) => &crate::metrics::LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME_OK,
Ok(Err(_)) | Err(_) => {
&crate::metrics::LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME_ERROR
}
};
histo.observe(start.elapsed().as_secs_f64());
res?
}
reconstruct_data: &mut ValueReconstructState,
ctx: &RequestContext,
) -> Result<ValueReconstructResult>;
/// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
fn short_id(&self) -> String;
@@ -545,8 +483,17 @@ pub mod tests {
}
}
#[async_trait::async_trait]
impl Layer for LayerDescriptor {
fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
_reconstruct_data: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
todo!("This method shouldn't be part of the Layer trait")
}
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
todo!()
}
@@ -561,16 +508,6 @@ pub mod tests {
self.layer_desc().lsn_range.clone()
}
fn get_value_reconstruct_data_blocking(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
_reconstruct_data: ValueReconstructState,
_ctx: RequestContext,
) -> Result<(ValueReconstructState, ValueReconstructResult)> {
todo!("This method shouldn't be part of the Layer trait")
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental

View File

@@ -218,7 +218,6 @@ impl std::fmt::Debug for DeltaLayerInner {
}
}
#[async_trait::async_trait]
impl Layer for DeltaLayer {
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
@@ -295,13 +294,13 @@ impl Layer for DeltaLayer {
Ok(())
}
fn get_value_reconstruct_data_blocking(
fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
mut reconstruct_state: ValueReconstructState,
ctx: RequestContext,
) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start >= self.desc.lsn_range.start);
let mut need_image = true;
@@ -309,7 +308,7 @@ impl Layer for DeltaLayer {
{
// Open the file and lock the metadata in memory
let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
// Scan the page versions backwards, starting from `lsn`.
let file = &inner.file;
@@ -375,9 +374,9 @@ impl Layer for DeltaLayer {
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok((reconstruct_state, ValueReconstructResult::Continue))
Ok(ValueReconstructResult::Continue)
} else {
Ok((reconstruct_state, ValueReconstructResult::Complete))
Ok(ValueReconstructResult::Complete)
}
}

View File

@@ -149,7 +149,6 @@ impl std::fmt::Debug for ImageLayerInner {
}
}
#[async_trait::async_trait]
impl Layer for ImageLayer {
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
@@ -182,18 +181,18 @@ impl Layer for ImageLayer {
}
/// Look up given page in the file
fn get_value_reconstruct_data_blocking(
fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
mut reconstruct_state: ValueReconstructState,
ctx: RequestContext,
) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
assert!(self.desc.key_range.contains(&key));
assert!(lsn_range.start >= self.lsn);
assert!(lsn_range.end >= self.lsn);
let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
@@ -211,9 +210,9 @@ impl Layer for ImageLayer {
let value = Bytes::from(blob);
reconstruct_state.img = Some((self.lsn, value));
Ok((reconstruct_state, ValueReconstructResult::Complete))
Ok(ValueReconstructResult::Complete)
} else {
Ok((reconstruct_state, ValueReconstructResult::Missing))
Ok(ValueReconstructResult::Missing)
}
}

View File

@@ -110,7 +110,6 @@ impl InMemoryLayer {
}
}
#[async_trait::async_trait]
impl Layer for InMemoryLayer {
fn get_key_range(&self) -> Range<Key> {
Key::MIN..Key::MAX
@@ -191,13 +190,13 @@ impl Layer for InMemoryLayer {
}
/// Look up given value in the layer.
fn get_value_reconstruct_data_blocking(
fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
mut reconstruct_state: ValueReconstructState,
_ctx: RequestContext,
) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
reconstruct_state: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start >= self.start_lsn);
let mut need_image = true;
@@ -214,7 +213,7 @@ impl Layer for InMemoryLayer {
match value {
Value::Image(img) => {
reconstruct_state.img = Some((*entry_lsn, img));
return Ok((reconstruct_state, ValueReconstructResult::Complete));
return Ok(ValueReconstructResult::Complete);
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
@@ -234,9 +233,9 @@ impl Layer for InMemoryLayer {
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok((reconstruct_state, ValueReconstructResult::Continue))
Ok(ValueReconstructResult::Continue)
} else {
Ok((reconstruct_state, ValueReconstructResult::Complete))
Ok(ValueReconstructResult::Complete)
}
}
}

View File

@@ -6,7 +6,7 @@ use crate::context::RequestContext;
use crate::repository::Key;
use crate::tenant::layer_map::BatchedUpdates;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::{Layer, ValueReconstructState};
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use anyhow::{bail, Result};
use pageserver_api::models::HistoricLayerInfo;
use std::ops::Range;
@@ -21,7 +21,7 @@ use utils::{
use super::filename::{DeltaFileName, ImageFileName};
use super::{
DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc, ValueReconstructResult,
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
};
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -63,15 +63,14 @@ impl std::fmt::Debug for RemoteLayer {
}
}
#[async_trait::async_trait]
impl Layer for RemoteLayer {
fn get_value_reconstruct_data_blocking(
fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
_reconstruct_state: ValueReconstructState,
_ctx: RequestContext,
) -> Result<(ValueReconstructState, ValueReconstructResult)> {
_reconstruct_state: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
bail!(
"layer {} needs to be downloaded",
self.filename().file_name()

View File

@@ -8,7 +8,7 @@ use bytes::Bytes;
use fail::fail_point;
use futures::StreamExt;
use itertools::Itertools;
use once_cell::sync::{Lazy, OnceCell};
use once_cell::sync::OnceCell;
use pageserver_api::models::{
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceEventReason, LayerResidenceStatus,
@@ -82,11 +82,12 @@ use self::eviction_task::EvictionTaskTimelineState;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::layer_cache::{LayerCache, LayerDeletionGuard};
use super::layer_map::BatchedUpdates;
use super::remote_timeline_client::index::IndexPart;
use super::remote_timeline_client::RemoteTimelineClient;
use super::storage_layer::{
DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc, PersistentLayerKey,
DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
@@ -120,77 +121,11 @@ impl PartialOrd for Hole {
}
}
pub struct LayerFileManager(HashMap<PersistentLayerKey, Arc<dyn PersistentLayer>>);
pub struct LayerFileManager(());
impl LayerFileManager {
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
// The assumption for the `expect()` is that all code maintains the following invariant:
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
self.0
.get(&desc.key())
.with_context(|| format!("get layer from desc: {}", desc.filename().file_name()))
.expect("not found")
.clone()
}
pub(crate) fn insert(&mut self, layer: Arc<dyn PersistentLayer>) {
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
if present.is_some() && cfg!(debug_assertions) {
panic!("overwriting a layer: {:?}", layer.layer_desc())
}
}
pub(crate) fn new() -> Self {
Self(HashMap::new())
}
pub(crate) fn remove(&mut self, layer: Arc<dyn PersistentLayer>) {
let present = self.0.remove(&layer.layer_desc().key());
if present.is_none() && cfg!(debug_assertions) {
panic!(
"removing layer that is not present in layer mapping: {:?}",
layer.layer_desc()
)
}
}
pub(crate) fn replace_and_verify(
&mut self,
expected: Arc<dyn PersistentLayer>,
new: Arc<dyn PersistentLayer>,
) -> Result<()> {
let key = expected.layer_desc().key();
let other = new.layer_desc().key();
let expected_l0 = LayerMap::is_l0(expected.layer_desc());
let new_l0 = LayerMap::is_l0(new.layer_desc());
fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
"layermap-replace-notfound"
));
anyhow::ensure!(
key == other,
"expected and new layer have different keys: {key:?} != {other:?}"
);
anyhow::ensure!(
expected_l0 == new_l0,
"one layer is l0 while the other is not: {expected_l0} != {new_l0}"
);
if let Some(layer) = self.0.get_mut(&expected.layer_desc().key()) {
anyhow::ensure!(
compare_arced_layers(&expected, layer),
"another layer was found instead of expected, expected={expected:?}, new={new:?}",
expected = Arc::as_ptr(&expected),
new = Arc::as_ptr(layer),
);
*layer = new;
Ok(())
} else {
anyhow::bail!("layer was not found");
}
Self(())
}
}
@@ -207,7 +142,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
}
pub struct Timeline {
conf: &'static PageServerConf,
pub(super) conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<TenantConfOpt>>,
myself: Weak<Self>,
@@ -217,25 +152,10 @@ pub struct Timeline {
pub pg_version: u32,
/// The tuple has two elements.
/// 1. `LayerFileManager` keeps track of the various physical representations of the layer files (inmem, local, remote).
/// 2. `LayerMap`, the acceleration data structure for `get_reconstruct_data`.
///
/// `LayerMap` maps out the `(PAGE,LSN) / (KEY,LSN)` space, which is composed of `(KeyRange, LsnRange)` rectangles.
/// We describe these rectangles through the `PersistentLayerDesc` struct.
///
/// When we want to reconstruct a page, we first find the `PersistentLayerDesc`'s that we need for page reconstruction,
/// using `LayerMap`. Then, we use `LayerFileManager` to get the `PersistentLayer`'s that correspond to the
/// `PersistentLayerDesc`'s.
///
/// Hence, it's important to keep things coherent. The `LayerFileManager` must always have an entry for all
/// `PersistentLayerDesc`'s in the `LayerMap`. If it doesn't, `LayerFileManager::get_from_desc` will panic at
/// runtime, e.g., during page reconstruction.
///
/// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
pub(crate) layers: Arc<tokio::sync::RwLock<(LayerMap, LayerFileManager)>>,
pub(super) layer_cache: LayerCache,
/// Set of key ranges which should be covered by image layers to
/// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
/// It is used by compaction task when it checks if new image layer should be created.
@@ -307,13 +227,6 @@ pub struct Timeline {
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
// Needed to ensure that we can't create a branch at a point that was already garbage collected
pub latest_gc_cutoff_lsn: Rcu<Lsn>,
@@ -660,33 +573,16 @@ impl Timeline {
None => None,
};
let reconstruct_state = ValueReconstructState {
let mut reconstruct_state = ValueReconstructState {
records: Vec::new(),
img: cached_page_img,
};
static GET_RECONSTRUCT_DATA_CONCURRENCY: Lazy<Option<usize>> = Lazy::new(|| {
std::env::var("PAGESERVER_TIMELINE_GET_RECONSTRUCT_DATA_CONCURRENCY_LIMIT")
.ok()
.and_then(|s| s.parse().ok())
});
static GET_RECONSTRUCT_DATA_SEMAPHORE: Lazy<Option<Semaphore>> =
Lazy::new(|| (*GET_RECONSTRUCT_DATA_CONCURRENCY).map(Semaphore::new));
let permit = if let Some(sem) = GET_RECONSTRUCT_DATA_SEMAPHORE.as_ref() {
Some(sem.acquire().await)
} else {
None
};
let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
let reconstruct_state = self
.get_reconstruct_data(key, lsn, reconstruct_state, ctx)
self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
.await?;
timer.stop_and_record();
drop(permit);
RECONSTRUCT_TIME
.observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
}
@@ -942,7 +838,7 @@ impl Timeline {
// Below are functions compact_level0() and create_image_layers()
// but they are a bit ad hoc and don't quite work like it's explained
// above. Rewrite it.
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
let layer_removal_cs = self.layer_cache.delete_guard().await;
// Is the timeline being deleted?
if self.is_stopping() {
return Err(anyhow::anyhow!("timeline is Stopping").into());
@@ -1165,7 +1061,7 @@ impl Timeline {
pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
let guard = self.layers.read().await;
let (layer_map, mapping) = &*guard;
let (layer_map, _) = &*guard;
let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
if let Some(open_layer) = &layer_map.open_layer {
in_memory_layers.push(open_layer.info());
@@ -1176,7 +1072,7 @@ impl Timeline {
let mut historic_layers = Vec::new();
for historic_layer in layer_map.iter_historic_layers() {
let historic_layer = mapping.get_from_desc(&historic_layer);
let historic_layer = self.layer_cache.get_from_desc(&historic_layer);
historic_layers.push(historic_layer.info(reset));
}
@@ -1274,7 +1170,7 @@ impl Timeline {
.context("wait for layer upload ops to complete")?;
// now lock out layer removal (compaction, gc, timeline deletion)
let layer_removal_guard = self.layer_removal_cs.lock().await;
let layer_removal_guard = self.layer_cache.delete_guard().await;
{
// to avoid racing with detach and delete_timeline
@@ -1287,7 +1183,7 @@ impl Timeline {
// start the batch update
let mut guard = self.layers.write().await;
let (layer_map, mapping) = &mut *guard;
let (layer_map, _) = &mut *guard;
let mut batch_updates = layer_map.batch_update();
let mut results = Vec::with_capacity(layers_to_evict.len());
@@ -1296,12 +1192,7 @@ impl Timeline {
let res = if cancel.is_cancelled() {
None
} else {
Some(self.evict_layer_batch_impl(
&layer_removal_guard,
l,
&mut batch_updates,
mapping,
))
Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut batch_updates))
};
results.push(res);
}
@@ -1317,10 +1208,9 @@ impl Timeline {
fn evict_layer_batch_impl(
&self,
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
_layer_removal_cs: &LayerDeletionGuard,
local_layer: &Arc<dyn PersistentLayer>,
batch_updates: &mut BatchedUpdates<'_>,
mapping: &mut LayerFileManager,
) -> anyhow::Result<bool> {
if local_layer.is_remote_layer() {
// TODO(issue #3851): consider returning an err here instead of false,
@@ -1371,7 +1261,10 @@ impl Timeline {
assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc());
let succeed = match mapping.replace_and_verify(local_layer.clone(), new_remote_layer) {
let succeed = match self
.layer_cache
.replace_and_verify(local_layer.clone(), new_remote_layer)
{
Ok(()) => {
if let Err(e) = local_layer.delete_resident_layer_file() {
error!("failed to remove layer file on evict after replacement: {e:#?}");
@@ -1419,9 +1312,6 @@ impl Timeline {
}
}
/// Number of times we will compute partition within a checkpoint distance.
const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
// Private functions
impl Timeline {
fn get_checkpoint_distance(&self) -> u64 {
@@ -1546,6 +1436,7 @@ impl Timeline {
LayerMap::default(),
LayerFileManager::new(),
))),
layer_cache: LayerCache::new(myself.clone()),
wanted_image_layers: Mutex::new(None),
walredo_mgr,
@@ -1581,7 +1472,6 @@ impl Timeline {
layer_flush_done_tx,
write_lock: tokio::sync::Mutex::new(()),
layer_removal_cs: Default::default(),
gc_info: std::sync::RwLock::new(GcInfo {
retain_lsns: Vec::new(),
@@ -1619,8 +1509,7 @@ impl Timeline {
initial_logical_size_can_start,
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
result.repartition_threshold = result.get_checkpoint_distance() / 10;
result
.metrics
.last_record_gauge
@@ -1738,7 +1627,7 @@ impl Timeline {
///
pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
let mut guard = self.layers.write().await;
let (layers, mapping) = &mut *guard;
let (layers, _) = &mut *guard;
let mut updates = layers.batch_update();
let mut num_layers = 0;
@@ -1781,7 +1670,8 @@ impl Timeline {
trace!("found layer {}", layer.path().display());
total_physical_size += file_size;
self.insert_historic_layer(Arc::new(layer), &mut updates, mapping);
updates.insert_historic(layer.layer_desc().clone());
self.layer_cache.populate_local_when_init(Arc::new(layer));
num_layers += 1;
} else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
// Create a DeltaLayer struct for each delta file.
@@ -1813,7 +1703,8 @@ impl Timeline {
trace!("found layer {}", layer.path().display());
total_physical_size += file_size;
self.insert_historic_layer(Arc::new(layer), &mut updates, mapping);
updates.insert_historic(layer.layer_desc().clone());
self.layer_cache.populate_local_when_init(Arc::new(layer));
num_layers += 1;
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
// ignore these
@@ -1868,7 +1759,7 @@ impl Timeline {
// We're holding a layer map lock for a while but this
// method is only called during init so it's fine.
let mut guard = self.layers.write().await;
let (layer_map, mapping) = &mut *guard;
let (layer_map, _) = &mut *guard;
let mut updates = layer_map.batch_update();
for remote_layer_name in &index_part.timeline_layers {
let local_layer = local_only_layers.remove(remote_layer_name);
@@ -1913,7 +1804,8 @@ impl Timeline {
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
} else {
self.metrics.resident_physical_size_gauge.sub(local_size);
self.remove_historic_layer(local_layer, &mut updates, mapping);
updates.remove_historic(local_layer.layer_desc().clone());
self.layer_cache.remove_local_when_init(local_layer);
// fall-through to adding the remote layer
}
} else {
@@ -1952,7 +1844,8 @@ impl Timeline {
);
let remote_layer = Arc::new(remote_layer);
self.insert_historic_layer(remote_layer, &mut updates, mapping);
updates.insert_historic(remote_layer.layer_desc().clone());
self.layer_cache.populate_remote_when_init(remote_layer);
}
LayerFileName::Delta(deltafilename) => {
// Create a RemoteLayer for the delta file.
@@ -1979,7 +1872,8 @@ impl Timeline {
),
);
let remote_layer = Arc::new(remote_layer);
self.insert_historic_layer(remote_layer, &mut updates, mapping);
updates.insert_historic(remote_layer.layer_desc().clone());
self.layer_cache.populate_remote_when_init(remote_layer);
}
}
}
@@ -2020,10 +1914,10 @@ impl Timeline {
let local_layers = {
let guard = self.layers.read().await;
let (layers, mapping) = &*guard;
let (layers, _) = &*guard;
layers
.iter_historic_layers()
.map(|l| (l.filename(), mapping.get_from_desc(&l)))
.map(|l| (l.filename(), self.layer_cache.get_from_desc(&l)))
.collect::<HashMap<_, _>>()
};
@@ -2397,52 +2291,27 @@ impl Timeline {
async fn find_layer(&self, layer_file_name: &str) -> Option<Arc<dyn PersistentLayer>> {
let guard = self.layers.read().await;
let (layers, mapping) = &*guard;
let (layers, _) = &*guard;
for historic_layer in layers.iter_historic_layers() {
let historic_layer_name = historic_layer.filename().file_name();
if layer_file_name == historic_layer_name {
return Some(mapping.get_from_desc(&historic_layer));
return Some(self.layer_cache.get_from_desc(&historic_layer));
}
}
None
}
/// Helper function to insert a layer from both layer map and layer file manager. Will be removed in the future
/// after we introduce `LayerMapManager`.
fn insert_historic_layer(
&self,
layer: Arc<dyn PersistentLayer>,
updates: &mut BatchedUpdates<'_>,
mapping: &mut LayerFileManager,
) {
updates.insert_historic(layer.layer_desc().clone());
mapping.insert(layer);
}
/// Helper function to remove a layer from both layer map and layer file manager. Will be removed in the future
/// after we introduce `LayerMapManager`.
fn remove_historic_layer(
&self,
layer: Arc<dyn PersistentLayer>,
updates: &mut BatchedUpdates<'_>,
mapping: &mut LayerFileManager,
) {
updates.remove_historic(layer.layer_desc().clone());
mapping.remove(layer);
}
/// Removes the layer from local FS (if present) and from memory.
/// Remote storage is not affected by this operation.
fn delete_historic_layer(
&self,
// we cannot remove layers otherwise, since gc and compaction will race
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
_layer_removal_cs: LayerDeletionGuard,
layer: Arc<PersistentLayerDesc>,
updates: &mut BatchedUpdates<'_>,
mapping: &mut LayerFileManager,
) -> anyhow::Result<()> {
let layer = mapping.get_from_desc(&layer);
let layer = self.layer_cache.get_from_desc(&layer);
if !layer.is_remote_layer() {
layer.delete_resident_layer_file()?;
let layer_file_size = layer.file_size();
@@ -2457,7 +2326,7 @@ impl Timeline {
// and mark what we can't delete yet as deleted from the layer
// map index without actually rebuilding the index.
updates.remove_historic(layer.layer_desc().clone());
mapping.remove(layer);
self.layer_cache.delete_layer(layer);
Ok(())
}
@@ -2512,9 +2381,9 @@ impl Timeline {
&self,
key: Key,
request_lsn: Lsn,
mut reconstruct_state: ValueReconstructState,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> Result<ValueReconstructState, PageReconstructError> {
) -> Result<(), PageReconstructError> {
// Start from the current timeline.
let mut timeline_owned;
let mut timeline = self;
@@ -2544,12 +2413,12 @@ impl Timeline {
// The function should have updated 'state'
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
match result {
ValueReconstructResult::Complete => return Ok(reconstruct_state),
ValueReconstructResult::Complete => return Ok(()),
ValueReconstructResult::Continue => {
// If we reached an earlier cached page image, we're done.
if cont_lsn == cached_lsn + 1 {
MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
return Ok(reconstruct_state);
return Ok(());
}
if prev_lsn <= cont_lsn {
// Didn't make any progress in last iteration. Error out to avoid
@@ -2643,7 +2512,7 @@ impl Timeline {
'layer_map_search: loop {
let remote_layer = {
let guard = timeline.layers.read().await;
let (layers, mapping) = &*guard;
let (layers, _) = &*guard;
// Check the open and frozen in-memory layers first, in order from newest
// to oldest.
@@ -2654,19 +2523,13 @@ impl Timeline {
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, start_lsn);
result = match Arc::clone(open_layer)
.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx.attached_child(),
)
.await
{
Ok((new_reconstruct_state, result)) => {
reconstruct_state = new_reconstruct_state;
result
}
result = match open_layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
) {
Ok(result) => result,
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
@@ -2687,19 +2550,13 @@ impl Timeline {
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
let lsn_floor = max(cached_lsn + 1, start_lsn);
result = match Arc::clone(frozen_layer)
.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx.attached_child(),
)
.await
{
Ok((new_reconstruct_state, result)) => {
reconstruct_state = new_reconstruct_state;
result
}
result = match frozen_layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
) {
Ok(result) => result,
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
@@ -2717,7 +2574,7 @@ impl Timeline {
}
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
let layer = mapping.get_from_desc(&layer);
let layer = timeline.layer_cache.get_from_desc(&layer);
// If it's a remote layer, download it and retry.
if let Some(remote_layer) =
super::storage_layer::downcast_remote_layer(&layer)
@@ -2729,19 +2586,13 @@ impl Timeline {
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, lsn_floor);
result = match Arc::clone(&layer)
.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx.attached_child(),
)
.await
{
Ok((new_reconstruct_state, result)) => {
reconstruct_state = new_reconstruct_state;
result
}
result = match layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
) {
Ok(result) => result,
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
@@ -3230,14 +3081,15 @@ impl Timeline {
// Add it to the layer map
let l = Arc::new(new_delta);
let mut guard = self.layers.write().await;
let (layers, mapping) = &mut *guard;
let (layers, _) = &mut *guard;
let mut batch_updates = layers.batch_update();
l.access_stats().record_residence_event(
&batch_updates,
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
self.insert_historic_layer(l, &mut batch_updates, mapping);
batch_updates.insert_historic(l.layer_desc().clone());
self.layer_cache.create_new_layer(l);
batch_updates.flush();
// update metrics
@@ -3466,7 +3318,7 @@ impl Timeline {
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
let mut guard = self.layers.write().await;
let (layers, mapping) = &mut *guard;
let (layers, _) = &mut *guard;
let mut updates = layers.batch_update();
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
@@ -3488,7 +3340,8 @@ impl Timeline {
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
self.insert_historic_layer(l, &mut updates, mapping);
updates.insert_historic(l.layer_desc().clone());
self.layer_cache.create_new_layer(l);
}
updates.flush();
drop_wlock(guard);
@@ -3644,14 +3497,9 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
}
impl Timeline {
/// Level0 files first phase of compaction, explained in the [`compact_inner`] comment.
///
/// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
/// start of level0 files compaction, the on-demand download should be revisited as well.
fn compact_level0_phase1(
self: Arc<Self>,
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
_layer_removal_cs: LayerDeletionGuard,
guard: tokio::sync::OwnedRwLockReadGuard<(LayerMap, LayerFileManager)>,
mut stats: CompactLevel0Phase1StatsBuilder,
target_file_size: u64,
@@ -3659,11 +3507,11 @@ impl Timeline {
) -> Result<CompactLevel0Phase1Result, CompactionError> {
stats.read_lock_held_spawn_blocking_startup_micros =
stats.read_lock_acquisition_micros.till_now(); // set by caller
let (layers, mapping) = &*guard;
let (layers, _) = &*guard;
let level0_deltas = layers.get_level0_deltas()?;
let mut level0_deltas = level0_deltas
.into_iter()
.map(|x| mapping.get_from_desc(&x))
.map(|x| self.layer_cache.get_from_desc(&x))
.collect_vec();
stats.level0_deltas_count = Some(level0_deltas.len());
// Only compact if enough layers have accumulated.
@@ -4024,7 +3872,7 @@ impl Timeline {
///
async fn compact_level0(
self: &Arc<Self>,
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
layer_removal_cs: LayerDeletionGuard,
target_file_size: u64,
ctx: &RequestContext,
) -> Result<(), CompactionError> {
@@ -4079,7 +3927,7 @@ impl Timeline {
}
let mut guard = self.layers.write().await;
let (layers, mapping) = &mut *guard;
let (layers, _) = &mut *guard;
let mut updates = layers.batch_update();
let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
for l in new_layers {
@@ -4111,7 +3959,8 @@ impl Timeline {
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
self.insert_historic_layer(x, &mut updates, mapping);
updates.insert_historic(x.layer_desc().clone());
self.layer_cache.create_new_layer(x);
}
// Now that we have reshuffled the data to set of new delta layers, we can
@@ -4119,10 +3968,7 @@ impl Timeline {
let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
for l in deltas_to_compact {
layer_names_to_delete.push(l.filename());
// NB: the layer file identified by descriptor `l` is guaranteed to be present
// in the LayerFileManager because we kept holding `layer_removal_cs` the entire
// time, even though we dropped `Timeline::layers` inbetween.
self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates, mapping)?;
self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?;
}
updates.flush();
drop_wlock(guard);
@@ -4243,7 +4089,7 @@ impl Timeline {
fail_point!("before-timeline-gc");
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
let layer_removal_cs = self.layer_cache.delete_guard().await;
// Is the timeline being deleted?
if self.is_stopping() {
anyhow::bail!("timeline is Stopping");
@@ -4281,7 +4127,7 @@ impl Timeline {
async fn gc_timeline(
&self,
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
layer_removal_cs: LayerDeletionGuard,
horizon_cutoff: Lsn,
pitr_cutoff: Lsn,
retain_lsns: Vec<Lsn>,
@@ -4343,7 +4189,7 @@ impl Timeline {
//
// TODO holding a write lock is too agressive and avoidable
let mut guard = self.layers.write().await;
let (layers, mapping) = &mut *guard;
let (layers, _) = &mut *guard;
'outer: for l in layers.iter_historic_layers() {
result.layers_total += 1;
@@ -4459,7 +4305,6 @@ impl Timeline {
layer_removal_cs.clone(),
doomed_layer,
&mut updates,
mapping,
)?; // FIXME: schedule succeeded deletions before returning?
result.layers_removed += 1;
}
@@ -4645,13 +4490,14 @@ impl Timeline {
// Download complete. Replace the RemoteLayer with the corresponding
// Delta- or ImageLayer in the layer map.
let mut guard = self_clone.layers.write().await;
let (layers, mapping) = &mut *guard;
let (layers, _) = &mut *guard;
let updates = layers.batch_update();
let new_layer =
remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size);
{
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
let failure = match mapping.replace_and_verify(l, new_layer) {
let failure = match self_clone.layer_cache.replace_and_verify(l, new_layer)
{
Ok(()) => false,
Err(e) => {
// this is a precondition failure, the layer filename derived
@@ -4780,10 +4626,10 @@ impl Timeline {
let mut downloads = Vec::new();
{
let guard = self.layers.read().await;
let (layers, mapping) = &*guard;
let (layers, _) = &*guard;
layers
.iter_historic_layers()
.map(|l| mapping.get_from_desc(&l))
.map(|l| self.layer_cache.get_from_desc(&l))
.filter_map(|l| l.downcast_remote_layer())
.map(|l| self.download_remote_layer(l))
.for_each(|dl| downloads.push(dl))
@@ -4885,7 +4731,7 @@ impl LocalLayerInfoForDiskUsageEviction {
impl Timeline {
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
let guard = self.layers.read().await;
let (layers, mapping) = &*guard;
let (layers, _) = &*guard;
let mut max_layer_size: Option<u64> = None;
let mut resident_layers = Vec::new();
@@ -4894,7 +4740,7 @@ impl Timeline {
let file_size = l.file_size();
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
let l = mapping.get_from_desc(&l);
let l = self.layer_cache.get_from_desc(&l);
if l.is_remote_layer() {
continue;

View File

@@ -198,10 +198,10 @@ impl Timeline {
// So, we just need to deal with this.
let candidates: Vec<Arc<dyn PersistentLayer>> = {
let guard = self.layers.read().await;
let (layers, mapping) = &*guard;
let (layers, _) = &*guard;
let mut candidates = Vec::new();
for hist_layer in layers.iter_historic_layers() {
let hist_layer = mapping.get_from_desc(&hist_layer);
let hist_layer = self.layer_cache.get_from_desc(&hist_layer);
if hist_layer.is_remote_layer() {
continue;
}

View File

@@ -2675,6 +2675,7 @@ bool
neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
{
XLogRecPtr end_recptr = record->EndRecPtr;
XLogRecPtr prev_end_recptr = record->ReadRecPtr - 1;
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blkno;
@@ -2718,15 +2719,16 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
no_redo_needed = buffer < 0;
/* In both cases st lwlsn past this WAL record */
SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
/* we don't have the buffer in memory, update lwLsn past this record,
* also evict page fro file cache
*/
/* we don't have the buffer in memory, update lwLsn past this record */
if (no_redo_needed)
{
SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
lfc_evict(rnode, forknum, blkno);
}
else
{
SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno);
}
LWLockRelease(partitionLock);
@@ -2734,10 +2736,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
if (get_cached_relsize(rnode, forknum, &relsize))
{
if (relsize < blkno + 1)
{
update_cached_relsize(rnode, forknum, blkno + 1);
SetLastWrittenLSNForRelation(end_recptr, rnode, forknum);
}
}
else
{
@@ -2769,7 +2768,6 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
Assert(nbresponse->n_blocks > blkno);
set_cached_relsize(rnode, forknum, nbresponse->n_blocks);
SetLastWrittenLSNForRelation(end_recptr, rnode, forknum);
elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
}

View File

@@ -1,6 +1,7 @@
import shutil
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Tuple
import pytest
@@ -427,14 +428,14 @@ def poor_mans_du(
largest_layer = 0
smallest_layer = None
for tenant_id, timeline_id in timelines:
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}"
total = 0
for file in timeline_dir.iterdir():
dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
assert dir.exists(), f"timeline dir does not exist: {dir}"
sum = 0
for file in dir.iterdir():
if "__" not in file.name:
continue
size = file.stat().st_size
total += size
sum += size
largest_layer = max(largest_layer, size)
if smallest_layer:
smallest_layer = min(smallest_layer, size)
@@ -442,8 +443,8 @@ def poor_mans_du(
smallest_layer = size
log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}")
log.info(f"{tenant_id}/{timeline_id}: sum {total}")
total_on_disk += total
log.info(f"{tenant_id}/{timeline_id}: sum {sum}")
total_on_disk += sum
assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0
return (total_on_disk, largest_layer, smallest_layer or 0)

View File

@@ -58,7 +58,7 @@ def test_basic_eviction(
for sk in env.safekeepers:
sk.stop()
timeline_path = env.timeline_dir(tenant_id, timeline_id)
timeline_path = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
initial_local_layers = sorted(
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
)

View File

@@ -535,7 +535,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
"pitr_interval": "0s",
}
)
timeline_path = env.timeline_dir(tenant_id, timeline_id)
timeline_path = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
client = env.pageserver.http_client()

View File

@@ -632,14 +632,14 @@ def test_ignored_tenant_download_missing_layers(
# ignore the tenant and remove its layers
pageserver_http.tenant_ignore(tenant_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
layers_removed = False
for dir_entry in timeline_dir.iterdir():
for dir_entry in tenant_timeline_dir.iterdir():
if dir_entry.name.startswith("00000"):
# Looks like a layer file. Remove it
dir_entry.unlink()
layers_removed = True
assert layers_removed, f"Found no layers for tenant {timeline_dir}"
assert layers_removed, f"Found no layers for tenant {tenant_timeline_dir}"
# now, load it from the local files and expect it to work due to remote storage restoration
pageserver_http.tenant_load(tenant_id=tenant_id)
@@ -688,14 +688,14 @@ def test_ignored_tenant_stays_broken_without_metadata(
# ignore the tenant and remove its metadata
pageserver_http.tenant_ignore(tenant_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
metadata_removed = False
for dir_entry in timeline_dir.iterdir():
for dir_entry in tenant_timeline_dir.iterdir():
if dir_entry.name == "metadata":
# Looks like a layer file. Remove it
dir_entry.unlink()
metadata_removed = True
assert metadata_removed, f"Failed to find metadata file in {timeline_dir}"
assert metadata_removed, f"Failed to find metadata file in {tenant_timeline_dir}"
env.pageserver.allowed_errors.append(
f".*{tenant_id}.*: load failed.*: failed to load metadata.*"

View File

@@ -214,7 +214,9 @@ def switch_pg_to_new_pageserver(
endpoint.start()
timeline_to_detach_local_path = env.timeline_dir(tenant_id, timeline_id)
timeline_to_detach_local_path = (
env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
)
files_before_detach = os.listdir(timeline_to_detach_local_path)
assert (
"metadata" in files_before_detach
@@ -417,6 +419,8 @@ def test_tenant_relocation(
new_pageserver_http.tenant_attach(tenant_id)
# wait for tenant to finish attaching
tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id)
assert tenant_status["state"]["slug"] in ["Attaching", "Active"]
wait_until(
number_of_iterations=10,
interval=1,

View File

@@ -257,7 +257,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
env.endpoints.stop_all()
env.pageserver.stop()
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
local_layer_truncated = None
for path in Path.iterdir(timeline_dir):
if path.name.startswith("00000"):