Compare commits

..

22 Commits

Author SHA1 Message Date
Alex Chi
19180e167f remove LayerKey usage
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-28 10:00:02 -04:00
Alex Chi
b2cd142836 fix tests
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-28 09:58:31 -04:00
Alex Chi
f2d7baf0ba rename DeleteGuard -> LayerDeletionGuard
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 16:57:11 -04:00
Alex Chi
113a4256d4 rename lcache to layer_cache
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 16:55:56 -04:00
Alex Chi
be4999713a add comments for LayerCache
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 16:54:51 -04:00
Alex Chi
7335f155c3 fmt
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 10:31:15 -04:00
Alex Chi
a1ca70ff35 Merge branch 'skyzh/layermap-ref-2' of https://github.com/neondatabase/neon into skyzh/layermap-as-cache 2023-06-27 10:26:13 -04:00
Alex Chi
ce1e57faea fix merge conflicts
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 10:25:06 -04:00
Alex Chi
6f50bec781 fix merge conflicts
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 10:24:48 -04:00
Alex Chi
b981702ecf Merge branch 'skyzh/layermap-ref-2' of https://github.com/neondatabase/neon into skyzh/layermap-as-cache 2023-06-27 10:23:08 -04:00
Alex Chi
21d30fc43f use layer_desc key for replace cmp
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 10:16:30 -04:00
Alex Chi
137ad83f37 fix merge conflicts
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 10:14:09 -04:00
Alex Chi
22da36bc02 fix merge conflicts
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-27 10:04:21 -04:00
Alex Chi
900ef3d92b Merge branch 'main' of https://github.com/neondatabase/neon into skyzh/layermap-ref-2 2023-06-27 10:02:57 -04:00
Alex Chi
b7923fa0be use new errmsg
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-23 16:27:49 -04:00
Alex Chi
4c4a531d5e rename LayerMapping -> LayerFileManager
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-23 15:52:09 -04:00
Alex Chi
2b4f96345b resolve comments
Signed-off-by: Alex Chi <chi@neon.tech>
2023-06-23 15:32:57 -04:00
Alex Chi
b775ca8a58 resolve conflicts
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-06-20 09:52:28 -04:00
Alex Chi
ddb5862be2 Merge branch 'main' of https://github.com/neondatabase/neon into skyzh/layermap-ref-2 2023-06-20 09:12:15 -04:00
Alex Chi
a2056666ae pgserver: move mapping logic to layer cache
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-06-14 15:07:38 -04:00
Alex Chi
fc190a2a19 resolve merge conflicts
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-06-13 13:56:50 -04:00
Alex Chi
faee3152f3 refactor: use LayerDesc in LayerMap (part 2)
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-06-13 13:54:59 -04:00
47 changed files with 643 additions and 2177 deletions

View File

@@ -722,35 +722,6 @@ jobs:
--dockerfile Dockerfile.compute-node
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--cleanup
# Due to a kaniko bug, we can't use the cache for the extensions image, so it takes about the same time to build as the compute-node image (~10 min)
# During the transition period we need to have extensions in both places (in S3 and in compute-node image),
# so we don't build extensions twice, but extract them from the compute-node image.
#
# For now we use the extensions image only for new custom extensions
- name: Kaniko build extensions only
run: |
# Kaniko is supposed to clean up after itself if the --cleanup flag is set, but it doesn't.
# Although some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
# it still fails with error:
# error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
#
# Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--context . \
--build-arg GIT_VERSION=${{ github.sha }} \
--build-arg PG_VERSION=${{ matrix.version }} \
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
--dockerfile Dockerfile.compute-node \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--cleanup \
--target postgres-extensions
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
@@ -869,10 +840,8 @@ jobs:
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest
- name: Push images to production ECR
if: |
@@ -883,10 +852,8 @@ jobs:
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest
- name: Configure Docker Hub login
run: |
@@ -908,89 +875,16 @@ jobs:
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
upload-postgres-extensions-to-s3:
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
needs: [ tag, promote-images ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]
env:
# During the transition period we extract public extensions from the compute-node image and custom extensions from the extensions image.
# Later all the extensions will be moved to extensions image.
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
S3_BUCKETS: |
${{ github.ref_name == 'release' &&
'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
steps:
- name: Pull postgres-extensions image
run: |
docker pull ${EXTENSIONS_IMAGE}
docker pull ${COMPUTE_NODE_IMAGE}
- name: Create postgres-extensions container
id: create-container
run: |
EID=$(docker create ${EXTENSIONS_IMAGE} true)
echo "EID=${EID}" >> $GITHUB_OUTPUT
CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
echo "CID=${CID}" >> $GITHUB_OUTPUT
- name: Extract postgres-extensions from container
run: |
rm -rf ./extensions-to-upload ./custom-extensions # Just in case
# The compute image has a slightly different directory layout
mkdir -p extensions-to-upload/share
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib
# Delete Neon extensions (they are always present in the compute-node image)
rm -rf ./extensions-to-upload/share/extension/neon*
rm -rf ./extensions-to-upload/lib/neon*
docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
for EXT_NAME in $(ls ./custom-extensions); do
mkdir -p ./extensions-to-upload/${EXT_NAME}/share
mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib
done
- name: Upload postgres-extensions to S3
run: |
for BUCKET in $(echo ${S3_BUCKETS}); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
done
- name: Cleanup
if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
run: |
docker rm ${{ steps.create-container.outputs.CID }} || true
docker rm ${{ steps.create-container.outputs.EID }} || true
deploy:
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
needs: [ promote-images, tag, regress-tests ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
steps:
- name: Fix git ownership

Cargo.lock generated
View File

@@ -924,14 +924,12 @@ dependencies = [
"opentelemetry",
"postgres",
"regex",
"remote_storage",
"reqwest",
"serde",
"serde_json",
"tar",
"tokio",
"tokio-postgres",
"toml_edit",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
@@ -999,7 +997,6 @@ dependencies = [
"tar",
"thiserror",
"toml",
"tracing",
"url",
"utils",
"workspace_hack",

View File

@@ -515,26 +515,6 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
#########################################################################################
#
# Layer "pg-anon-pg-build"
# compile anon extension
#
#########################################################################################
FROM build-deps AS pg-anon-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# Kaniko doesn't allow `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sort > /before.txt && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
find /usr/local/pgsql -type f | sort > /after.txt && \
/bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
#########################################################################################
#
# Layer "rust extensions"
@@ -643,7 +623,6 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -
#
#########################################################################################
FROM build-deps AS neon-pg-ext-build
# Public extensions
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=postgis-build /sfcgal/* /
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -719,22 +698,6 @@ RUN rm -r /usr/local/pgsql/include
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Extensions only
#
#########################################################################################
FROM scratch AS postgres-extensions
# After the transition this layer will include all extensions.
# For now, it's only for new custom ones
#
# # Default extensions
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib
# Custom extensions
COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
#########################################################################################
#
# Final layer

View File

@@ -30,5 +30,3 @@ url.workspace = true
compute_api.workspace = true
utils.workspace = true
workspace_hack.workspace = true
toml_edit.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }

View File

@@ -5,8 +5,6 @@
//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
//! - Every start is a fresh start, so the data directory is removed and
//! initialized again on each run.
//! - If remote_extension_config is provided, it will be used to fetch the extension list
//! and download `shared_preload_libraries` from the remote storage.
//! - Next it will put configuration files into the `PGDATA` directory.
//! - Sync safekeepers and get commit LSN.
//! - Get `basebackup` from pageserver using the LSN returned in the previous step.
@@ -29,8 +27,7 @@
//! compute_ctl -D /var/db/postgres/compute \
//! -C 'postgresql://cloud_admin@localhost/postgres' \
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres \
//! -r {"bucket": "my-bucket", "region": "eu-central-1", "endpoint": "http:://localhost:9000"} \
//! -b /usr/local/bin/postgres
//! ```
//!
use std::collections::HashMap;
@@ -38,7 +35,7 @@ use std::fs::File;
use std::panic;
use std::path::Path;
use std::process::exit;
use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock};
use std::sync::{mpsc, Arc, Condvar, Mutex};
use std::{thread, time::Duration};
use anyhow::{Context, Result};
@@ -51,7 +48,6 @@ use compute_api::responses::ComputeStatus;
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::{get_pg_version, init_remote_storage};
use compute_tools::http::api::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
@@ -68,14 +64,6 @@ fn main() -> Result<()> {
info!("build_tag: {build_tag}");
let matches = cli().get_matches();
let pgbin_default = String::from("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
let remote_ext_config = matches.get_one::<String>("remote-ext-config");
let ext_remote_storage = remote_ext_config.map(|x| {
init_remote_storage(x, build_tag)
.expect("cannot initialize remote extension storage from config")
});
let http_port = *matches
.get_one::<u16>("http-port")
@@ -140,6 +128,9 @@ fn main() -> Result<()> {
let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
// Try to use just 'postgres' if no path is provided
let pgbin = matches.get_one::<String>("pgbin").unwrap();
let spec;
let mut live_config_allowed = false;
match spec_json {
@@ -177,7 +168,6 @@ fn main() -> Result<()> {
let mut new_state = ComputeState::new();
let spec_set;
if let Some(spec) = spec {
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
new_state.pspec = Some(pspec);
@@ -189,13 +179,9 @@ fn main() -> Result<()> {
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
pgversion: get_pg_version(pgbin),
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_remote_storage,
available_libraries: OnceLock::new(),
available_extensions: OnceLock::new(),
};
let compute = Arc::new(compute_node);
@@ -204,8 +190,6 @@ fn main() -> Result<()> {
let _http_handle =
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
let extension_server_port: u16 = http_port;
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
@@ -246,7 +230,7 @@ fn main() -> Result<()> {
// Start Postgres
let mut delay_exit = false;
let mut exit_code = None;
let pg = match compute.start_compute(extension_server_port) {
let pg = match compute.start_compute() {
Ok(pg) => Some(pg),
Err(err) => {
error!("could not start the compute node: {:?}", err);
@@ -365,12 +349,6 @@ fn cli() -> clap::Command {
.long("control-plane-uri")
.value_name("CONTROL_PLANE_API_BASE_URI"),
)
.arg(
Arg::new("remote-ext-config")
.short('r')
.long("remote-ext-config")
.value_name("REMOTE_EXT_CONFIG"),
)
}
#[test]

View File

@@ -1,15 +1,13 @@
use std::collections::HashMap;
use std::fs;
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::{Condvar, Mutex, OnceLock};
use std::sync::{Condvar, Mutex};
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use postgres::{Client, NoTls};
use tokio;
use tokio_postgres;
use tracing::{info, instrument, warn};
use utils::id::{TenantId, TimelineId};
@@ -18,12 +16,9 @@ use utils::lsn::Lsn;
use compute_api::responses::{ComputeMetrics, ComputeStatus};
use compute_api::spec::{ComputeMode, ComputeSpec};
use remote_storage::{GenericRemoteStorage, RemotePath};
use crate::extension_server::PathAndFlag;
use crate::config;
use crate::pg_helpers::*;
use crate::spec::*;
use crate::{config, extension_server};
/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
@@ -31,7 +26,6 @@ pub struct ComputeNode {
pub connstr: url::Url,
pub pgdata: String,
pub pgbin: String,
pub pgversion: String,
/// We should only allow live re- / configuration of the compute node if
/// it uses 'pull model', i.e. it can go to control-plane and fetch
/// the latest configuration. Otherwise, there could be a case:
@@ -51,11 +45,6 @@ pub struct ComputeNode {
pub state: Mutex<ComputeState>,
/// `Condvar` to allow notifying waiters about state changes.
pub state_changed: Condvar,
/// the S3 bucket that we search for extensions in
pub ext_remote_storage: Option<GenericRemoteStorage>,
// cached lists of available extensions and libraries
pub available_libraries: OnceLock<HashMap<String, Vec<RemotePath>>>,
pub available_extensions: OnceLock<HashMap<String, Vec<PathAndFlag>>>,
}
#[derive(Clone, Debug)]
@@ -246,7 +235,7 @@ impl ComputeNode {
// Get basebackup from the libpq connection to pageserver using `connstr` and
// unarchive it to `pgdata` directory overriding all its previous content.
#[instrument(skip_all, fields(%lsn))]
#[instrument(skip(self, compute_state))]
fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let start_time = Utc::now();
@@ -288,7 +277,7 @@ impl ComputeNode {
// Run `postgres` in a special mode with `--sync-safekeepers` argument
// and return the reported LSN back to the caller.
#[instrument(skip_all)]
#[instrument(skip(self, storage_auth_token))]
fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
let start_time = Utc::now();
@@ -333,23 +322,15 @@ impl ComputeNode {
/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
#[instrument(skip_all)]
pub fn prepare_pgdata(
&self,
compute_state: &ComputeState,
extension_server_port: u16,
) -> Result<()> {
#[instrument(skip(self, compute_state))]
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = &pspec.spec;
let pgdata_path = Path::new(&self.pgdata);
// Remove/create an empty pgdata directory and put configuration there.
self.create_pgdata()?;
config::write_postgres_conf(
&pgdata_path.join("postgresql.conf"),
&pspec.spec,
Some(extension_server_port),
)?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
// Syncing safekeepers is only safe with primary nodes: if a primary
// is already connected it will be kicked out, so a secondary (standby)
@@ -399,7 +380,7 @@ impl ComputeNode {
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
#[instrument(skip_all)]
#[instrument(skip(self))]
pub fn start_postgres(
&self,
storage_auth_token: Option<String>,
@@ -423,7 +404,7 @@ impl ComputeNode {
}
/// Do initial configuration of the already started Postgres.
#[instrument(skip_all)]
#[instrument(skip(self, compute_state))]
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
// If connection fails,
// it may be the old node with `zenith_admin` superuser.
@@ -477,7 +458,7 @@ impl ComputeNode {
// We could've wrapped this around `pg_ctl reload`, but right now we don't use
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
// have opened connection to Postgres and superuser access.
#[instrument(skip_all)]
#[instrument(skip(self, client))]
fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
client.simple_query("SELECT pg_reload_conf()")?;
Ok(())
@@ -485,13 +466,13 @@ impl ComputeNode {
/// Similar to `apply_config()`, but performs a slightly different sequence of operations,
/// as it's used to reconfigure a previously started and configured Postgres node.
#[instrument(skip_all)]
#[instrument(skip(self))]
pub fn reconfigure(&self) -> Result<()> {
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
// Write new config
let pgdata_path = Path::new(&self.pgdata);
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
self.pg_reload_conf(&mut client)?;
@@ -520,8 +501,8 @@ impl ComputeNode {
Ok(())
}
#[instrument(skip_all)]
pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
#[instrument(skip(self))]
pub fn start_compute(&self) -> Result<std::process::Child> {
let compute_state = self.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
info!(
@@ -532,12 +513,7 @@ impl ComputeNode {
pspec.timeline_id,
);
// TODO FIXME: this should not be blocking here
// Maybe we can run it as a child process?
// Also, worth measuring how long this step is taking
self.prepare_external_extensions(&compute_state)?;
self.prepare_pgdata(&compute_state, extension_server_port)?;
self.prepare_pgdata(&compute_state)?;
let start_time = Utc::now();
@@ -673,129 +649,4 @@ LIMIT 100",
"{{\"pg_stat_statements\": []}}".to_string()
}
}
// If remote extension storage is configured,
// download extension control files
// and shared preload libraries.
#[tokio::main]
pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
if let Some(ref ext_remote_storage) = self.ext_remote_storage {
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
// download preload shared libraries before postgres start (if any)
let spec = &pspec.spec;
// 1. parse custom extension paths from spec
let custom_ext_prefixes = match &spec.custom_extensions {
Some(custom_extensions) => custom_extensions.clone(),
None => Vec::new(),
};
info!("custom_ext_prefixes: {:?}", &custom_ext_prefixes);
// 2. parse shared_preload_libraries from spec
let mut libs_vec = Vec::new();
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
libs_vec = libs
.split(&[',', '\'', ' '])
.filter(|s| *s != "neon" && !s.is_empty())
.map(str::to_string)
.collect();
}
info!(
"shared_preload_libraries parsed from spec.cluster.settings: {:?}",
libs_vec
);
// also parse shared_preload_libraries from provided postgresql.conf
// that is used in neon_local and python tests
if let Some(conf) = &spec.cluster.postgresql_conf {
let conf_lines = conf.split('\n').collect::<Vec<&str>>();
let mut shared_preload_libraries_line = "";
for line in conf_lines {
if line.starts_with("shared_preload_libraries") {
shared_preload_libraries_line = line;
}
}
let mut preload_libs_vec = Vec::new();
if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
preload_libs_vec = libs
.split(&[',', '\'', ' '])
.filter(|s| *s != "neon" && !s.is_empty())
.map(str::to_string)
.collect();
}
info!(
"shared_preload_libraries parsed from spec.cluster.postgresql_conf: {:?}",
preload_libs_vec
);
libs_vec.extend(preload_libs_vec);
}
info!("Libraries to download: {:?}", &libs_vec);
// download extension control files & shared_preload_libraries
let get_extensions_task = extension_server::get_available_extensions(
ext_remote_storage,
&self.pgbin,
&self.pgversion,
&custom_ext_prefixes,
);
let get_libraries_task = extension_server::get_available_libraries(
ext_remote_storage,
&self.pgbin,
&self.pgversion,
&custom_ext_prefixes,
&libs_vec,
);
let (available_extensions, available_libraries) =
tokio::join!(get_extensions_task, get_libraries_task);
self.available_extensions
.set(available_extensions?)
.expect("available_extensions.set error");
self.available_libraries
.set(available_libraries?)
.expect("available_libraries.set error");
}
Ok(())
}
pub async fn download_extension_files(&self, filename: String) -> Result<()> {
match &self.ext_remote_storage {
None => anyhow::bail!("No remote extension storage"),
Some(remote_storage) => {
extension_server::download_extension_files(
&filename,
remote_storage,
&self.pgbin,
self.available_extensions
.get()
.context("available_extensions broke")?,
)
.await
}
}
}
pub async fn download_library_file(&self, filename: String) -> Result<()> {
match &self.ext_remote_storage {
None => anyhow::bail!("No remote extension storage"),
Some(remote_storage) => {
extension_server::download_library_file(
&filename,
remote_storage,
&self.pgbin,
self.available_libraries
.get()
.context("available_libraries broke")?,
)
.await
}
}
}
}

View File

@@ -33,11 +33,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
}
/// Create or completely rewrite configuration file specified by `path`
pub fn write_postgres_conf(
path: &Path,
spec: &ComputeSpec,
extension_server_port: Option<u16>,
) -> Result<()> {
pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
// File::create() destroys the file content if it exists.
let mut file = File::create(path)?;
@@ -99,9 +95,5 @@ pub fn write_postgres_conf(
writeln!(file, "# Managed by compute_ctl: end")?;
}
if let Some(port) = extension_server_port {
writeln!(file, "neon.extension_server_port={}", port)?;
}
Ok(())
}

View File

@@ -8,7 +8,7 @@ use compute_api::responses::ComputeStatus;
use crate::compute::ComputeNode;
#[instrument(skip_all)]
#[instrument(skip(compute))]
fn configurator_main_loop(compute: &Arc<ComputeNode>) {
info!("waiting for reconfiguration requests");
loop {

View File

@@ -1,428 +0,0 @@
// Download extension files from the extension store
// and put them in the right place in the postgres directory
use anyhow::{self, bail, Context, Result};
use futures::future::join_all;
use remote_storage::*;
use serde_json::{self, Value};
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufWriter, Write};
use std::num::{NonZeroU32, NonZeroUsize};
use std::path::{Path, PathBuf};
use std::str;
use tokio::io::AsyncReadExt;
use tracing::info;
// remote!
const SHARE_EXT_PATH: &str = "share/extension";
fn pass_any_error(results: Vec<Result<()>>) -> Result<()> {
for result in results {
result?;
}
Ok(())
}
fn get_pg_config(argument: &str, pgbin: &str) -> String {
// gives the result of `pg_config [argument]`
// where argument is a flag like `--version` or `--sharedir`
let pgconfig = pgbin.replace("postgres", "pg_config");
let config_output = std::process::Command::new(pgconfig)
.arg(argument)
.output()
.expect("pg_config error");
std::str::from_utf8(&config_output.stdout)
.expect("pg_config error")
.trim()
.to_string()
}
pub fn get_pg_version(pgbin: &str) -> String {
// pg_config --version returns a (platform specific) human readable string
// such as "PostgreSQL 15.4". We parse this to v14/v15
let human_version = get_pg_config("--version", pgbin);
if human_version.contains("15") {
return "v15".to_string();
} else if human_version.contains("14") {
return "v14".to_string();
}
panic!("Unsuported postgres version {human_version}");
}
async fn download_helper(
remote_storage: &GenericRemoteStorage,
remote_from_path: RemotePath,
sub_directory: Option<&str>,
download_location: &Path,
) -> anyhow::Result<()> {
// downloads file at remote_from_path to
// `download_location/[optional: subdirectory]/[remote_storage.object_name()]`
// Note: the subdirectory argument is needed when there is an extension that
// depends on files in a subdirectory.
// For example, v14/share/extension/some_ext.control
// might depend on v14/share/extension/some_ext/some_ext--1.1.0.sql
// and v14/share/extension/some_ext/xxx.csv
// Note: it is the caller's responsibility to create the appropriate subdirectory
let local_path = match sub_directory {
Some(subdir) => download_location
.join(subdir)
.join(remote_from_path.object_name().expect("bad object")),
None => download_location.join(remote_from_path.object_name().expect("bad object")),
};
if local_path.exists() {
info!("File {:?} already exists. Skipping download", &local_path);
return Ok(());
}
info!(
"Downloading {:?} to location {:?}",
&remote_from_path, &local_path
);
let mut download = remote_storage.download(&remote_from_path).await?;
let mut write_data_buffer = Vec::new();
download
.download_stream
.read_to_end(&mut write_data_buffer)
.await?;
let mut output_file = BufWriter::new(File::create(local_path)?);
output_file.write_all(&write_data_buffer)?;
info!("Download {:?} completed successfully", &remote_from_path);
Ok(())
}
// download extension control files
//
// if custom_ext_prefixes is provided - search also in custom extension paths
//
pub async fn get_available_extensions(
remote_storage: &GenericRemoteStorage,
pgbin: &str,
pg_version: &str,
custom_ext_prefixes: &Vec<String>,
) -> anyhow::Result<HashMap<String, Vec<PathAndFlag>>> {
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
// public path, plus any private paths to download extensions from
let mut paths: Vec<RemotePath> = Vec::new();
paths.push(RemotePath::new(
&Path::new(pg_version).join(SHARE_EXT_PATH),
)?);
for custom_prefix in custom_ext_prefixes {
paths.push(RemotePath::new(
&Path::new(pg_version)
.join(custom_prefix)
.join(SHARE_EXT_PATH),
)?);
}
let (extension_files, control_files) =
organized_extension_files(remote_storage, &paths).await?;
let mut control_file_download_tasks = Vec::new();
// download all control files
for control_file in control_files {
control_file_download_tasks.push(download_helper(
remote_storage,
control_file.clone(),
None,
&local_sharedir,
));
}
pass_any_error(join_all(control_file_download_tasks).await)?;
Ok(extension_files)
}
// Download requested shared_preload_libraries
//
// Note that tenant_id is not optional here, because we only download libraries
// after we know the tenant spec and the tenant_id.
//
// return the list of all library files, to be used in future searches
pub async fn get_available_libraries(
remote_storage: &GenericRemoteStorage,
pgbin: &str,
pg_version: &str,
custom_ext_prefixes: &Vec<String>,
preload_libraries: &Vec<String>,
) -> anyhow::Result<HashMap<String, Vec<RemotePath>>> {
// Construct a hashmap of all available libraries
// example (key, value) pair: test_lib0: [RemotePath(v14/lib/test_lib0.so), RemotePath(v14/lib/test_lib0.so.3)]
let mut paths: Vec<RemotePath> = Vec::new();
// public libraries
paths.push(
RemotePath::new(&Path::new(&pg_version).join("lib/"))
.expect("The hard coded path here is valid"),
);
// custom libraries
for custom_prefix in custom_ext_prefixes {
paths.push(
RemotePath::new(&Path::new(&pg_version).join(custom_prefix).join("lib"))
.expect("The hard coded path here is valid"),
);
}
let all_available_libraries = organized_library_files(remote_storage, &paths).await?;
info!("list of library files {:?}", &all_available_libraries);
// download all requested libraries
let mut download_tasks = Vec::new();
for lib_name in preload_libraries {
download_tasks.push(download_library_file(
lib_name,
remote_storage,
pgbin,
&all_available_libraries,
));
}
pass_any_error(join_all(download_tasks).await)?;
Ok(all_available_libraries)
}
// download all SQL files (and possibly data files) for a given extension name
//
pub async fn download_extension_files(
ext_name: &str,
remote_storage: &GenericRemoteStorage,
pgbin: &str,
all_available_files: &HashMap<String, Vec<PathAndFlag>>,
) -> Result<()> {
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
let mut downloaded_something = false;
let mut made_subdir = false;
info!("EXTENSION {:?}", ext_name);
info!("{:?}", all_available_files.get(ext_name));
info!("start download");
let mut download_tasks = Vec::new();
if let Some(files) = all_available_files.get(ext_name) {
info!("Downloading files for extension {:?}", &ext_name);
for path_and_flag in files {
let file = &path_and_flag.path;
let subdir_flag = path_and_flag.subdir_flag;
info!(
"--- Downloading {:?} (for {:?} as subdir? = {:?})",
&file, &ext_name, subdir_flag
);
let mut subdir = None;
if subdir_flag {
subdir = Some(ext_name);
if !made_subdir {
made_subdir = true;
std::fs::create_dir_all(local_sharedir.join(ext_name))?;
}
}
download_tasks.push(download_helper(
remote_storage,
file.clone(),
subdir,
&local_sharedir,
));
downloaded_something = true;
}
}
if !downloaded_something {
bail!("Files for extension {ext_name} are not found in the extension store");
}
pass_any_error(join_all(download_tasks).await)?;
info!("finish download");
Ok(())
}
// appends an .so suffix to libname if it does not already have one
fn enforce_so_end(libname: &str) -> String {
if !libname.contains(".so") {
format!("{}.so", libname)
} else {
libname.to_string()
}
}
// download shared library file
pub async fn download_library_file(
lib_name: &str,
remote_storage: &GenericRemoteStorage,
pgbin: &str,
all_available_libraries: &HashMap<String, Vec<RemotePath>>,
) -> Result<()> {
let lib_name = get_library_name(lib_name);
let local_libdir: PathBuf = Path::new(&get_pg_config("--pkglibdir", pgbin)).into();
info!("looking for library {:?}", &lib_name);
match all_available_libraries.get(&*lib_name) {
Some(remote_paths) => {
let mut library_download_tasks = Vec::new();
for remote_path in remote_paths {
let file_path = local_libdir.join(remote_path.object_name().expect("bad object"));
if file_path.exists() {
info!("File {:?} already exists. Skipping download", &file_path);
} else {
library_download_tasks.push(download_helper(
remote_storage,
remote_path.clone(),
None,
&local_libdir,
));
}
}
pass_any_error(join_all(library_download_tasks).await)?;
}
None => {
// minor TODO: this logic seems to be somewhat faulty for .so.3 type files?
let lib_name_with_ext = enforce_so_end(&lib_name);
let file_path = local_libdir.join(lib_name_with_ext);
if file_path.exists() {
info!("File {:?} already exists. Skipping download", &file_path);
} else {
bail!("Library file {lib_name} not found")
}
}
}
Ok(())
}
// This function initializes the necessary structs to use remote storage (should be fairly cheap)
pub fn init_remote_storage(
remote_ext_config: &str,
default_prefix: &str,
) -> anyhow::Result<GenericRemoteStorage> {
let remote_ext_config: serde_json::Value = serde_json::from_str(remote_ext_config)?;
let remote_ext_bucket = match &remote_ext_config["bucket"] {
Value::String(x) => x,
_ => bail!("remote_ext_config missing bucket"),
};
let remote_ext_region = match &remote_ext_config["region"] {
Value::String(x) => x,
_ => bail!("remote_ext_config missing region"),
};
let remote_ext_endpoint = match &remote_ext_config["endpoint"] {
Value::String(x) => Some(x.clone()),
_ => None,
};
let remote_ext_prefix = match &remote_ext_config["prefix"] {
Value::String(x) => Some(x.clone()),
// if prefix is not provided, use default, which is the build_tag
_ => Some(default_prefix.to_string()),
};
// load will not be large, so default parameters are fine
let config = S3Config {
bucket_name: remote_ext_bucket.to_string(),
bucket_region: remote_ext_region.to_string(),
prefix_in_bucket: remote_ext_prefix,
endpoint: remote_ext_endpoint,
concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
max_keys_per_list_response: None,
};
let config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
storage: RemoteStorageKind::AwsS3(config),
};
GenericRemoteStorage::from_config(&config)
}
fn get_library_name(path: &str) -> String {
let path_suffix: Vec<&str> = path.split('/').collect();
let path_suffix = path_suffix.last().expect("bad ext name").to_string();
if let Some(index) = path_suffix.find(".so") {
return path_suffix[..index].to_string();
}
path_suffix
}
// asynchronously lists files in all necessary directories
// TODO: potential optimization: do a single list files on the entire bucket
// and then filter out the files we don't need
async fn list_all_files(
remote_storage: &GenericRemoteStorage,
paths: &Vec<RemotePath>,
) -> Result<Vec<RemotePath>> {
let mut list_tasks = Vec::new();
let mut all_files = Vec::new();
for path in paths {
list_tasks.push(remote_storage.list_files(Some(path)));
}
for list_result in join_all(list_tasks).await {
all_files.extend(list_result?);
}
Ok(all_files)
}
// helper to collect all libraries, grouped by library name
// Returns a hashmap of (library name: [paths])
// example entry: {libpgtypes: [libpgtypes.so.3, libpgtypes.so]}
async fn organized_library_files(
remote_storage: &GenericRemoteStorage,
paths: &Vec<RemotePath>,
) -> Result<HashMap<String, Vec<RemotePath>>> {
let mut library_groups = HashMap::new();
for file in list_all_files(remote_storage, paths).await? {
let lib_name = get_library_name(file.get_path().to_str().context("invalid path")?);
let lib_list = library_groups.entry(lib_name).or_insert(Vec::new());
lib_list.push(file.to_owned());
}
Ok(library_groups)
}
// store a path, paired with a flag indicating whether the path is to a file in
// the root or subdirectory
#[derive(Debug)]
pub struct PathAndFlag {
path: RemotePath,
subdir_flag: bool,
}
// get_ext_name extracts the extension name, and returns a flag indicating
// whether this file is in a subdirectory or not.
//
// extension files can be in subdirectories of the extension store.
// examples of layout:
// v14/share/extension/extension_name--1.0.sql,
// v14/share/extension/extension_name/extension_name--1.0.sql,
// v14/share/extension/extension_name/extra_data.csv
// Note: we *assume* that the extension files are in one of these formats.
// If it is not, this code's behavior is *undefined*.
fn get_ext_name(path: &str) -> Result<(&str, bool)> {
let path_suffix: Vec<&str> = path.split(&format!("{SHARE_EXT_PATH}/")).collect();
let ext_name = path_suffix.last().expect("bad ext name");
if let Some(index) = ext_name.find('/') {
return Ok((&ext_name[..index], true));
} else if let Some(index) = ext_name.find("--") {
return Ok((&ext_name[..index], false));
}
Ok((ext_name, false))
}
// helper to collect files of given prefixes for extensions and group them by extension
// returns a hashmap of (extension_name, Vector of remote paths for all files needed for this extension)
// and a list of control files
// For example, an entry in the hashmap could be
// {"anon": [RemotePath("v14/anon/share/extension/anon/address.csv"),
// RemotePath("v14/anon/share/extension/anon/anon--1.1.0.sql")]},
// with corresponding list of control files entry being
// {"anon.control": RemotePath("v14/anon/share/extension/anon.control")}
async fn organized_extension_files(
remote_storage: &GenericRemoteStorage,
paths: &Vec<RemotePath>,
) -> Result<(HashMap<String, Vec<PathAndFlag>>, Vec<RemotePath>)> {
let mut grouped_dependencies = HashMap::new();
let mut control_files = Vec::new();
for file in list_all_files(remote_storage, paths).await? {
if file.extension().context("bad file name")? == "control" {
control_files.push(file.to_owned());
} else {
let (file_ext_name, subdir_flag) =
get_ext_name(file.get_path().to_str().context("invalid path")?)?;
let ext_file_list = grouped_dependencies
.entry(file_ext_name.to_string())
.or_insert(Vec::new());
ext_file_list.push(PathAndFlag {
path: file.to_owned(),
subdir_flag,
});
}
}
Ok((grouped_dependencies, control_files))
}

View File

@@ -121,55 +121,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
// download extension files from S3 on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
info!("req.uri {:?}", req.uri());
let mut is_library = false;
if let Some(params) = req.uri().query() {
info!("serving {:?} POST request with params: {}", route, params);
if params == "is_library=true" {
is_library = true;
} else {
let mut resp = Response::new(Body::from("Wrong request parameters"));
*resp.status_mut() = StatusCode::BAD_REQUEST;
return resp;
}
}
let filename = route.split('/').last().unwrap().to_string();
info!(
"serving /extension_server POST request, filename: {:?} is_library: {}",
filename, is_library
);
if is_library {
match compute.download_library_file(filename.to_string()).await {
Ok(_) => Response::new(Body::from("OK")),
Err(e) => {
error!("library download failed: {}", e);
let mut resp = Response::new(Body::from(e.to_string()));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
}
} else {
match compute.download_extension_files(filename.to_string()).await {
Ok(_) => Response::new(Body::from("OK")),
Err(e) => {
error!("extension download failed: {}", e);
let mut resp = Response::new(Body::from(e.to_string()));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
}
}
}
// Return the `404 Not Found` for any other routes.
_ => {
let mut not_found = Response::new(Body::from("404 Not Found"));

View File

@@ -139,34 +139,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
/extension_server:
post:
tags:
- Extension
summary: Download extension from S3 to local folder.
description: ""
operationId: downloadExtension
responses:
200:
description: Extension downloaded
content:
text/plain:
schema:
type: string
description: Error text or 'OK' if download succeeded.
example: "OK"
400:
description: Request is invalid.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
500:
description: Extension download request failed.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
components:
securitySchemes:

View File

@@ -9,7 +9,6 @@ pub mod http;
#[macro_use]
pub mod logger;
pub mod compute;
pub mod extension_server;
pub mod monitor;
pub mod params;
pub mod pg_helpers;

View File

@@ -215,7 +215,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
/// Wait for Postgres to become ready to accept connections. It's ready to
/// accept connections when the state-field in `pgdata/postmaster.pid` says
/// 'ready'.
#[instrument(skip_all, fields(pgdata = %pgdata.display()))]
#[instrument(skip(pg))]
pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
let pid_path = pgdata.join("postmaster.pid");

View File

@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
// File `postgresql.conf` is no longer included into `basebackup`, so just
// always write all config into it creating new file.
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
update_pg_hba(pgdata_path)?;

View File

@@ -32,4 +32,3 @@ utils.workspace = true
compute_api.workspace = true
workspace_hack.workspace = true
tracing.workspace = true

View File

@@ -308,8 +308,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let mut env =
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
let force = init_match.get_flag("force");
env.init(pg_version, force)
env.init(pg_version)
.context("Failed to initialize neon repository")?;
// Initialize pageserver, create initial tenant and timeline.
@@ -658,8 +657,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
.get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
// If --safekeepers argument is given, use only the listed safekeeper nodes.
let safekeepers =
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -701,7 +698,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
_ => {}
}
println!("Starting existing endpoint {endpoint_id}...");
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
endpoint.start(&auth_token, safekeepers)?;
} else {
let branch_name = sub_args
.get_one::<String>("branch-name")
@@ -745,7 +742,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
pg_version,
mode,
)?;
ep.start(&auth_token, safekeepers, remote_ext_config)?;
ep.start(&auth_token, safekeepers)?;
}
}
"stop" => {
@@ -1005,12 +1002,6 @@ fn cli() -> Command {
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
.required(false);
let remote_ext_config_args = Arg::new("remote-ext-config")
.long("remote-ext-config")
.num_args(1)
.help("Configure the S3 bucket that we search for extensions in.")
.required(false);
let lsn_arg = Arg::new("lsn")
.long("lsn")
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
@@ -1022,13 +1013,6 @@ fn cli() -> Command {
.help("If set, the node will be a hot replica on the specified timeline")
.required(false);
let force_arg = Arg::new("force")
.value_parser(value_parser!(bool))
.long("force")
.action(ArgAction::SetTrue)
.help("Force initialization even if the repository is not empty")
.required(false);
Command::new("Neon CLI")
.arg_required_else_help(true)
.version(GIT_VERSION)
@@ -1044,7 +1028,6 @@ fn cli() -> Command {
.value_name("config"),
)
.arg(pg_version_arg.clone())
.arg(force_arg)
)
.subcommand(
Command::new("timeline")
@@ -1169,7 +1152,6 @@ fn cli() -> Command {
.arg(pg_version_arg)
.arg(hot_standby_arg)
.arg(safekeepers_arg)
.arg(remote_ext_config_args)
)
.subcommand(
Command::new("stop")

View File

@@ -311,7 +311,7 @@ impl Endpoint {
// TODO: use future host field from safekeeper spec
// Pass the list of safekeepers to the replica so that it can connect to any of them,
// whichever is available.
// whichever is availiable.
let sk_ports = self
.env
.safekeepers
@@ -408,12 +408,7 @@ impl Endpoint {
Ok(())
}
pub fn start(
&self,
auth_token: &Option<String>,
safekeepers: Vec<NodeId>,
remote_ext_config: Option<&String>,
) -> Result<()> {
pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
if self.status() == "running" {
anyhow::bail!("The endpoint is already running");
}
@@ -481,13 +476,6 @@ impl Endpoint {
pageserver_connstring: Some(pageserver_connstring),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
// TODO FIXME: This is a hack to test custom extensions locally.
// In test_download_extensions, we assume that the custom extension
// prefix is the tenant ID. So we set it here.
//
// The proper way to implement this is to pass the custom extension
// in spec, but we don't have a way to do that yet in the python tests.
custom_extensions: Some(vec![self.tenant_id.to_string()]),
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -519,9 +507,6 @@ impl Endpoint {
.stdin(std::process::Stdio::null())
.stderr(logfile.try_clone()?)
.stdout(logfile);
if let Some(remote_ext_config) = remote_ext_config {
cmd.args(["--remote-ext-config", remote_ext_config]);
}
let _child = cmd.spawn()?;
// Wait for it to start

View File

@@ -364,7 +364,7 @@ impl LocalEnv {
//
// Initialize a new Neon repository
//
pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> {
// check if config already exists
let base_path = &self.base_data_dir;
ensure!(
@@ -372,29 +372,11 @@ impl LocalEnv {
"repository base path is missing"
);
if base_path.exists() {
if force {
println!("removing all contents of '{}'", base_path.display());
// instead of directly calling `remove_dir_all`, we keep the original dir but remove
// all contents inside. This helps if the developer symlinks another directory (e.g.,
// S3 local SSD) to the `.neon` base directory.
for entry in std::fs::read_dir(base_path)? {
let entry = entry?;
let path = entry.path();
if path.is_dir() {
fs::remove_dir_all(&path)?;
} else {
fs::remove_file(&path)?;
}
}
} else {
bail!(
"directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
base_path.display()
);
}
}
ensure!(
!base_path.exists(),
"directory '{}' already exists. Perhaps already initialized?",
base_path.display()
);
if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
bail!(
"Can't find postgres binary at {}",
@@ -410,9 +392,7 @@ impl LocalEnv {
}
}
if !base_path.exists() {
fs::create_dir(base_path)?;
}
fs::create_dir(base_path)?;
// Generate keypair for JWT.
//

View File

@@ -1,183 +0,0 @@
# Supporting custom user Extensions (Dynamic Extension Loading)
Created 2023-05-03
## Motivation
There are many extensions in the PostgreSQL ecosystem, and not all extensions
are of a quality that we can confidently support them. Additionally, our
current extension inclusion mechanism has several problems because we build all
extensions into the primary Compute image: We build the extensions every time
we build the compute image regardless of whether we actually need to rebuild
the image, and the inclusion of these extensions in the image adds a hard
dependency on all supported extensions - thus increasing the image size, and
with it the time it takes to download that image - increasing first start
latency.
This RFC proposes a dynamic loading mechanism that solves most of these
problems.
## Summary
`compute_ctl` is made responsible for loading dynamically loaded extensions
into the container's file system on demand, and will also make sure that the
libraries listed in `shared_preload_libraries` are downloaded before the
compute node starts.
## Components
compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
## Requirements
Compute nodes with no extra extensions should not be negatively impacted by
the existence of support for many extensions.
Installing an extension into PostgreSQL should be easy.
Non-preloaded extensions shouldn't impact startup latency.
Uninstalled extensions shouldn't impact query latency.
A small latency penalty for dynamically loaded extensions is acceptable in
the first seconds of compute startup, but not in steady-state operations.
## Proposed implementation
### On-demand, JIT-loading of extensions
Before Postgres starts, we download:
- control files for all extensions available to that compute node;
- all `shared_preload_libraries`;
After postgres is running, `compute_ctl` listens for requests to load files.
When PostgreSQL requests a file, `compute_ctl` downloads it.
PostgreSQL requests files in the following cases:
- When loading a preload library set in `local_preload_libraries`
- When explicitly loading a library with `LOAD`
- When creating an extension with `CREATE EXTENSION` (download SQL scripts, optional extension data files, and optional library files); see the sketch after this list
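
For illustration, here is a minimal sketch of the dispatch this implies, condensed from the `/extension_server` route in the `http/api.rs` hunk and the `ComputeNode` download methods shown earlier in this diff; `serve_extension_request` is a hypothetical helper, not code from the PR, and error handling is simplified:

```rust
use anyhow::Result;
use compute_tools::compute::ComputeNode;

// Hypothetical dispatch for `POST /extension_server/{filename}[?is_library=true]`.
// `LOAD 'some_lib'` maps to a library download; `CREATE EXTENSION some_ext`
// maps to downloading that extension's SQL scripts and data files.
async fn serve_extension_request(
    compute: &ComputeNode,
    filename: String,
    is_library: bool,
) -> Result<()> {
    if is_library {
        // e.g. fetch `lib/some_lib.so*` into `pg_config --pkglibdir`
        compute.download_library_file(filename).await
    } else {
        // e.g. fetch `some_ext--*.sql` (plus any data files) into
        // `<pg_config --sharedir>/extension`
        compute.download_extension_files(filename).await
    }
}
```

PostgreSQL knows where to send these requests because `write_postgres_conf` sets `neon.extension_server_port` to the `compute_ctl` HTTP port (see the `config.rs` hunk earlier in this diff).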
#### Summary
Pros:
- Startup is only as slow as it takes to load all (shared_)preload_libraries
- Supports BYO Extension
Cons:
- O(sizeof(extensions)) IO requirement for loading all extensions.
### Alternative solutions
1. Allow users to add their extensions to the base image
Pros:
- Easy to deploy
Cons:
- Doesn't scale - first start latency is dependent on image size;
- All extensions are shared across all users: It doesn't allow users to
bring their own restrictive-licensed extensions
2. Bring Your Own compute image
Pros:
- Still easy to deploy
- User can bring own patched version of PostgreSQL
Cons:
- First start latency is O(sizeof(extensions image))
- Warm instance pool for skipping pod schedule latency is not feasible with
O(n) custom images
- Support channels are difficult to manage
3. Download all user extensions in bulk on compute start
Pros:
- Easy to deploy
- No startup latency issues for "clean" users.
- Warm instance pool for skipping pod schedule latency is possible
Cons:
- Downloading all extensions in advance takes a lot of time, which causes startup latency issues
4. Store user's extensions in persistent storage
Pros:
- Easy to deploy
- No startup latency issues
- Warm instance pool for skipping pod schedule latency is possible
Cons:
- EC2 instances have only a limited number of attachments shared between EBS volumes, direct-attached NVMe drives, and ENIs.
- Compute instance migration isn't trivially solved for EBS mounts (e.g.
the device is unavailable whilst moving the mount between instances).
- EBS can only mount on one instance at a time (except the expensive IO2
device type).
5. Store user's extensions in network drive
Pros:
- Easy to deploy
- Few startup latency issues
- Warm instance pool for skipping pod schedule latency is possible
Cons:
- We'd need networked drives, and a lot of them, which would store many
duplicate extensions.
- **UNCHECKED:** Compute instance migration may not work nicely with
networked IOs
### Idea extensions
The extension store does not have to be S3 directly, but could be a Node-local
caching service on top of S3. This would reduce the load on the network for
popular extensions.
## Extension Storage implementation
Extension Storage in our case is an S3 bucket with a "directory" per build and postgres version,
where extension files are stored as plain files in the bucket, following the same directory layout as a Postgres installation, e.g.
`s3://<the-bucket>/<build-version>/<postgres-version>/lib/postgis-3.1.so`
`s3://<the-bucket>/<build-version>/<postgres-version>/share/extension/postgis.control`
`s3://<the-bucket>/<build-version>/<postgres-version>/share/extension/postgis--3.1.sql`
To handle custom extensions that are available only to specific users, we use per-extension subdirectories, e.g.
`s3://<the-bucket>/<build-version>/<postgres-version>/<custom-ext-prefix>/lib/ext-name.so`, etc.
`s3://<the-bucket>/<build-version>/<postgres-version>/<custom-ext-prefix>/share/extension/ext-name.control`, etc.
On compute start, `compute_ctl` accepts a list of `custom_ext_prefixes`.
To get the list of available extensions, `compute_ctl` downloads control files from all prefixes (see the sketch after this list):
`s3://<the-bucket>/<build-version>/<postgres-version>/share/extension/`
`s3://<the-bucket>/<build-version>/<postgres-version>/<custom-ext-prefix1>/share/extension/`
`s3://<the-bucket>/<build-version>/<postgres-version>/<custom-ext-prefix2>/share/extension/`
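As a rough sketch of how that prefix list could be assembled (the helper name `control_file_dirs` is hypothetical; in `compute_ctl` each resulting directory would feed a remote `list_files` call like the one touched elsewhere in this PR):

```rust
/// Build the "share/extension" prefixes to list: one for the public
/// extensions plus one per custom prefix granted to this compute node.
fn control_file_dirs(build_version: &str, pg_version: &str, custom_prefixes: &[String]) -> Vec<String> {
    let mut dirs = vec![format!("{build_version}/{pg_version}/share/extension/")];
    for prefix in custom_prefixes {
        dirs.push(format!("{build_version}/{pg_version}/{prefix}/share/extension/"));
    }
    dirs
}

fn main() {
    let dirs = control_file_dirs(
        "<build-version>",
        "<postgres-version>",
        &["<custom-ext-prefix1>".to_string(), "<custom-ext-prefix2>".to_string()],
    );
    for dir in dirs {
        // The real code would call `list_files(Some(&dir))` against the
        // extension bucket; here we only print the prefixes being listed.
        println!("s3://<the-bucket>/{dir}");
    }
}
```

The control files returned from all of these prefixes would then be merged into the single set of extensions visible to PostgreSQL.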
### How to add a new extension to the Extension Storage?
Simply upload the build artifacts to the S3 bucket.
Implement a CI step for that, splitting it from the compute-node-image build.
### How do we deal with extension versions and updates?
Currently, we rebuild extensions on every compute-node-image build and store them under the `<build-version>` prefix.
This is needed to ensure that `/share` and `/lib` files are in sync.
For extension updates, we rely on the PostgreSQL extension versioning mechanism (SQL update scripts) and on extension authors not to break backwards compatibility within one major version of PostgreSQL.
### Alternatives
For extensions written in trusted languages, we can also adopt the
`dbdev` PostgreSQL package manager, based on `pg_tle` by Supabase.
This would increase the number of supported extensions and decrease the amount of work required to support them.
View File
@@ -60,9 +60,6 @@ pub struct ComputeSpec {
/// If set, 'storage_auth_token' is used as the password to authenticate to
/// the pageserver and safekeepers.
pub storage_auth_token: Option<String>,
// list of prefixes to search for custom extensions in remote extension storage
pub custom_extensions: Option<Vec<String>>,
}
#[serde_as]
View File
@@ -184,20 +184,6 @@ pub enum GenericRemoteStorage {
}
impl GenericRemoteStorage {
// A function for listing all the files in a "directory"
// Example:
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
match self {
Self::LocalFs(s) => s.list_files(folder).await,
Self::AwsS3(s) => s.list_files(folder).await,
Self::Unreliable(s) => s.list_files(folder).await,
}
}
// lists common *prefixes*, if any of files
// Example:
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
pub async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
@@ -209,6 +195,14 @@ impl GenericRemoteStorage {
}
}
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
match self {
Self::LocalFs(s) => s.list_files(folder).await,
Self::AwsS3(s) => s.list_files(folder).await,
Self::Unreliable(s) => s.list_files(folder).await,
}
}
pub async fn upload(
&self,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
View File
@@ -349,17 +349,10 @@ impl RemoteStorage for S3Bucket {
/// See the doc for `RemoteStorage::list_files`
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let mut folder_name = folder
let folder_name = folder
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| self.prefix_in_bucket.clone());
// remove leading "/" if one exists
if let Some(folder_name_slash) = folder_name.clone() {
if folder_name_slash.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
folder_name = Some(folder_name_slash[1..].to_string());
}
}
// AWS may need to break the response into several parts
let mut continuation_token = None;
let mut all_files = vec![];
View File
@@ -1,22 +1,23 @@
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName};
use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use std::cmp::{max, min};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Instant;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
let mut layer_map = LayerMap::<LayerDescriptor>::default();
fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
let mut layer_map = LayerMap::default();
let mut min_lsn = Lsn(u64::MAX);
let mut max_lsn = Lsn(0);
@@ -33,7 +34,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
min_lsn = min(min_lsn, lsn_range.start);
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
updates.insert_historic(layer.layer_desc().clone());
}
println!("min: {min_lsn}, max: {max_lsn}");
@@ -43,7 +44,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
}
/// Construct a layer map query pattern for benchmarks
fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
// For each image layer we query one of the pages contained, at LSN right
// before the image layer was created. This gives us a somewhat uniform
// coverage of both the lsn and key space because image layers have
@@ -69,7 +70,7 @@ fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn
// Construct a partitioning for testing get_difficulty map when we
// don't have an exact result of `collect_keyspace` to work with.
fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
let mut parts = Vec::new();
// We add a partition boundary at the start of each image layer,
@@ -209,13 +210,15 @@ fn bench_sequential(c: &mut Criterion) {
for i in 0..100_000 {
let i32 = (i as u32) % 100;
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
let layer = LayerDescriptor {
key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
lsn: Lsn(i)..Lsn(i + 1),
is_incremental: false,
short_id: format!("Layer {}", i),
};
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
let layer = LayerDescriptor::from(PersistentLayerDesc::new_img(
TenantId::generate(),
TimelineId::generate(),
zero.add(10 * i32)..zero.add(10 * i32 + 1),
Lsn(i),
false,
0,
));
updates.insert_historic(layer.layer_desc().clone());
}
updates.flush();
println!("Finished layer map init in {:?}", now.elapsed());
View File
@@ -904,7 +904,7 @@ where
self.check_permission(Some(tenant_id))?;
let lsn = if params.len() >= 3 {
let lsn = if params.len() == 3 {
Some(
Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
View File
@@ -506,17 +506,17 @@ pub async fn shutdown_tasks(
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
}
}
let join_handle = tokio::select! {
let completed = tokio::select! {
biased;
_ = &mut join_handle => { None },
_ = &mut join_handle => { true },
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
// allow some time to elapse before logging to cut down the number of log
// lines.
info!("waiting for {} to shut down", task.name);
Some(join_handle)
false
}
};
if let Some(join_handle) = join_handle {
if !completed {
// we never handled this return value, but:
// - we don't deschedule which would lead to is_cancelled
// - panics are already logged (is_panicked)
View File
@@ -85,6 +85,7 @@ pub mod blob_io;
pub mod block_io;
pub mod disk_btree;
pub(crate) mod ephemeral_file;
pub mod layer_cache;
pub mod layer_map;
pub mod manifest;
@@ -590,6 +591,7 @@ impl Tenant {
.layers
.read()
.await
.0
.iter_historic_layers()
.next()
.is_some(),
@@ -1616,7 +1618,7 @@ impl Tenant {
// No timeout here, GC & Compaction should be responsive to the
// `TimelineState::Stopping` change.
info!("waiting for layer_removal_cs.lock()");
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
let layer_removal_guard = timeline.layer_cache.delete_guard().await;
info!("got layer_removal_cs.lock(), deleting layer files");
// NB: storage_sync upload tasks that reference these layers have been cancelled
View File
@@ -0,0 +1,146 @@
use super::storage_layer::{PersistentLayer, PersistentLayerDesc, PersistentLayerKey, RemoteLayer};
use super::Timeline;
use crate::tenant::layer_map::LayerMap;
use crate::tenant::timeline::compare_arced_layers;
use anyhow::Result;
use std::sync::{Mutex, Weak};
use std::{collections::HashMap, sync::Arc};
/// LayerCache facilitates mapping from a `PersistentLayerDesc` to the actual in-memory layer
/// object. In the future, operations that do not modify the layer map (i.e., eviction and download) will be implemented
/// here.
pub struct LayerCache {
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
pub layers_removal_lock: Arc<tokio::sync::Mutex<()>>,
/// We need this lock because we do not have any way to prevent GC/compaction from removing files that are in use.
/// Once we do reference counting on Arc to prevent that from happening, we can safely remove this lock.
pub layers_operation_lock: Arc<tokio::sync::RwLock<()>>,
/// Will be useful when we move evict / download to layer cache.
#[allow(unused)]
timeline: Weak<Timeline>,
mapping: Mutex<HashMap<PersistentLayerKey, Arc<dyn PersistentLayer>>>,
}
pub struct LayerInUseWrite(tokio::sync::OwnedRwLockWriteGuard<()>);
pub struct LayerInUseRead(tokio::sync::OwnedRwLockReadGuard<()>);
#[derive(Clone)]
pub struct LayerDeletionGuard(Arc<tokio::sync::OwnedMutexGuard<()>>);
impl LayerCache {
pub fn new(timeline: Weak<Timeline>) -> Self {
Self {
layers_operation_lock: Arc::new(tokio::sync::RwLock::new(())),
layers_removal_lock: Arc::new(tokio::sync::Mutex::new(())),
mapping: Mutex::new(HashMap::new()),
timeline,
}
}
pub fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
let guard = self.mapping.lock().unwrap();
guard.get(&desc.key()).expect("not found").clone()
}
/// This function mocks the original behavior of the `layers` lock in `Timeline`. It can be removed after we ensure
/// that we won't delete files that are being read.
pub async fn layer_in_use_write(&self) -> LayerInUseWrite {
LayerInUseWrite(self.layers_operation_lock.clone().write_owned().await)
}
/// This function mocks the original behavior of the `layers` lock in `Timeline`. It can be removed after we ensure
/// that we won't delete files that are being read.
pub async fn layer_in_use_read(&self) -> LayerInUseRead {
LayerInUseRead(self.layers_operation_lock.clone().read_owned().await)
}
/// Ensures only one of compaction / gc can happen at a time.
pub async fn delete_guard(&self) -> LayerDeletionGuard {
LayerDeletionGuard(Arc::new(
self.layers_removal_lock.clone().lock_owned().await,
))
}
/// Should only be called when initializing the timeline. Bypass checks and layer operation lock.
pub fn remove_local_when_init(&self, layer: Arc<dyn PersistentLayer>) {
let mut guard = self.mapping.lock().unwrap();
guard.remove(&layer.layer_desc().key());
}
/// Should only be called when initializing the timeline. Bypass checks and layer operation lock.
pub fn populate_remote_when_init(&self, layer: Arc<RemoteLayer>) {
let mut guard = self.mapping.lock().unwrap();
guard.insert(layer.layer_desc().key(), layer);
}
/// Should only be called when initializing the timeline. Bypass checks and layer operation lock.
pub fn populate_local_when_init(&self, layer: Arc<dyn PersistentLayer>) {
let mut guard = self.mapping.lock().unwrap();
guard.insert(layer.layer_desc().key(), layer);
}
/// Called within the read path.
pub fn replace_and_verify(
&self,
expected: Arc<dyn PersistentLayer>,
new: Arc<dyn PersistentLayer>,
) -> Result<()> {
let mut guard = self.mapping.lock().unwrap();
let key = expected.layer_desc().key();
let other = new.layer_desc().key();
let expected_l0 = LayerMap::is_l0(expected.layer_desc());
let new_l0 = LayerMap::is_l0(new.layer_desc());
fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
"layermap-replace-notfound"
));
anyhow::ensure!(
key == other,
"replacing downloaded layer into layermap failed because two layers have different keys: {key:?} != {other:?}"
);
anyhow::ensure!(
expected_l0 == new_l0,
"replacing downloaded layer into layermap failed because one layer is l0 while the other is not: {expected_l0} != {new_l0}"
);
if let Some(layer) = guard.get_mut(&expected.layer_desc().key()) {
anyhow::ensure!(
compare_arced_layers(&expected, layer),
"replacing downloaded layer into layermap failed because another layer was found instead of expected, expected={expected:?}, new={new:?}",
expected = Arc::as_ptr(&expected),
new = Arc::as_ptr(layer),
);
*layer = new;
Ok(())
} else {
anyhow::bail!(
"replacing downloaded layer into layermap failed because layer was not found"
);
}
}
/// Called within the write path. During compaction and image layer creation we will create new layers.
pub fn create_new_layer(&self, layer: Arc<dyn PersistentLayer>) {
let mut guard = self.mapping.lock().unwrap();
guard.insert(layer.layer_desc().key(), layer);
}
/// Called within the write path. During GC and compaction we will remove layers and delete them on disk.
/// The logic to delete the files will move here later.
pub fn delete_layer(&self, layer: Arc<dyn PersistentLayer>) {
let mut guard = self.mapping.lock().unwrap();
guard.remove(&layer.layer_desc().key());
}
}
View File
@@ -51,25 +51,23 @@ use crate::keyspace::KeyPartitioning;
use crate::repository::Key;
use crate::tenant::storage_layer::InMemoryLayer;
use crate::tenant::storage_layer::Layer;
use anyhow::Context;
use anyhow::Result;
use std::collections::HashMap;
use std::collections::VecDeque;
use std::ops::Range;
use std::sync::Arc;
use utils::lsn::Lsn;
use historic_layer_coverage::BufferedHistoricLayerCoverage;
pub use historic_layer_coverage::Replacement;
pub use historic_layer_coverage::{LayerKey, Replacement};
use super::storage_layer::range_eq;
use super::storage_layer::PersistentLayerDesc;
use super::storage_layer::PersistentLayerKey;
///
/// LayerMap tracks what layers exist on a timeline.
///
pub struct LayerMap<L: ?Sized> {
#[derive(Default)]
pub struct LayerMap {
//
// 'open_layer' holds the current InMemoryLayer that is accepting new
// records. If it is None, 'next_open_layer_at' will be set instead, indicating
@@ -95,24 +93,6 @@ pub struct LayerMap<L: ?Sized> {
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
/// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and
/// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and
/// RemoteLayer will be removed.
mapping: HashMap<PersistentLayerKey, Arc<L>>,
}
impl<L: ?Sized> Default for LayerMap<L> {
fn default() -> Self {
Self {
open_layer: None,
next_open_layer_at: None,
frozen_layers: VecDeque::default(),
l0_delta_layers: Vec::default(),
historic: BufferedHistoricLayerCoverage::default(),
mapping: HashMap::default(),
}
}
}
/// The primary update API for the layer map.
@@ -120,24 +100,21 @@ impl<L: ?Sized> Default for LayerMap<L> {
/// Batching historic layer insertions and removals is good for
/// performance and this struct helps us do that correctly.
#[must_use]
pub struct BatchedUpdates<'a, L: ?Sized + Layer> {
pub struct BatchedUpdates<'a> {
// While we hold this exclusive reference to the layer map the type checker
// will prevent us from accidentally reading any unflushed updates.
layer_map: &'a mut LayerMap<L>,
layer_map: &'a mut LayerMap,
}
/// Provide ability to batch more updates while hiding the read
/// API so we don't accidentally read without flushing.
impl<L> BatchedUpdates<'_, L>
where
L: ?Sized + Layer,
{
impl BatchedUpdates<'_> {
///
/// Insert an on-disk layer.
///
// TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
self.layer_map.insert_historic_noflush(layer_desc, layer)
pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc) {
self.layer_map.insert_historic_noflush(layer_desc)
}
///
@@ -145,31 +122,8 @@ where
///
/// This should be called when the corresponding file on disk has been deleted.
///
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
self.layer_map.remove_historic_noflush(layer_desc, layer)
}
/// Replaces existing layer iff it is the `expected`.
///
/// If the expected layer has been removed it will not be inserted by this function.
///
/// Returned `Replacement` describes succeeding in replacement or the reason why it could not
/// be done.
///
/// TODO replacement can be done without buffering and rebuilding layer map updates.
/// One way to do that is to add a layer of indirection for returned values, so
/// that we can replace values only by updating a hashmap.
pub fn replace_historic(
&mut self,
expected_desc: PersistentLayerDesc,
expected: &Arc<L>,
new_desc: PersistentLayerDesc,
new: Arc<L>,
) -> anyhow::Result<Replacement<Arc<L>>> {
fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
self.layer_map
.replace_historic_noflush(expected_desc, expected, new_desc, new)
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
self.layer_map.remove_historic_noflush(layer_desc)
}
// We will flush on drop anyway, but this method makes it
@@ -185,25 +139,19 @@ where
// than panic later or read without flushing.
//
// TODO maybe warn if flush hasn't explicitly been called
impl<L> Drop for BatchedUpdates<'_, L>
where
L: ?Sized + Layer,
{
impl Drop for BatchedUpdates<'_> {
fn drop(&mut self) {
self.layer_map.flush_updates();
}
}
/// Return value of LayerMap::search
pub struct SearchResult<L: ?Sized> {
pub layer: Arc<L>,
pub struct SearchResult {
pub layer: Arc<PersistentLayerDesc>,
pub lsn_floor: Lsn,
}
impl<L> LayerMap<L>
where
L: ?Sized + Layer,
{
impl LayerMap {
///
/// Find the latest layer (by lsn.end) that covers the given
/// 'key', with lsn.start < 'end_lsn'.
@@ -235,7 +183,7 @@ where
/// NOTE: This only searches the 'historic' layers, *not* the
/// 'open' and 'frozen' layers!
///
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult> {
let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
let latest_delta = version.delta_coverage.query(key.to_i128());
let latest_image = version.image_coverage.query(key.to_i128());
@@ -244,7 +192,6 @@ where
(None, None) => None,
(None, Some(image)) => {
let lsn_floor = image.get_lsn_range().start;
let image = self.get_layer_from_mapping(&image.key()).clone();
Some(SearchResult {
layer: image,
lsn_floor,
@@ -252,7 +199,6 @@ where
}
(Some(delta), None) => {
let lsn_floor = delta.get_lsn_range().start;
let delta = self.get_layer_from_mapping(&delta.key()).clone();
Some(SearchResult {
layer: delta,
lsn_floor,
@@ -263,7 +209,6 @@ where
let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
let image_exact_match = img_lsn + 1 == end_lsn;
if image_is_newer || image_exact_match {
let image = self.get_layer_from_mapping(&image.key()).clone();
Some(SearchResult {
layer: image,
lsn_floor: img_lsn,
@@ -271,7 +216,6 @@ where
} else {
let lsn_floor =
std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
let delta = self.get_layer_from_mapping(&delta.key()).clone();
Some(SearchResult {
layer: delta,
lsn_floor,
@@ -282,7 +226,7 @@ where
}
/// Start a batch of updates, applied on drop
pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> {
pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
BatchedUpdates { layer_map: self }
}
@@ -292,48 +236,32 @@ where
/// Helper function for BatchedUpdates::insert_historic
///
/// TODO(chi): remove L generic so that we do not need to pass layer object.
pub(self) fn insert_historic_noflush(
&mut self,
layer_desc: PersistentLayerDesc,
layer: Arc<L>,
) {
self.mapping.insert(layer_desc.key(), layer.clone());
pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
if Self::is_l0(&layer) {
if Self::is_l0(&layer_desc) {
self.l0_delta_layers.push(layer_desc.clone().into());
}
self.historic.insert(
historic_layer_coverage::LayerKey::from(&*layer),
historic_layer_coverage::LayerKey::from(&layer_desc),
layer_desc.into(),
);
}
fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc<L> {
let layer = self
.mapping
.get(key)
.with_context(|| format!("{key:?}"))
.expect("inconsistent layer mapping");
layer
}
///
/// Remove an on-disk layer from the map.
///
/// Helper function for BatchedUpdates::remove_historic
///
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
self.historic
.remove(historic_layer_coverage::LayerKey::from(&*layer));
if Self::is_l0(&layer) {
.remove(historic_layer_coverage::LayerKey::from(&layer_desc));
let layer_key = layer_desc.key();
if Self::is_l0(&layer_desc) {
let len_before = self.l0_delta_layers.len();
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
l0_delta_layers.retain(|other| {
!Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer)
});
l0_delta_layers.retain(|other| other.key() != layer_key);
self.l0_delta_layers = l0_delta_layers;
// this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
// there's a chance that the comparison fails at runtime due to it comparing (pointer,
@@ -344,69 +272,6 @@ where
"failed to locate removed historic layer from l0_delta_layers"
);
}
self.mapping.remove(&layer_desc.key());
}
pub(self) fn replace_historic_noflush(
&mut self,
expected_desc: PersistentLayerDesc,
expected: &Arc<L>,
new_desc: PersistentLayerDesc,
new: Arc<L>,
) -> anyhow::Result<Replacement<Arc<L>>> {
let key = historic_layer_coverage::LayerKey::from(&**expected);
let other = historic_layer_coverage::LayerKey::from(&*new);
let expected_l0 = Self::is_l0(expected);
let new_l0 = Self::is_l0(&new);
anyhow::ensure!(
key == other,
"expected and new must have equal LayerKeys: {key:?} != {other:?}"
);
anyhow::ensure!(
expected_l0 == new_l0,
"expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}"
);
let l0_index = if expected_l0 {
// find the index in case replace worked, we need to replace that as well
let pos = self.l0_delta_layers.iter().position(|slot| {
Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected)
});
if pos.is_none() {
return Ok(Replacement::NotFound);
}
pos
} else {
None
};
let new_desc = Arc::new(new_desc);
let replaced = self.historic.replace(&key, new_desc.clone(), |existing| {
**existing == expected_desc
});
if let Replacement::Replaced { .. } = &replaced {
self.mapping.remove(&expected_desc.key());
self.mapping.insert(new_desc.key(), new);
if let Some(index) = l0_index {
self.l0_delta_layers[index] = new_desc;
}
}
let replaced = match replaced {
Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered },
Replacement::NotFound => Replacement::NotFound,
Replacement::RemovalBuffered => Replacement::RemovalBuffered,
Replacement::Unexpected(x) => {
Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone())
}
};
Ok(replaced)
}
/// Helper function for BatchedUpdates::drop.
@@ -454,10 +319,8 @@ where
Ok(true)
}
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
self.historic
.iter()
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
self.historic.iter()
}
///
@@ -472,7 +335,7 @@ where
&self,
key_range: &Range<Key>,
lsn: Lsn,
) -> Result<Vec<(Range<Key>, Option<Arc<L>>)>> {
) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
let version = match self.historic.get().unwrap().get_version(lsn.0) {
Some(v) => v,
None => return Ok(vec![]),
@@ -482,36 +345,26 @@ where
let end = key_range.end.to_i128();
// Initialize loop variables
let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
let mut coverage: Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> = vec![];
let mut current_key = start;
let mut current_val = version.image_coverage.query(start);
// Loop through the change events and push intervals
for (change_key, change_val) in version.image_coverage.range(start..end) {
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
coverage.push((
kr,
current_val
.take()
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
));
coverage.push((kr, current_val.take()));
current_key = change_key;
current_val = change_val.clone();
}
// Add the final interval
let kr = Key::from_i128(current_key)..Key::from_i128(end);
coverage.push((
kr,
current_val
.take()
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
));
coverage.push((kr, current_val.take()));
Ok(coverage)
}
pub fn is_l0(layer: &L) -> bool {
pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX))
}
@@ -537,7 +390,7 @@ where
/// TODO The optimal number should probably be slightly higher than 1, but to
/// implement that we need to plumb a lot more context into this function
/// than just the current partition_range.
pub fn is_reimage_worthy(layer: &L, partition_range: &Range<Key>) -> bool {
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
// Case 1
if !Self::is_l0(layer) {
return true;
@@ -595,9 +448,7 @@ where
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
let lr = lsn.start..val.get_lsn_range().start;
if !kr.is_empty() {
let base_count =
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
as usize;
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath =
self.count_deltas(&kr, &lr, new_limit)?;
@@ -620,9 +471,7 @@ where
let lr = lsn.start..val.get_lsn_range().start;
if !kr.is_empty() {
let base_count =
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
as usize;
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
max_stacked_deltas = std::cmp::max(
@@ -772,12 +621,8 @@ where
}
/// Return all L0 delta layers
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
Ok(self
.l0_delta_layers
.iter()
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
.collect())
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<PersistentLayerDesc>>> {
Ok(self.l0_delta_layers.to_vec())
}
/// debugging function to print out the contents of the layer map
@@ -802,97 +647,48 @@ where
println!("End dump LayerMap");
Ok(())
}
/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
///
/// Returns `true` if the two `Arc` point to the same layer, false otherwise.
#[inline(always)]
pub fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
// "dyn Trait" objects are "fat pointers" in that they have two components:
// - pointer to the object
// - pointer to the vtable
//
// rust does not provide a guarantee that these vtables are unique, but however
// `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
// pointer and the vtable need to be equal.
//
// See: https://github.com/rust-lang/rust/issues/103763
//
// A future version of rust will most likely use this form below, where we cast each
// pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it
// not affect the comparison.
//
// See: https://github.com/rust-lang/rust/pull/106450
let left = Arc::as_ptr(left) as *const ();
let right = Arc::as_ptr(right) as *const ();
left == right
}
}
#[cfg(test)]
mod tests {
use super::{LayerMap, Replacement};
use crate::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
use super::LayerMap;
use crate::tenant::storage_layer::{tests::LayerDescriptor, LayerFileName};
use std::str::FromStr;
use std::sync::Arc;
mod l0_delta_layers_updated {
use crate::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
use super::*;
#[test]
fn for_full_range_delta() {
// l0_delta_layers are used by compaction, and should observe all buffered updates
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
true
)
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
true
)
}
#[test]
fn for_non_full_range_delta() {
// has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
// because not full range
false
)
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
// because not full range
false
)
}
#[test]
fn for_image() {
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
// code only checks if it is a full range layer, doesn't care about images, which must
// mean we should in practice never have full range images
false
)
}
#[test]
fn replacing_missing_l0_is_notfound() {
// original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
// however only happen for precondition failures.
let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
let layer = LayerFileName::from_str(layer).unwrap();
let layer = LayerDescriptor::from(layer);
// same skeletan construction; see scenario below
let not_found = Arc::new(layer.clone());
let new_version = Arc::new(layer);
let mut map = LayerMap::default();
let res = map.batch_update().replace_historic(
not_found.get_persistent_layer_desc(),
&not_found,
new_version.get_persistent_layer_desc(),
new_version,
);
assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
// code only checks if it is a full range layer, doesn't care about images, which must
// mean we should in practice never have full range images
false
)
}
fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
@@ -906,46 +702,31 @@ mod tests {
// two disjoint Arcs in different lifecycle phases. even if it seems they must be the
// same layer, we use LayerMap::compare_arced_layers as the identity of layers.
assert!(!LayerMap::compare_arced_layers(&remote, &downloaded));
assert_eq!(remote.layer_desc(), downloaded.layer_desc());
let expected_in_counts = (1, usize::from(expected_l0));
map.batch_update()
.insert_historic(remote.get_persistent_layer_desc(), remote.clone());
assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
let replaced = map
.batch_update()
.replace_historic(
remote.get_persistent_layer_desc(),
&remote,
downloaded.get_persistent_layer_desc(),
downloaded.clone(),
)
.expect("name derived attributes are the same");
assert!(
matches!(replaced, Replacement::Replaced { .. }),
"{replaced:?}"
.insert_historic(remote.layer_desc().clone());
assert_eq!(
count_layer_in(&map, remote.layer_desc()),
expected_in_counts
);
assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);
map.batch_update()
.remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone());
assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
.remove_historic(downloaded.layer_desc().clone());
assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
}
fn count_layer_in<L: Layer + ?Sized>(map: &LayerMap<L>, layer: &Arc<L>) -> (usize, usize) {
fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
let historic = map
.iter_historic_layers()
.filter(|x| LayerMap::compare_arced_layers(x, layer))
.filter(|x| x.key() == layer.key())
.count();
let l0s = map
.get_level0_deltas()
.expect("why does this return a result");
let l0 = l0s
.iter()
.filter(|x| LayerMap::compare_arced_layers(x, layer))
.count();
let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
(historic, l0)
}
View File
@@ -3,6 +3,8 @@ use std::ops::Range;
use tracing::info;
use crate::tenant::storage_layer::PersistentLayerDesc;
use super::layer_coverage::LayerCoverageTuple;
/// Layers in this module are identified and indexed by this data.
@@ -53,6 +55,18 @@ impl<'a, L: crate::tenant::storage_layer::Layer + ?Sized> From<&'a L> for LayerK
}
}
impl From<&PersistentLayerDesc> for LayerKey {
fn from(layer: &PersistentLayerDesc) -> Self {
let kr = layer.get_key_range();
let lr = layer.get_lsn_range();
LayerKey {
key: kr.start.to_i128()..kr.end.to_i128(),
lsn: lr.start.0..lr.end.0,
is_image: !layer.is_incremental(),
}
}
}
/// Efficiently queryable layer coverage for each LSN.
///
/// Allows answering layer map queries very efficiently,
@@ -467,6 +481,11 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
///
/// Returns a `Replacement` value describing the outcome; only the case of
/// `Replacement::Replaced` modifies the map and requires a rebuild.
///
/// This function is unlikely to be used in the future because LayerMap now only records the
/// layer descriptors. Therefore, anything added to the layer map will only be removed or
/// added, and never replaced.
#[cfg(test)]
pub fn replace<F>(
&mut self,
layer_key: &LayerKey,
View File
@@ -176,13 +176,10 @@ impl LayerAccessStats {
/// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
///
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
pub(crate) fn for_loading_layer<L>(
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
pub(crate) fn for_loading_layer(
layer_map_lock_held_witness: &BatchedUpdates<'_>,
status: LayerResidenceStatus,
) -> Self
where
L: ?Sized + Layer,
{
) -> Self {
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
new.record_residence_event(
layer_map_lock_held_witness,
@@ -197,14 +194,11 @@ impl LayerAccessStats {
/// The `new_status` is not recorded in `self`.
///
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
pub(crate) fn clone_for_residence_change<L>(
pub(crate) fn clone_for_residence_change(
&self,
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
layer_map_lock_held_witness: &BatchedUpdates<'_>,
new_status: LayerResidenceStatus,
) -> LayerAccessStats
where
L: ?Sized + Layer,
{
) -> LayerAccessStats {
let clone = {
let inner = self.0.lock().unwrap();
inner.clone()
@@ -232,14 +226,12 @@ impl LayerAccessStats {
/// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
/// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
///
pub(crate) fn record_residence_event<L>(
pub(crate) fn record_residence_event(
&self,
_layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
_layer_map_lock_held_witness: &BatchedUpdates<'_>,
status: LayerResidenceStatus,
reason: LayerResidenceEventReason,
) where
L: ?Sized + Layer,
{
) {
let mut locked = self.0.lock().unwrap();
locked.iter_mut().for_each(|inner| {
inner
@@ -473,94 +465,125 @@ pub fn downcast_remote_layer(
}
}
/// Holds metadata about a layer without any content. Used mostly for testing.
///
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
/// LayerDescriptor.
#[derive(Clone, Debug)]
pub struct LayerDescriptor {
pub key: Range<Key>,
pub lsn: Range<Lsn>,
pub is_incremental: bool,
pub short_id: String,
}
pub mod tests {
use super::*;
impl LayerDescriptor {
/// `LayerDescriptor` is only used for testing purpose so it does not matter whether it is image / delta,
/// and the tenant / timeline id does not matter.
pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc {
PersistentLayerDesc::new_delta(
TenantId::from_array([0; 16]),
TimelineId::from_array([0; 16]),
self.key.clone(),
self.lsn.clone(),
233,
)
}
}
impl Layer for LayerDescriptor {
fn get_key_range(&self) -> Range<Key> {
self.key.clone()
/// Holds metadata about a layer without any content. Used mostly for testing.
///
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
/// LayerDescriptor.
#[derive(Clone, Debug)]
pub struct LayerDescriptor {
base: PersistentLayerDesc,
}
fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn.clone()
}
fn is_incremental(&self) -> bool {
self.is_incremental
}
fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
_reconstruct_data: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
todo!("This method shouldn't be part of the Layer trait")
}
fn short_id(&self) -> String {
self.short_id.clone()
}
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
todo!()
}
}
impl From<DeltaFileName> for LayerDescriptor {
fn from(value: DeltaFileName) -> Self {
let short_id = value.to_string();
LayerDescriptor {
key: value.key_range,
lsn: value.lsn_range,
is_incremental: true,
short_id,
impl From<PersistentLayerDesc> for LayerDescriptor {
fn from(base: PersistentLayerDesc) -> Self {
Self { base }
}
}
}
impl From<ImageFileName> for LayerDescriptor {
fn from(value: ImageFileName) -> Self {
let short_id = value.to_string();
let lsn = value.lsn_as_range();
LayerDescriptor {
key: value.key_range,
lsn,
is_incremental: false,
short_id,
impl Layer for LayerDescriptor {
fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
_reconstruct_data: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
todo!("This method shouldn't be part of the Layer trait")
}
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
todo!()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_key_range(&self) -> Range<Key> {
self.layer_desc().key_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_lsn_range(&self) -> Range<Lsn> {
self.layer_desc().lsn_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn short_id(&self) -> String {
self.layer_desc().short_id()
}
}
}
impl From<LayerFileName> for LayerDescriptor {
fn from(value: LayerFileName) -> Self {
match value {
LayerFileName::Delta(d) => Self::from(d),
LayerFileName::Image(i) => Self::from(i),
impl PersistentLayer for LayerDescriptor {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.base
}
fn local_path(&self) -> Option<PathBuf> {
unimplemented!()
}
fn iter(&self, _: &RequestContext) -> Result<LayerIter<'_>> {
unimplemented!()
}
fn key_iter(&self, _: &RequestContext) -> Result<LayerKeyIter<'_>> {
unimplemented!()
}
fn delete_resident_layer_file(&self) -> Result<()> {
unimplemented!()
}
fn info(&self, _: LayerAccessStatsReset) -> HistoricLayerInfo {
unimplemented!()
}
fn access_stats(&self) -> &LayerAccessStats {
unimplemented!()
}
}
impl From<DeltaFileName> for LayerDescriptor {
fn from(value: DeltaFileName) -> Self {
LayerDescriptor {
base: PersistentLayerDesc::new_delta(
TenantId::from_array([0; 16]),
TimelineId::from_array([0; 16]),
value.key_range,
value.lsn_range,
233,
),
}
}
}
impl From<ImageFileName> for LayerDescriptor {
fn from(value: ImageFileName) -> Self {
LayerDescriptor {
base: PersistentLayerDesc::new_img(
TenantId::from_array([0; 16]),
TimelineId::from_array([0; 16]),
value.key_range,
value.lsn,
false,
233,
),
}
}
}
impl From<LayerFileName> for LayerDescriptor {
fn from(value: LayerFileName) -> Self {
match value {
LayerFileName::Delta(d) => Self::from(d),
LayerFileName::Image(i) => Self::from(i),
}
}
}
}
View File
@@ -218,15 +218,12 @@ impl RemoteLayer {
}
/// Create a Layer struct representing this layer, after it has been downloaded.
pub fn create_downloaded_layer<L>(
pub fn create_downloaded_layer(
&self,
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
layer_map_lock_held_witness: &BatchedUpdates<'_>,
conf: &'static PageServerConf,
file_size: u64,
) -> Arc<dyn PersistentLayer>
where
L: ?Sized + Layer,
{
) -> Arc<dyn PersistentLayer> {
if self.desc.is_delta {
let fname = self.desc.delta_file_name();
Arc::new(DeltaLayer::new(
View File
@@ -3,7 +3,7 @@
mod eviction_task;
mod walreceiver;
use anyhow::{anyhow, bail, ensure, Context};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::Bytes;
use fail::fail_point;
use futures::StreamExt;
@@ -82,10 +82,13 @@ use self::eviction_task::EvictionTaskTimelineState;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::layer_cache::{LayerCache, LayerDeletionGuard};
use super::layer_map::BatchedUpdates;
use super::remote_timeline_client::index::IndexPart;
use super::remote_timeline_client::RemoteTimelineClient;
use super::storage_layer::{DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset};
use super::storage_layer::{
DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(super) enum FlushLoopState {
@@ -118,8 +121,28 @@ impl PartialOrd for Hole {
}
}
pub struct LayerFileManager(());
impl LayerFileManager {
pub(crate) fn new() -> Self {
Self(())
}
}
/// Temporary function for the immutable storage state refactor; ensures we are dropping the lock guard and not something else.
/// Can be removed after all refactors are done.
fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
drop(rlock)
}
/// Temporary function for the immutable storage state refactor; ensures we are dropping the lock guard and not something else.
/// Can be removed after all refactors are done.
fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
drop(rlock)
}
pub struct Timeline {
conf: &'static PageServerConf,
pub(super) conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<TenantConfOpt>>,
myself: Weak<Self>,
@@ -129,7 +152,9 @@ pub struct Timeline {
pub pg_version: u32,
pub(crate) layers: Arc<tokio::sync::RwLock<LayerMap<dyn PersistentLayer>>>,
pub(crate) layers: Arc<tokio::sync::RwLock<(LayerMap, LayerFileManager)>>,
pub(super) layer_cache: LayerCache,
/// Set of key ranges which should be covered by image layers to
/// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
@@ -202,13 +227,6 @@ pub struct Timeline {
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
// Needed to ensure that we can't create a branch at a point that was already garbage collected
pub latest_gc_cutoff_lsn: Rcu<Lsn>,
@@ -599,7 +617,8 @@ impl Timeline {
/// This method makes no distinction between local and remote layers.
/// Hence, the result **does not represent local filesystem usage**.
pub async fn layer_size_sum(&self) -> u64 {
let layer_map = self.layers.read().await;
let guard = self.layers.read().await;
let (layer_map, _) = &*guard;
let mut size = 0;
for l in layer_map.iter_historic_layers() {
size += l.file_size();
@@ -819,7 +838,7 @@ impl Timeline {
// Below are functions compact_level0() and create_image_layers()
// but they are a bit ad hoc and don't quite work like it's explained
// above. Rewrite it.
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
let layer_removal_cs = self.layer_cache.delete_guard().await;
// Is the timeline being deleted?
if self.is_stopping() {
return Err(anyhow::anyhow!("timeline is Stopping").into());
@@ -909,7 +928,8 @@ impl Timeline {
pub async fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
let last_lsn = self.get_last_record_lsn();
let open_layer_size = {
let layers = self.layers.read().await;
let guard = self.layers.read().await;
let (layers, _) = &*guard;
let Some(open_layer) = layers.open_layer.as_ref() else {
return Ok(());
};
@@ -1040,7 +1060,8 @@ impl Timeline {
}
pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
let layer_map = self.layers.read().await;
let guard = self.layers.read().await;
let (layer_map, _) = &*guard;
let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
if let Some(open_layer) = &layer_map.open_layer {
in_memory_layers.push(open_layer.info());
@@ -1051,6 +1072,7 @@ impl Timeline {
let mut historic_layers = Vec::new();
for historic_layer in layer_map.iter_historic_layers() {
let historic_layer = self.layer_cache.get_from_desc(&historic_layer);
historic_layers.push(historic_layer.info(reset));
}
@@ -1148,7 +1170,7 @@ impl Timeline {
.context("wait for layer upload ops to complete")?;
// now lock out layer removal (compaction, gc, timeline deletion)
let layer_removal_guard = self.layer_removal_cs.lock().await;
let layer_removal_guard = self.layer_cache.delete_guard().await;
{
// to avoid racing with detach and delete_timeline
@@ -1160,7 +1182,8 @@ impl Timeline {
}
// start the batch update
let mut layer_map = self.layers.write().await;
let mut guard = self.layers.write().await;
let (layer_map, _) = &mut *guard;
let mut batch_updates = layer_map.batch_update();
let mut results = Vec::with_capacity(layers_to_evict.len());
@@ -1176,7 +1199,7 @@ impl Timeline {
// commit the updates & release locks
batch_updates.flush();
drop(layer_map);
drop_wlock(guard);
drop(layer_removal_guard);
assert_eq!(results.len(), layers_to_evict.len());
@@ -1185,12 +1208,10 @@ impl Timeline {
fn evict_layer_batch_impl(
&self,
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
_layer_removal_cs: &LayerDeletionGuard,
local_layer: &Arc<dyn PersistentLayer>,
batch_updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
batch_updates: &mut BatchedUpdates<'_>,
) -> anyhow::Result<bool> {
use super::layer_map::Replacement;
if local_layer.is_remote_layer() {
// TODO(issue #3851): consider returning an err here instead of false,
// which is the same out the match later
@@ -1238,13 +1259,13 @@ impl Timeline {
),
});
let replaced = match batch_updates.replace_historic(
local_layer.layer_desc().clone(),
local_layer,
new_remote_layer.layer_desc().clone(),
new_remote_layer,
)? {
Replacement::Replaced { .. } => {
assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc());
let succeed = match self
.layer_cache
.replace_and_verify(local_layer.clone(), new_remote_layer)
{
Ok(()) => {
if let Err(e) = local_layer.delete_resident_layer_file() {
error!("failed to remove layer file on evict after replacement: {e:#?}");
}
@@ -1277,24 +1298,17 @@ impl Timeline {
true
}
Replacement::NotFound => {
debug!(evicted=?local_layer, "layer was no longer in layer map");
false
}
Replacement::RemovalBuffered => {
unreachable!("not doing anything else in this batch")
}
Replacement::Unexpected(other) => {
error!(
local_layer.ptr=?Arc::as_ptr(local_layer),
other.ptr=?Arc::as_ptr(&other),
?other,
"failed to replace");
Err(err) => {
if cfg!(debug_assertions) {
panic!("failed to replace: {err}, evicted: {local_layer:?}");
} else {
error!(evicted=?local_layer, "failed to replace: {err}");
}
false
}
};
Ok(replaced)
Ok(succeed)
}
}
@@ -1418,7 +1432,11 @@ impl Timeline {
timeline_id,
tenant_id,
pg_version,
layers: Arc::new(tokio::sync::RwLock::new(LayerMap::default())),
layers: Arc::new(tokio::sync::RwLock::new((
LayerMap::default(),
LayerFileManager::new(),
))),
layer_cache: LayerCache::new(myself.clone()),
wanted_image_layers: Mutex::new(None),
walredo_mgr,
@@ -1454,7 +1472,6 @@ impl Timeline {
layer_flush_done_tx,
write_lock: tokio::sync::Mutex::new(()),
layer_removal_cs: Default::default(),
gc_info: std::sync::RwLock::new(GcInfo {
retain_lsns: Vec::new(),
@@ -1602,14 +1619,15 @@ impl Timeline {
let mut layers = self.layers.try_write().expect(
"in the context where we call this function, no other task has access to the object",
);
layers.next_open_layer_at = Some(Lsn(start_lsn.0));
layers.0.next_open_layer_at = Some(Lsn(start_lsn.0));
}
///
/// Scan the timeline directory to populate the layer map.
///
pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
let mut layers = self.layers.write().await;
let mut guard = self.layers.write().await;
let (layers, _) = &mut *guard;
let mut updates = layers.batch_update();
let mut num_layers = 0;
@@ -1652,7 +1670,8 @@ impl Timeline {
trace!("found layer {}", layer.path().display());
total_physical_size += file_size;
updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer));
updates.insert_historic(layer.layer_desc().clone());
self.layer_cache.populate_local_when_init(Arc::new(layer));
num_layers += 1;
} else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
// Create a DeltaLayer struct for each delta file.
@@ -1684,7 +1703,8 @@ impl Timeline {
trace!("found layer {}", layer.path().display());
total_physical_size += file_size;
updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer));
updates.insert_historic(layer.layer_desc().clone());
self.layer_cache.populate_local_when_init(Arc::new(layer));
num_layers += 1;
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
// ignore these
@@ -1738,7 +1758,8 @@ impl Timeline {
// We're holding a layer map lock for a while but this
// method is only called during init so it's fine.
let mut layer_map = self.layers.write().await;
let mut guard = self.layers.write().await;
let (layer_map, _) = &mut *guard;
let mut updates = layer_map.batch_update();
for remote_layer_name in &index_part.timeline_layers {
let local_layer = local_only_layers.remove(remote_layer_name);
@@ -1783,7 +1804,8 @@ impl Timeline {
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
} else {
self.metrics.resident_physical_size_gauge.sub(local_size);
updates.remove_historic(local_layer.layer_desc().clone(), local_layer);
updates.remove_historic(local_layer.layer_desc().clone());
self.layer_cache.remove_local_when_init(local_layer);
// fall-through to adding the remote layer
}
} else {
@@ -1822,7 +1844,8 @@ impl Timeline {
);
let remote_layer = Arc::new(remote_layer);
updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer);
updates.insert_historic(remote_layer.layer_desc().clone());
self.layer_cache.populate_remote_when_init(remote_layer);
}
LayerFileName::Delta(deltafilename) => {
// Create a RemoteLayer for the delta file.
@@ -1849,7 +1872,8 @@ impl Timeline {
),
);
let remote_layer = Arc::new(remote_layer);
updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer);
updates.insert_historic(remote_layer.layer_desc().clone());
self.layer_cache.populate_remote_when_init(remote_layer);
}
}
}
@@ -1888,13 +1912,14 @@ impl Timeline {
let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn();
let local_layers = self
.layers
.read()
.await
.iter_historic_layers()
.map(|l| (l.filename(), l))
.collect::<HashMap<_, _>>();
let local_layers = {
let guard = self.layers.read().await;
let (layers, _) = &*guard;
layers
.iter_historic_layers()
.map(|l| (l.filename(), self.layer_cache.get_from_desc(&l)))
.collect::<HashMap<_, _>>()
};
// If no writes happen, new branches do not have any layers, only the metadata file.
let has_local_layers = !local_layers.is_empty();
@@ -2265,10 +2290,12 @@ impl Timeline {
}
async fn find_layer(&self, layer_file_name: &str) -> Option<Arc<dyn PersistentLayer>> {
for historic_layer in self.layers.read().await.iter_historic_layers() {
let guard = self.layers.read().await;
let (layers, _) = &*guard;
for historic_layer in layers.iter_historic_layers() {
let historic_layer_name = historic_layer.filename().file_name();
if layer_file_name == historic_layer_name {
return Some(historic_layer);
return Some(self.layer_cache.get_from_desc(&historic_layer));
}
}
@@ -2280,10 +2307,11 @@ impl Timeline {
fn delete_historic_layer(
&self,
// we cannot remove layers otherwise, since gc and compaction will race
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
layer: Arc<dyn PersistentLayer>,
updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
_layer_removal_cs: LayerDeletionGuard,
layer: Arc<PersistentLayerDesc>,
updates: &mut BatchedUpdates<'_>,
) -> anyhow::Result<()> {
let layer = self.layer_cache.get_from_desc(&layer);
if !layer.is_remote_layer() {
layer.delete_resident_layer_file()?;
let layer_file_size = layer.file_size();
@@ -2297,7 +2325,8 @@ impl Timeline {
// won't be needed for page reconstruction for this timeline,
// and mark what we can't delete yet as deleted from the layer
// map index without actually rebuilding the index.
updates.remove_historic(layer.layer_desc().clone(), layer);
updates.remove_historic(layer.layer_desc().clone());
self.layer_cache.delete_layer(layer);
Ok(())
}
@@ -2482,7 +2511,8 @@ impl Timeline {
#[allow(clippy::never_loop)] // see comment at bottom of this loop
'layer_map_search: loop {
let remote_layer = {
let layers = timeline.layers.read().await;
let guard = timeline.layers.read().await;
let (layers, _) = &*guard;
// Check the open and frozen in-memory layers first, in order from newest
// to oldest.
@@ -2544,6 +2574,7 @@ impl Timeline {
}
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
let layer = timeline.layer_cache.get_from_desc(&layer);
// If it's a remote layer, download it and retry.
if let Some(remote_layer) =
super::storage_layer::downcast_remote_layer(&layer)
@@ -2665,7 +2696,8 @@ impl Timeline {
/// Get a handle to the latest layer for appending.
///
async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
let mut layers = self.layers.write().await;
let mut guard = self.layers.write().await;
let (layers, _) = &mut *guard;
ensure!(lsn.is_aligned());
@@ -2742,7 +2774,8 @@ impl Timeline {
} else {
Some(self.write_lock.lock().await)
};
let mut layers = self.layers.write().await;
let mut guard = self.layers.write().await;
let (layers, _) = &mut *guard;
if let Some(open_layer) = &layers.open_layer {
let open_layer_rc = Arc::clone(open_layer);
// Does this layer need freezing?
@@ -2756,7 +2789,7 @@ impl Timeline {
layers.next_open_layer_at = Some(end_lsn);
self.last_freeze_at.store(end_lsn);
}
drop(layers);
drop_wlock(guard);
}
/// Layer flusher task's main loop.
@@ -2780,7 +2813,8 @@ impl Timeline {
let flush_counter = *layer_flush_start_rx.borrow();
let result = loop {
let layer_to_flush = {
let layers = self.layers.read().await;
let guard = self.layers.read().await;
let (layers, _) = &*guard;
layers.frozen_layers.front().cloned()
// drop 'layers' lock to allow concurrent reads and writes
};
@@ -2903,15 +2937,17 @@ impl Timeline {
fail_point!("flush-frozen-before-sync");
// The new on-disk layers are now in the layer map. We can remove the
// in-memory layer from the map now.
// in-memory layer from the map now. We do not modify `LayerFileManager` because
// it only contains persistent layers. The flushed layer is stored in
// the mapping in `create_delta_layer`.
{
let mut layers = self.layers.write().await;
let l = layers.frozen_layers.pop_front();
let l = layers.0.frozen_layers.pop_front();
// Only one thread may call this function at a time (for this
// timeline). If two threads tried to flush the same frozen
// layer to disk at the same time, that would not work.
assert!(LayerMap::compare_arced_layers(&l.unwrap(), &frozen_layer));
assert!(compare_arced_layers(&l.unwrap(), &frozen_layer));
// release lock on 'layers'
}
@@ -3044,14 +3080,16 @@ impl Timeline {
// Add it to the layer map
let l = Arc::new(new_delta);
let mut layers = self.layers.write().await;
let mut guard = self.layers.write().await;
let (layers, _) = &mut *guard;
let mut batch_updates = layers.batch_update();
l.access_stats().record_residence_event(
&batch_updates,
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
batch_updates.insert_historic(l.layer_desc().clone(), l);
batch_updates.insert_historic(l.layer_desc().clone());
self.layer_cache.create_new_layer(l);
batch_updates.flush();
// update metrics
@@ -3100,7 +3138,8 @@ impl Timeline {
) -> anyhow::Result<bool> {
let threshold = self.get_image_creation_threshold();
let layers = self.layers.read().await;
let guard = self.layers.read().await;
let (layers, _) = &*guard;
let mut max_deltas = 0;
{
@@ -3278,7 +3317,8 @@ impl Timeline {
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
let mut layers = self.layers.write().await;
let mut guard = self.layers.write().await;
let (layers, _) = &mut *guard;
let mut updates = layers.batch_update();
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
@@ -3300,10 +3340,11 @@ impl Timeline {
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
updates.insert_historic(l.layer_desc().clone(), l);
updates.insert_historic(l.layer_desc().clone());
self.layer_cache.create_new_layer(l);
}
updates.flush();
drop(layers);
drop_wlock(guard);
timer.stop_and_record();
Ok(layer_paths_to_upload)
@@ -3313,7 +3354,7 @@ impl Timeline {
#[derive(Default)]
struct CompactLevel0Phase1Result {
new_layers: Vec<DeltaLayer>,
deltas_to_compact: Vec<Arc<dyn PersistentLayer>>,
deltas_to_compact: Vec<Arc<PersistentLayerDesc>>,
}
/// Top-level failure to compact.
@@ -3456,22 +3497,22 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
}
impl Timeline {
/// First phase of Level0 file compaction, explained in the [`compact_inner`] comment.
///
/// This method takes the `_layer_removal_cs` guard to highlight that required downloads are
/// returned as an error. If the `layer_removal_cs` boundary is changed so that it is no longer
/// taken at the start of Level0 file compaction, the on-demand download should be revisited as well.
fn compact_level0_phase1(
self: Arc<Self>,
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
layers: tokio::sync::OwnedRwLockReadGuard<LayerMap<dyn PersistentLayer>>,
_layer_removal_cs: LayerDeletionGuard,
guard: tokio::sync::OwnedRwLockReadGuard<(LayerMap, LayerFileManager)>,
mut stats: CompactLevel0Phase1StatsBuilder,
target_file_size: u64,
ctx: &RequestContext,
) -> Result<CompactLevel0Phase1Result, CompactionError> {
stats.read_lock_held_spawn_blocking_startup_micros =
stats.read_lock_acquisition_micros.till_now(); // set by caller
let mut level0_deltas = layers.get_level0_deltas()?;
let (layers, _) = &*guard;
let level0_deltas = layers.get_level0_deltas()?;
let mut level0_deltas = level0_deltas
.into_iter()
.map(|x| self.layer_cache.get_from_desc(&x))
.collect_vec();
stats.level0_deltas_count = Some(level0_deltas.len());
// Only compact if enough layers have accumulated.
let threshold = self.get_compaction_threshold();
@@ -3591,7 +3632,7 @@ impl Timeline {
}
stats.read_lock_held_compute_holes_micros =
stats.read_lock_held_prerequisites_micros.till_now();
drop(layers);
drop_rlock(guard);
stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
let mut holes = heap.into_vec();
holes.sort_unstable_by_key(|hole| hole.key_range.start);
@@ -3818,7 +3859,10 @@ impl Timeline {
Ok(CompactLevel0Phase1Result {
new_layers,
deltas_to_compact,
deltas_to_compact: deltas_to_compact
.into_iter()
.map(|x| Arc::new(x.layer_desc().clone()))
.collect(),
})
}
@@ -3828,7 +3872,7 @@ impl Timeline {
///
async fn compact_level0(
self: &Arc<Self>,
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
layer_removal_cs: LayerDeletionGuard,
target_file_size: u64,
ctx: &RequestContext,
) -> Result<(), CompactionError> {
@@ -3882,7 +3926,8 @@ impl Timeline {
.context("wait for layer upload ops to complete")?;
}
let mut layers = self.layers.write().await;
let mut guard = self.layers.write().await;
let (layers, _) = &mut *guard;
let mut updates = layers.batch_update();
let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
for l in new_layers {
@@ -3914,7 +3959,8 @@ impl Timeline {
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
updates.insert_historic(x.layer_desc().clone(), x);
updates.insert_historic(x.layer_desc().clone());
self.layer_cache.create_new_layer(x);
}
// Now that we have reshuffled the data to set of new delta layers, we can
@@ -3925,7 +3971,7 @@ impl Timeline {
self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?;
}
updates.flush();
drop(layers);
drop_wlock(guard);
// Also schedule the deletions in remote storage
if let Some(remote_client) = &self.remote_client {
@@ -4043,7 +4089,7 @@ impl Timeline {
fail_point!("before-timeline-gc");
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
let layer_removal_cs = self.layer_cache.delete_guard().await;
// Is the timeline being deleted?
if self.is_stopping() {
anyhow::bail!("timeline is Stopping");
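A hedged sketch of what LayerDeletionGuard and delete_guard() (used above) plausibly wrap; their real definitions are not shown in this diff, so the bodies below are assumptions: a newtype over the owned mutex guard that the old code passed around directly, making "may delete layer files" visible in function signatures.

use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};

// Holding the guard proves the caller owns the layer-removal critical section.
pub struct LayerDeletionGuard(#[allow(dead_code)] OwnedMutexGuard<()>);

pub struct LayerCache {
    layer_removal_cs: Arc<Mutex<()>>,
}

impl LayerCache {
    // Mirrors the `delete_guard()` call in the hunk above; body is illustrative.
    pub async fn delete_guard(&self) -> LayerDeletionGuard {
        LayerDeletionGuard(self.layer_removal_cs.clone().lock_owned().await)
    }
}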
@@ -4081,7 +4127,7 @@ impl Timeline {
async fn gc_timeline(
&self,
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
layer_removal_cs: LayerDeletionGuard,
horizon_cutoff: Lsn,
pitr_cutoff: Lsn,
retain_lsns: Vec<Lsn>,
@@ -4142,7 +4188,8 @@ impl Timeline {
// 4. newer on-disk image layers cover the layer's whole key range
//
// TODO holding a write lock is too aggressive and avoidable
let mut layers = self.layers.write().await;
let mut guard = self.layers.write().await;
let (layers, _) = &mut *guard;
'outer: for l in layers.iter_historic_layers() {
result.layers_total += 1;
@@ -4221,7 +4268,7 @@ impl Timeline {
// delta layers. Image layers can form "stairs" preventing old images from being deleted.
// But image layers are in any case less sparse than delta layers. Also we need some
// protection from replacing recent image layers with new ones after each GC iteration.
if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&*l) {
if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&l) {
wanted_image_layers.add_range(l.get_key_range());
}
result.layers_not_updated += 1;
@@ -4442,42 +4489,16 @@ impl Timeline {
// Download complete. Replace the RemoteLayer with the corresponding
// Delta- or ImageLayer in the layer map.
let mut layers = self_clone.layers.write().await;
let mut updates = layers.batch_update();
let new_layer = remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size);
let mut guard = self_clone.layers.write().await;
let (layers, _) = &mut *guard;
let updates = layers.batch_update();
let new_layer =
remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size);
{
use crate::tenant::layer_map::Replacement;
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
let failure = match updates.replace_historic(l.layer_desc().clone(), &l, new_layer.layer_desc().clone(), new_layer) {
Ok(Replacement::Replaced { .. }) => false,
Ok(Replacement::NotFound) => {
// TODO: the downloaded file should probably be removed, otherwise
// it will be added to the layermap on next load? we should
// probably restart any get_reconstruct_data search as well.
//
// See: https://github.com/neondatabase/neon/issues/3533
error!("replacing downloaded layer into layermap failed because layer was not found");
true
}
Ok(Replacement::RemovalBuffered) => {
unreachable!("current implementation does not remove anything")
}
Ok(Replacement::Unexpected(other)) => {
// if the other layer would have the same pointer value as
// expected, it means they differ only on vtables.
//
// otherwise there's no known reason for this to happen as
// compacted layers should have different covering rectangle
// leading to produce Replacement::NotFound.
error!(
expected.ptr = ?Arc::as_ptr(&l),
other.ptr = ?Arc::as_ptr(&other),
?other,
"replacing downloaded layer into layermap failed because another layer was found instead of expected"
);
true
}
let failure = match self_clone.layer_cache.replace_and_verify(l, new_layer)
{
Ok(()) => false,
Err(e) => {
// this is a precondition failure: the attributes derived from the layer
// filename didn't match up, which doesn't seem likely.
@@ -4505,7 +4526,7 @@ impl Timeline {
}
}
updates.flush();
drop(layers);
drop_wlock(guard);
info!("on-demand download successful");
@@ -4516,7 +4537,10 @@ impl Timeline {
remote_layer.ongoing_download.close();
} else {
// Keep semaphore open. We'll drop the permit at the end of the function.
error!("layer file download failed: {:?}", result.as_ref().unwrap_err());
error!(
"layer file download failed: {:?}",
result.as_ref().unwrap_err()
);
}
// Don't treat it as an error if the task that triggered the download
@@ -4530,7 +4554,8 @@ impl Timeline {
drop(permit);
Ok(())
}.in_current_span(),
}
.in_current_span(),
);
receiver.await.context("download task cancelled")?
@@ -4600,9 +4625,11 @@ impl Timeline {
) {
let mut downloads = Vec::new();
{
let layers = self.layers.read().await;
let guard = self.layers.read().await;
let (layers, _) = &*guard;
layers
.iter_historic_layers()
.map(|l| self.layer_cache.get_from_desc(&l))
.filter_map(|l| l.downcast_remote_layer())
.map(|l| self.download_remote_layer(l))
.for_each(|dl| downloads.push(dl))
@@ -4703,7 +4730,8 @@ impl LocalLayerInfoForDiskUsageEviction {
impl Timeline {
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
let layers = self.layers.read().await;
let guard = self.layers.read().await;
let (layers, _) = &*guard;
let mut max_layer_size: Option<u64> = None;
let mut resident_layers = Vec::new();
@@ -4712,6 +4740,8 @@ impl Timeline {
let file_size = l.file_size();
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
let l = self.layer_cache.get_from_desc(&l);
if l.is_remote_layer() {
continue;
}
@@ -4870,3 +4900,31 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
),
}
}
/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
///
/// Returns `true` if the two `Arc`s point to the same layer, `false` otherwise.
///
/// If comparing persistent layers, ALWAYS compare the layer descriptor key.
#[inline(always)]
pub fn compare_arced_layers<L: ?Sized>(left: &Arc<L>, right: &Arc<L>) -> bool {
// "dyn Trait" objects are "fat pointers" in that they have two components:
// - pointer to the object
// - pointer to the vtable
//
// Rust does not guarantee that these vtables are unique; however, `Arc::ptr_eq`
// as of writing (at least up to 1.67) uses a comparison where both the
// pointer and the vtable need to be equal.
//
// See: https://github.com/rust-lang/rust/issues/103763
//
// A future version of Rust will most likely use the form below, where we cast each
// pointer to a pointer to unit, which drops the inaccessible vtable pointer so that it
// does not affect the comparison.
//
// See: https://github.com/rust-lang/rust/pull/106450
let left = Arc::as_ptr(left) as *const ();
let right = Arc::as_ptr(right) as *const ();
left == right
}
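A short usage sketch of the helper added above; the trait and type here are illustrative stand-ins. Clones of the same Arc<dyn Trait> compare equal on the object pointer regardless of which vtable pointer each fat pointer carries, while distinct allocations compare unequal.

use std::sync::Arc;

trait Layer {}
struct DeltaLayer;
impl Layer for DeltaLayer {}

// same body as the helper above, repeated so this example is self-contained
fn compare_arced_layers<L: ?Sized>(left: &Arc<L>, right: &Arc<L>) -> bool {
    Arc::as_ptr(left) as *const () == Arc::as_ptr(right) as *const ()
}

fn main() {
    let a: Arc<dyn Layer> = Arc::new(DeltaLayer);
    let b = Arc::clone(&a);
    let c: Arc<dyn Layer> = Arc::new(DeltaLayer);

    assert!(compare_arced_layers(&a, &b)); // same underlying object
    assert!(!compare_arced_layers(&a, &c)); // different allocations
}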

View File

@@ -197,9 +197,11 @@ impl Timeline {
// We don't want to hold the layer map lock during eviction.
// So, we just need to deal with this.
let candidates: Vec<Arc<dyn PersistentLayer>> = {
let layers = self.layers.read().await;
let guard = self.layers.read().await;
let (layers, _) = &*guard;
let mut candidates = Vec::new();
for hist_layer in layers.iter_historic_layers() {
let hist_layer = self.layer_cache.get_from_desc(&hist_layer);
if hist_layer.is_remote_layer() {
continue;
}

View File

@@ -4,7 +4,6 @@
MODULE_big = neon
OBJS = \
$(WIN32RES) \
extension_server.o \
file_cache.o \
libpagestore.o \
libpqwalproposer.o \

View File

@@ -1,104 +0,0 @@
/*-------------------------------------------------------------------------
*
* extension_server.c
* Request compute_ctl to download extension files.
*
* IDENTIFICATION
* contrib/neon/extension_server.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "tcop/pquery.h"
#include "tcop/utility.h"
#include "access/xact.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "commands/defrem.h"
#include "miscadmin.h"
#include "utils/acl.h"
#include "fmgr.h"
#include "utils/guc.h"
#include "port.h"
#include "fmgr.h"
#include <curl/curl.h>
static int extension_server_port = 0;
static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
// to download all SQL (and data) files for an extension:
// curl -X POST http://localhost:8080/extension_server/postgis
// it covers two possible extension files layouts:
// 1. extension_name--version--platform.sql
// 2. extension_name/extension_name--version.sql
// extension_name/extra_files.csv
//
// to download specific library file:
// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
static bool
neon_download_extension_file_http(const char *filename, bool is_library)
{
CURL *curl;
CURLcode res;
char *compute_ctl_url;
char *postdata;
bool ret = false;
if ((curl = curl_easy_init()) == NULL)
{
elog(ERROR, "Failed to initialize curl handle");
}
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
extension_server_port, filename, is_library ? "?is_library=true" : "");
elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
// NOTE: 15L may be insufficient time for large extensions like postgis
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L /* seconds */);
if (curl)
{
/* Perform the request, res will get the return code */
res = curl_easy_perform(curl);
/* Check for errors */
if (res == CURLE_OK)
{
ret = true;
}
else
{
// Don't error here because postgres will try to find the file
// and will fail with some proper error message if it's not found.
elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
}
/* always cleanup */
curl_easy_cleanup(curl);
}
return ret;
}
void pg_init_extension_server()
{
// Port to connect to compute_ctl on localhost
// to request extension files.
DefineCustomIntVariable("neon.extension_server_port",
"connection string to the compute_ctl",
NULL,
&extension_server_port,
0, 0, INT_MAX,
PGC_POSTMASTER,
0, /* no flags required */
NULL, NULL, NULL);
// set download_extension_file_hook
prev_download_extension_file_hook = download_extension_file_hook;
download_extension_file_hook = neon_download_extension_file_http;
}

View File

@@ -1 +0,0 @@

View File

@@ -35,11 +35,8 @@ _PG_init(void)
{
pg_init_libpagestore();
pg_init_walproposer();
InitControlPlaneConnector();
pg_init_extension_server();
// Important: This must happen after other parts of the extension
// are loaded, otherwise any settings to GUCs that were set before
// the extension was loaded will be removed.

View File

@@ -21,8 +21,6 @@ extern char *neon_tenant;
extern void pg_init_libpagestore(void);
extern void pg_init_walproposer(void);
extern void pg_init_extension_server(void);
/*
* Returns true if we shouldn't do REDO on that block in record indicated by
* block_id; false otherwise.

View File

@@ -2675,6 +2675,7 @@ bool
neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
{
XLogRecPtr end_recptr = record->EndRecPtr;
XLogRecPtr prev_end_recptr = record->ReadRecPtr - 1;
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blkno;
@@ -2718,15 +2719,16 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
no_redo_needed = buffer < 0;
/* In both cases st lwlsn past this WAL record */
SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
/* we don't have the buffer in memory, update lwLsn past this record,
* also evict page fro file cache
*/
/* we don't have the buffer in memory, update lwLsn past this record */
if (no_redo_needed)
{
SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
lfc_evict(rnode, forknum, blkno);
}
else
{
SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno);
}
LWLockRelease(partitionLock);
@@ -2734,10 +2736,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
if (get_cached_relsize(rnode, forknum, &relsize))
{
if (relsize < blkno + 1)
{
update_cached_relsize(rnode, forknum, blkno + 1);
SetLastWrittenLSNForRelation(end_recptr, rnode, forknum);
}
}
else
{
@@ -2769,7 +2768,6 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
Assert(nbresponse->n_blocks > blkno);
set_cached_relsize(rnode, forknum, nbresponse->n_blocks);
SetLastWrittenLSNForRelation(end_recptr, rnode, forknum);
elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
}

View File

@@ -514,16 +514,6 @@ def available_remote_storages() -> List[RemoteStorageKind]:
return remote_storages
def available_s3_storages() -> List[RemoteStorageKind]:
remote_storages = [RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
remote_storages.append(RemoteStorageKind.REAL_S3)
log.info("Enabling real s3 storage for tests")
else:
log.info("Using mock implementations to test remote storage")
return remote_storages
@dataclass
class LocalFsStorage:
root: Path
@@ -544,16 +534,6 @@ class S3Storage:
"AWS_SECRET_ACCESS_KEY": self.secret_key,
}
def to_string(self) -> str:
return json.dumps(
{
"bucket": self.bucket_name,
"region": self.bucket_region,
"endpoint": self.endpoint,
"prefix": self.prefix_in_bucket,
}
)
RemoteStorage = Union[LocalFsStorage, S3Storage]
@@ -620,12 +600,10 @@ class NeonEnvBuilder:
self.rust_log_override = rust_log_override
self.port_distributor = port_distributor
self.remote_storage = remote_storage
self.ext_remote_storage: Optional[S3Storage] = None
self.remote_storage_client: Optional[Any] = None
self.remote_storage_users = remote_storage_users
self.broker = broker
self.run_id = run_id
self.mock_s3_server: MockS3Server = mock_s3_server
self.mock_s3_server = mock_s3_server
self.pageserver_config_override = pageserver_config_override
self.num_safekeepers = num_safekeepers
self.safekeepers_id_start = safekeepers_id_start
@@ -673,24 +651,15 @@ class NeonEnvBuilder:
remote_storage_kind: RemoteStorageKind,
test_name: str,
force_enable: bool = True,
enable_remote_extensions: bool = False,
):
if remote_storage_kind == RemoteStorageKind.NOOP:
return
elif remote_storage_kind == RemoteStorageKind.LOCAL_FS:
self.enable_local_fs_remote_storage(force_enable=force_enable)
elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
self.enable_mock_s3_remote_storage(
bucket_name=test_name,
force_enable=force_enable,
enable_remote_extensions=enable_remote_extensions,
)
self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable)
elif remote_storage_kind == RemoteStorageKind.REAL_S3:
self.enable_real_s3_remote_storage(
test_name=test_name,
force_enable=force_enable,
enable_remote_extensions=enable_remote_extensions,
)
self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable)
else:
raise RuntimeError(f"Unknown storage type: {remote_storage_kind}")
@@ -704,15 +673,11 @@ class NeonEnvBuilder:
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage"))
def enable_mock_s3_remote_storage(
self, bucket_name: str, force_enable: bool = True, enable_remote_extensions: bool = False
):
def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable: bool = True):
"""
Sets up the pageserver to use the S3 mock server and creates the bucket if it's not present already.
Starts up the mock server if it is not running yet.
Errors if the pageserver already has some remote storage configuration, unless `force_enable` is set to `True`.
Also creates the bucket for extensions, self.ext_remote_storage bucket
"""
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
mock_endpoint = self.mock_s3_server.endpoint()
@@ -733,22 +698,9 @@ class NeonEnvBuilder:
bucket_region=mock_region,
access_key=self.mock_s3_server.access_key(),
secret_key=self.mock_s3_server.secret_key(),
prefix_in_bucket="pageserver",
)
if enable_remote_extensions:
self.ext_remote_storage = S3Storage(
bucket_name=bucket_name,
endpoint=mock_endpoint,
bucket_region=mock_region,
access_key=self.mock_s3_server.access_key(),
secret_key=self.mock_s3_server.secret_key(),
prefix_in_bucket="ext",
)
def enable_real_s3_remote_storage(
self, test_name: str, force_enable: bool = True, enable_remote_extensions: bool = False
):
def enable_real_s3_remote_storage(self, test_name: str, force_enable: bool = True):
"""
Sets up configuration to use real s3 endpoint without mock server
"""
@@ -785,18 +737,9 @@ class NeonEnvBuilder:
bucket_region=region,
access_key=access_key,
secret_key=secret_key,
prefix_in_bucket=f"{self.remote_storage_prefix}/pageserver",
prefix_in_bucket=self.remote_storage_prefix,
)
if enable_remote_extensions:
self.ext_remote_storage = S3Storage(
bucket_name=bucket_name,
bucket_region=region,
access_key=access_key,
secret_key=secret_key,
prefix_in_bucket=f"{self.remote_storage_prefix}/ext",
)
def cleanup_local_storage(self):
if self.preserve_database_files:
return
@@ -830,7 +773,6 @@ class NeonEnvBuilder:
# `self.remote_storage_prefix` is coupled with `S3Storage` storage type,
# so this line is effectively a no-op
assert isinstance(self.remote_storage, S3Storage)
assert self.remote_storage_client is not None
if self.keep_remote_storage_contents:
log.info("keep_remote_storage_contents skipping remote storage cleanup")
@@ -960,8 +902,6 @@ class NeonEnv:
self.neon_binpath = config.neon_binpath
self.pg_distrib_dir = config.pg_distrib_dir
self.endpoint_counter = 0
self.remote_storage_client = config.remote_storage_client
self.ext_remote_storage = config.ext_remote_storage
# generate initial tenant ID here instead of letting 'neon init' generate it,
# so that we don't need to dig it out of the config file afterwards.
@@ -1548,7 +1488,6 @@ class NeonCli(AbstractNeonCli):
safekeepers: Optional[List[int]] = None,
tenant_id: Optional[TenantId] = None,
lsn: Optional[Lsn] = None,
remote_ext_config: Optional[str] = None,
) -> "subprocess.CompletedProcess[str]":
args = [
"endpoint",
@@ -1558,8 +1497,6 @@ class NeonCli(AbstractNeonCli):
"--pg-version",
self.env.pg_version,
]
if remote_ext_config is not None:
args.extend(["--remote-ext-config", remote_ext_config])
if lsn is not None:
args.append(f"--lsn={lsn}")
args.extend(["--pg-port", str(pg_port)])
@@ -2421,7 +2358,7 @@ class Endpoint(PgProtocol):
return self
def start(self, remote_ext_config: Optional[str] = None) -> "Endpoint":
def start(self) -> "Endpoint":
"""
Start the Postgres instance.
Returns self.
@@ -2437,7 +2374,6 @@ class Endpoint(PgProtocol):
http_port=self.http_port,
tenant_id=self.tenant_id,
safekeepers=self.active_safekeepers,
remote_ext_config=remote_ext_config,
)
self.running = True
@@ -2527,7 +2463,6 @@ class Endpoint(PgProtocol):
hot_standby: bool = False,
lsn: Optional[Lsn] = None,
config_lines: Optional[List[str]] = None,
remote_ext_config: Optional[str] = None,
) -> "Endpoint":
"""
Create an endpoint, apply config, and start Postgres.
@@ -2542,7 +2477,7 @@ class Endpoint(PgProtocol):
config_lines=config_lines,
hot_standby=hot_standby,
lsn=lsn,
).start(remote_ext_config=remote_ext_config)
).start()
log.info(f"Postgres startup took {time.time() - started_at} seconds")
@@ -2576,7 +2511,6 @@ class EndpointFactory:
lsn: Optional[Lsn] = None,
hot_standby: bool = False,
config_lines: Optional[List[str]] = None,
remote_ext_config: Optional[str] = None,
) -> Endpoint:
ep = Endpoint(
self.env,
@@ -2593,7 +2527,6 @@ class EndpointFactory:
hot_standby=hot_standby,
config_lines=config_lines,
lsn=lsn,
remote_ext_config=remote_ext_config,
)
def create(

View File

@@ -89,9 +89,6 @@ class TenantId(Id):
def __repr__(self) -> str:
return f'`TenantId("{self.id.hex()}")'
def __str__(self) -> str:
return self.id.hex()
class TimelineId(Id):
def __repr__(self) -> str:

View File

@@ -1,295 +0,0 @@
import json
import os
from contextlib import closing
from io import BytesIO
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
RemoteStorageKind,
available_s3_storages,
)
from fixtures.pg_version import PgVersion
from fixtures.types import TenantId
NUM_EXT = 3
def control_file_content(owner, i):
output = f"""# mock {owner} extension{i}
comment = 'This is a mock extension'
default_version = '1.0'
module_pathname = '$libdir/test_ext{i}'
relocatable = true"""
return BytesIO(bytes(output, "utf-8"))
def sql_file_content():
output = """
CREATE FUNCTION test_ext_add(integer, integer) RETURNS integer
AS 'select $1 + $2;'
LANGUAGE SQL
IMMUTABLE
RETURNS NULL ON NULL INPUT;
"""
return BytesIO(bytes(output, "utf-8"))
def fake_library_content():
return BytesIO(bytes("\n111\n", "utf-8"))
# Prepare some mock extension files and upload them to the bucket
# returns a list of files that should be cleaned up after the test
def prepare_mock_ext_storage(
pg_version: PgVersion,
tenant_id: TenantId,
pg_bin: PgBin,
ext_remote_storage,
remote_storage_client,
):
bucket_prefix = ext_remote_storage.prefix_in_bucket
custom_prefix = str(tenant_id)
PUB_EXT_ROOT = f"v{pg_version}/share/extension"
PRIVATE_EXT_ROOT = f"v{pg_version}/{custom_prefix}/share/extension"
LOCAL_EXT_ROOT = f"pg_install/{pg_version}/share/postgresql/extension"
PUB_LIB_ROOT = f"v{pg_version}/lib"
PRIVATE_LIB_ROOT = f"v{pg_version}/{custom_prefix}/lib"
LOCAL_LIB_ROOT = f"{pg_bin.pg_lib_dir}/postgresql"
log.info(
f"""
PUB_EXT_ROOT: {PUB_EXT_ROOT}
PRIVATE_EXT_ROOT: {PRIVATE_EXT_ROOT}
LOCAL_EXT_ROOT: {LOCAL_EXT_ROOT}
PUB_LIB_ROOT: {PUB_LIB_ROOT}
PRIVATE_LIB_ROOT: {PRIVATE_LIB_ROOT}
LOCAL_LIB_ROOT: {LOCAL_LIB_ROOT}
"""
)
cleanup_files = []
# Upload several test_ext{i}.control files to the bucket
for i in range(NUM_EXT):
public_remote_name = f"{bucket_prefix}/{PUB_EXT_ROOT}/test_ext{i}.control"
custom_remote_name = f"{bucket_prefix}/{PRIVATE_EXT_ROOT}/custom_ext{i}.control"
cleanup_files += [
f"{LOCAL_EXT_ROOT}/test_ext{i}.control",
f"{LOCAL_EXT_ROOT}/custom_ext{i}.control",
]
log.info(f"Uploading control file to {public_remote_name}")
remote_storage_client.upload_fileobj(
control_file_content("public", i), ext_remote_storage.bucket_name, public_remote_name
)
log.info(f"Uploading control file to {custom_remote_name}")
remote_storage_client.upload_fileobj(
control_file_content(str(tenant_id), i),
ext_remote_storage.bucket_name,
custom_remote_name,
)
# Upload SQL file for the extension we're going to create
sql_filename = "test_ext0--1.0.sql"
test_sql_public_remote_path = f"{bucket_prefix}/{PUB_EXT_ROOT}/{sql_filename}"
remote_storage_client.upload_fileobj(
sql_file_content(),
ext_remote_storage.bucket_name,
test_sql_public_remote_path,
)
cleanup_files += [f"{LOCAL_EXT_ROOT}/{sql_filename}"]
# upload some fake library files
for i in range(2):
public_remote_name = f"{bucket_prefix}/{PUB_LIB_ROOT}/test_lib{i}.so"
custom_remote_name = f"{bucket_prefix}/{PRIVATE_LIB_ROOT}/custom_lib{i}.so"
log.info(f"uploading fake library to {public_remote_name}")
remote_storage_client.upload_fileobj(
fake_library_content(),
ext_remote_storage.bucket_name,
public_remote_name,
)
log.info(f"uploading fake library to {custom_remote_name}")
remote_storage_client.upload_fileobj(
fake_library_content(),
ext_remote_storage.bucket_name,
custom_remote_name,
)
cleanup_files += [f"{LOCAL_LIB_ROOT}/test_lib{i}.so", f"{LOCAL_LIB_ROOT}/custom_lib{i}.so"]
return cleanup_files
# Generate mock extension files and upload them to the bucket.
#
# Then check that compute nodes can download them and use them
# to CREATE EXTENSION and LOAD 'library.so'
#
# NOTE: You must have appropriate AWS credentials to run REAL_S3 test.
# It may also be necessary to set the following environment variables:
# export AWS_ACCESS_KEY_ID='test'
# export AWS_SECRET_ACCESS_KEY='test'
# export AWS_SECURITY_TOKEN='test'
# export AWS_SESSION_TOKEN='test'
# export AWS_DEFAULT_REGION='us-east-1'
@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
def test_remote_extensions(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
pg_version: PgVersion,
pg_bin: PgBin,
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_remote_extensions",
enable_remote_extensions=True,
)
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
tenant_id, _ = env.neon_cli.create_tenant()
env.neon_cli.create_timeline("test_remote_extensions", tenant_id=tenant_id)
assert env.ext_remote_storage is not None
assert env.remote_storage_client is not None
# Prepare some mock extension files and upload them to the bucket
cleanup_files = prepare_mock_ext_storage(
pg_version,
tenant_id,
pg_bin,
env.ext_remote_storage,
env.remote_storage_client,
)
# Start a compute node and check that it can download the extensions
# and use them to CREATE EXTENSION and LOAD 'library.so'
#
# This block is wrapped in a try/finally so that the downloaded files
# are cleaned up even if the test fails
try:
endpoint = env.endpoints.create_start(
"test_remote_extensions",
tenant_id=tenant_id,
remote_ext_config=env.ext_remote_storage.to_string(),
# config_lines=["log_min_messages=debug3"],
)
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
# Test query: check that test_ext0 was successfully downloaded
cur.execute("SELECT * FROM pg_available_extensions")
all_extensions = [x[0] for x in cur.fetchall()]
log.info(all_extensions)
for i in range(NUM_EXT):
assert f"test_ext{i}" in all_extensions
assert f"custom_ext{i}" in all_extensions
cur.execute("CREATE EXTENSION test_ext0")
cur.execute("SELECT extname FROM pg_extension")
all_extensions = [x[0] for x in cur.fetchall()]
log.info(all_extensions)
assert "test_ext0" in all_extensions
# Try to load existing library file
try:
cur.execute("LOAD 'test_lib0.so'")
except Exception as e:
# expected to fail with
# could not load library ... test_ext.so: file too short
# because test_lib0.so is not real library file
log.info("LOAD test_lib0.so failed (expectedly): %s", e)
assert "file too short" in str(e)
# Try to load custom library file
try:
cur.execute("LOAD 'custom_lib0.so'")
except Exception as e:
# expected to fail with
# could not load library ... test_ext.so: file too short
# because test_lib0.so is not real library file
log.info("LOAD custom_lib0.so failed (expectedly): %s", e)
assert "file too short" in str(e)
# Try to load existing library file without .so extension
try:
cur.execute("LOAD 'test_lib1'")
except Exception as e:
# expected to fail with
# could not load library ... test_lib1.so: file too short
# because test_lib1.so is not real library file
log.info("LOAD test_lib1 failed (expectedly): %s", e)
assert "file too short" in str(e)
# Try to load non-existent library file
try:
cur.execute("LOAD 'test_lib_fail.so'")
except Exception as e:
# expected to fail because test_lib_fail.so is not found
log.info("LOAD test_lib_fail.so failed (expectedly): %s", e)
assert (
"""could not access file "test_lib_fail.so": No such file or directory"""
in str(e)
)
finally:
# this is important because if the files aren't cleaned up then the test can
# pass even without successfully downloading the files if a previous run (or
# run with different type of remote storage) of the test did download the
# files
for file in cleanup_files:
try:
os.remove(file)
log.info(f"Deleted {file}")
except FileNotFoundError:
log.info(f"{file} does not exist, so cannot be deleted")
"""
This tests against the actual infra for real S3
Note in particular that we don't need to set up a bucket (real or mock)
because we are testing the files already uploaded as part of CI/CD
"""
def test_remote_extensions_in_bucket(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.REAL_S3,
test_name="test_remote_extensions_in_bucket",
enable_remote_extensions=False, # we don't enable remote extensions here; instead we use the real bucket
)
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
tenant_id, _ = env.neon_cli.create_tenant()
env.neon_cli.create_timeline("test_remote_extensions_in_bucket", tenant_id=tenant_id)
# Start a compute node and check that it can download the extensions
# and use them to CREATE EXTENSION and LOAD 'library.so'
remote_ext_config = {
"bucket": "neon-dev-extensions-eu-central-1",
"region": "eu-central-1",
"endpoint": None,
"prefix": "5412197734", # build tag
}
endpoint = env.endpoints.create_start(
"test_remote_extensions_in_bucket",
tenant_id=tenant_id,
remote_ext_config=json.dumps(remote_ext_config),
)
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("SELECT * FROM pg_available_extensions")
all_extensions = [x[0] for x in cur.fetchall()]
log.info("ALEK******\n\n" * 20)
log.info(all_extensions)
log.info("ALEK******\n\n" * 20)
# TODO: test download anon
# TODO: test load library / create extension

View File

@@ -713,9 +713,7 @@ def test_ondemand_download_failure_to_replace(
# error message is not useful
pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2)
actual_message = (
".* ERROR .*replacing downloaded layer into layermap failed because layer was not found"
)
actual_message = ".* ERROR .*layermap-replace-notfound"
assert env.pageserver.log_contains(actual_message) is not None
env.pageserver.allowed_errors.append(actual_message)

View File

@@ -419,6 +419,8 @@ def test_tenant_relocation(
new_pageserver_http.tenant_attach(tenant_id)
# wait for tenant to finish attaching
tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id)
assert tenant_status["state"]["slug"] in ["Attaching", "Active"]
wait_until(
number_of_iterations=10,
interval=1,

View File

@@ -275,7 +275,6 @@ def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str]
assert isinstance(neon_env_builder.remote_storage, S3Storage)
# Note that this doesn't use pagination, so the list is not guaranteed to be exhaustive.
assert neon_env_builder.remote_storage_client is not None
response = neon_env_builder.remote_storage_client.list_objects_v2(
Bucket=neon_env_builder.remote_storage.bucket_name,
Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
@@ -629,7 +628,7 @@ def test_timeline_delete_works_for_remote_smoke(
)
# for some reason the check above doesn't immediately take effect for the code below.
# Assume it is mock server inconsistency and check twice.
# Assume it is mock server incosistency and check twice.
wait_until(
2,
0.5,