Compare commits

..

12 Commits

Author SHA1 Message Date
Bojan Serafimov
aa9baddd3d Fix unrelated test 2023-06-09 10:27:54 -04:00
Bojan Serafimov
4756dcd0cc fmt 2023-06-09 09:57:06 -04:00
Bojan Serafimov
ab23e28768 revert test changes 2023-06-09 09:50:05 -04:00
Bojan Serafimov
341563261a Store compute spec id inside basebackup 2023-06-09 00:39:06 -04:00
Bojan Serafimov
b836013721 Cleanup pg_stat_statements 2023-06-08 22:52:52 -04:00
Bojan Serafimov
44ad006eb3 Merge branch 'main' into startup-no-config 2023-06-08 18:12:15 -04:00
Bojan Serafimov
aff94b54c8 more roles, dbs 2023-06-08 12:47:59 -04:00
Bojan Serafimov
1adb38bb82 Merge branch 'new-startup-test' into startup-no-config 2023-06-08 12:41:14 -04:00
Bojan Serafimov
1baecdc27a comments 2023-06-08 12:33:19 -04:00
Bojan Serafimov
eceda63379 Do two iterations 2023-06-08 12:30:47 -04:00
Bojan Serafimov
881bfc4da8 WIP 2023-06-08 10:21:07 -04:00
Bojan Serafimov
eda4f86588 Add startup test 2023-06-06 16:45:16 -04:00
86 changed files with 1696 additions and 3154 deletions

View File

@@ -623,6 +623,51 @@ jobs:
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
neon-image-depot:
# For testing this will run side-by-side for a few merges.
# This action is not really optimized yet, but gets the job done
runs-on: [ self-hosted, gen3, large ]
needs: [ tag ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
permissions:
contents: read
id-token: write
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Setup go
uses: actions/setup-go@v3
with:
go-version: '1.19'
- name: Set up Depot CLI
uses: depot/setup-action@v1
- name: Install Crane & ECR helper
run: go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Build and push
uses: depot/build-push-action@v1
with:
# if no depot.json file is at the root of your repo, you must specify the project id
project: nrdv0s4kcs
push: true
tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
build-args: |
GIT_VERSION=${{ github.sha }}
REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
compute-tools-image:
runs-on: [ self-hosted, gen3, large ]
needs: [ tag ]
@@ -914,20 +959,6 @@ jobs:
exit 1
fi
- name: Create tag "release-${{ needs.tag.outputs.build-tag }}"
if: github.ref_name == 'release'
uses: actions/github-script@v6
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
github.rest.git.createRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: "refs/tags/release-${{ needs.tag.outputs.build-tag }}",
sha: context.sha,
})
promote-compatibility-data:
runs-on: [ self-hosted, gen3, small ]
container:

View File

@@ -3,7 +3,6 @@ name: Create Release Branch
on:
schedule:
- cron: '0 10 * * 2'
workflow_dispatch:
jobs:
create_release_branch:

10
Cargo.lock generated
View File

@@ -2770,7 +2770,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
dependencies = [
"bytes",
"fallible-iterator",
@@ -2783,7 +2783,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
dependencies = [
"native-tls",
"tokio",
@@ -2794,7 +2794,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -2812,7 +2812,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4272,7 +4272,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
dependencies = [
"async-trait",
"byteorder",

View File

@@ -140,11 +140,11 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
## Other git libraries
@@ -180,7 +180,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
# Changes the MAX_THREADS limit from 4096 to 32768.
# This is a temporary workaround for using tracing from many threads in safekeepers code,

View File

@@ -67,7 +67,7 @@ RUN apt update && \
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
make clean && cp -R /sfcgal/* /
@@ -95,7 +95,7 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
mkdir build && \
cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
cmake .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
@@ -355,7 +355,7 @@ RUN apt-get update && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
cd build && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make install -j $(getconf _NPROCESSORS_ONLN) && \
@@ -410,7 +410,7 @@ RUN apt-get update && \
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
mkdir build && \
cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
cmake .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
@@ -432,54 +432,6 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
#########################################################################################
#
# Layer "rdkit-pg-build"
# compile rdkit extension
#
#########################################################################################
FROM build-deps AS rdkit-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt-get update && \
apt-get install -y \
cmake \
libboost-iostreams1.74-dev \
libboost-regex1.74-dev \
libboost-serialization1.74-dev \
libboost-system1.74-dev \
libeigen3-dev \
libfreetype6-dev
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
cmake \
-D RDK_BUILD_CAIRO_SUPPORT=OFF \
-D RDK_BUILD_INCHI_SUPPORT=ON \
-D RDK_BUILD_AVALON_SUPPORT=ON \
-D RDK_BUILD_PYTHON_WRAPPERS=OFF \
-D RDK_BUILD_DESCRIPTORS3D=OFF \
-D RDK_BUILD_FREESASA_SUPPORT=OFF \
-D RDK_BUILD_COORDGEN_SUPPORT=ON \
-D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \
-D RDK_BUILD_YAEHMOP_SUPPORT=OFF \
-D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \
-D RDK_USE_URF=OFF \
-D RDK_BUILD_PGSQL=ON \
-D RDK_PGSQL_STATIC=ON \
-D PostgreSQL_CONFIG=pg_config \
-D PostgreSQL_INCLUDE_DIR=`pg_config --includedir` \
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
-D RDK_INSTALL_INTREE=OFF \
-D CMAKE_BUILD_TYPE=Release \
. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
#########################################################################################
#
# Layer "rust extensions"
@@ -612,7 +564,6 @@ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -686,19 +637,14 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost*, libfreetype6, and zlib1g for rdkit
RUN apt update && \
apt install --no-install-recommends -y \
gdb \
locales \
libicu67 \
liblz4-1 \
libreadline8 \
libboost-iostreams1.74.0 \
libboost-regex1.74.0 \
libboost-serialization1.74.0 \
libboost-system1.74.0 \
libossp-uuid16 \
libfreetype6 \
libgeos-c1v5 \
libgdal28 \
libproj19 \
@@ -708,9 +654,7 @@ RUN apt update && \
libxslt1.1 \
libzstd1 \
libcurl4-openssl-dev \
locales \
procps \
zlib1g && \
procps && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

View File

@@ -370,6 +370,11 @@ impl ComputeNode {
// 'Close' connection
drop(client);
info!(
"finished configuration of compute for project {}",
spec.cluster.cluster_id.as_deref().unwrap_or("None")
);
Ok(())
}
@@ -422,23 +427,57 @@ impl ComputeNode {
#[instrument(skip(self))]
pub fn start_compute(&self) -> Result<std::process::Child> {
let compute_state = self.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = compute_state.pspec.as_ref().expect("spec must be set");
info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}",
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
pspec.tenant_id,
pspec.timeline_id,
spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
spec.spec.operation_uuid.as_deref().unwrap_or("None"),
spec.tenant_id,
spec.timeline_id,
);
self.prepare_pgdata(&compute_state)?;
let start_time = Utc::now();
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
let pg = self.start_postgres(spec.storage_auth_token.clone())?;
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
self.apply_config(&compute_state)?;
// Maybe apply the spec
if spec.spec.mode == ComputeMode::Primary {
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
// Get spec_id or make it up by hashing
//
// TODO Make spec_id required so there would be no need to hash.
let spec_id = spec.operation_uuid.clone().unwrap_or_else(|| {
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
// HACK Exclude postgresql.conf because it doesn't need
// to be applied like the other fields in the spec
let mut spec_no_conf = spec.clone();
spec_no_conf.cluster.postgresql_conf = None;
let json = serde_json::to_vec(&spec_no_conf).unwrap();
let mut hasher = DefaultHasher::new();
json.hash(&mut hasher);
let hash = hasher.finish();
format!("{:x}", hash)
});
// Get current spec_id
let path = Path::new(&self.pgdata).join("neon_compute_spec_id.txt");
let current_spec_id = std::fs::read_to_string(path).ok();
// Respec if needed
if current_spec_id == Some(spec_id.clone()) {
info!("no need to respec");
} else {
info!("respeccing {:?} {:?}", current_spec_id, &spec_id);
self.apply_config(&compute_state)?;
self.cache_spec_id(&compute_state, spec_id)?;
}
}
let startup_end_time = Utc::now();
@@ -457,14 +496,31 @@ impl ComputeNode {
}
self.set_status(ComputeStatus::Running);
info!(
"finished configuration of compute for project {}",
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
);
Ok(pg)
}
fn cache_spec_id(&self, compute_state: &ComputeState, spec_id: String) -> anyhow::Result<()> {
let spec = &compute_state.pspec.as_ref().expect("spec must be set");
let cmd = format!(
"set_compute_spec_id {} {} {}",
spec.tenant_id, spec.timeline_id, spec_id,
);
let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
// Use the storage auth token from the config file, if given.
// Note: this overrides any password set in the connection string.
if let Some(storage_auth_token) = &spec.storage_auth_token {
info!("Got storage auth token from spec file");
config.password(storage_auth_token);
} else {
info!("Storage auth token not set");
}
let mut client = config.connect(NoTls)?;
client.simple_query(&cmd)?;
Ok(())
}
// Look for core dumps and collect backtraces.
//
// EKS worker nodes have following core dump settings:

View File

@@ -450,7 +450,6 @@ impl Endpoint {
// Create spec file
let spec = ComputeSpec {
skip_pg_catalog_updates: false,
format_version: 1.0,
operation_uuid: None,
cluster: Cluster {

View File

@@ -27,12 +27,6 @@ pub struct ComputeSpec {
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
/// An optinal hint that can be passed to speed up startup time if we know
/// that no pg catalog mutations (like role creation, database creation,
/// extension creation) need to be done on the actual database to start.
#[serde(default)] // Default false
pub skip_pg_catalog_updates: bool,
// Information needed to connect to the storage layer.
//
// `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.
@@ -148,14 +142,4 @@ mod tests {
let file = File::open("tests/cluster_spec.json").unwrap();
let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
}
#[test]
fn parse_unknown_fields() {
// Forward compatibility test
let file = File::open("tests/cluster_spec.json").unwrap();
let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
let ob = json.as_object_mut().unwrap();
ob.insert("unknown_field_123123123".into(), "hello".into());
let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
}
}

View File

@@ -111,8 +111,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
) -> Result<Download, DownloadError>;
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
}
pub struct Download {
@@ -225,14 +223,6 @@ impl GenericRemoteStorage {
Self::Unreliable(s) => s.delete(path).await,
}
}
pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => s.delete_objects(paths).await,
Self::AwsS3(s) => s.delete_objects(paths).await,
Self::Unreliable(s) => s.delete_objects(paths).await,
}
}
}
impl GenericRemoteStorage {

View File

@@ -320,13 +320,6 @@ impl RemoteStorage for LocalFs {
.await
.map_err(|e| anyhow::anyhow!(e))?)
}
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
for path in paths {
self.delete(path).await?
}
Ok(())
}
}
fn storage_metadata_path(original_path: &Path) -> PathBuf {

View File

@@ -17,7 +17,6 @@ use aws_sdk_s3::{
error::SdkError,
operation::get_object::GetObjectError,
primitives::ByteStream,
types::{Delete, ObjectIdentifier},
Client,
};
use aws_smithy_http::body::SdkBody;
@@ -34,8 +33,6 @@ use crate::{
Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
pub(super) mod metrics {
use metrics::{register_int_counter_vec, IntCounterVec};
use once_cell::sync::Lazy;
@@ -84,24 +81,12 @@ pub(super) mod metrics {
.inc();
}
pub fn inc_delete_objects(count: u64) {
S3_REQUESTS_COUNT
.with_label_values(&["delete_object"])
.inc_by(count);
}
pub fn inc_delete_object_fail() {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["delete_object"])
.inc();
}
pub fn inc_delete_objects_fail(count: u64) {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["delete_object"])
.inc_by(count);
}
pub fn inc_list_objects() {
S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
}
@@ -411,50 +396,6 @@ impl RemoteStorage for S3Bucket {
})
.await
}
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 delete")?;
let mut delete_objects = Vec::with_capacity(paths.len());
for path in paths {
let obj_id = ObjectIdentifier::builder()
.set_key(Some(self.relative_path_to_s3_object(path)))
.build();
delete_objects.push(obj_id);
}
for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
metrics::inc_delete_objects(chunk.len() as u64);
let resp = self
.client
.delete_objects()
.bucket(self.bucket_name.clone())
.delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
.send()
.await;
match resp {
Ok(resp) => {
if let Some(errors) = resp.errors {
metrics::inc_delete_objects_fail(errors.len() as u64);
return Err(anyhow::format_err!(
"Failed to delete {} objects",
errors.len()
));
}
}
Err(e) => {
metrics::inc_delete_objects_fail(chunk.len() as u64);
return Err(e.into());
}
}
}
Ok(())
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let _guard = self

View File

@@ -24,7 +24,6 @@ enum RemoteOp {
Upload(RemotePath),
Download(RemotePath),
Delete(RemotePath),
DeleteObjects(Vec<RemotePath>),
}
impl UnreliableWrapper {
@@ -120,21 +119,4 @@ impl RemoteStorage for UnreliableWrapper {
self.attempt(RemoteOp::Delete(path.clone()))?;
self.inner.delete(path).await
}
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
let mut error_counter = 0;
for path in paths {
if (self.delete(path).await).is_err() {
error_counter += 1;
}
}
if error_counter > 0 {
return Err(anyhow::anyhow!(
"failed to delete {} objects",
error_counter
));
}
Ok(())
}
}

View File

@@ -107,37 +107,6 @@ async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result
Ok(())
}
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3::Enabled(ctx) => ctx,
MaybeEnabledS3::Disabled => return Ok(()),
};
let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
.with_context(|| "RemotePath conversion")?;
let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
.with_context(|| "RemotePath conversion")?;
let data1 = "remote blob data1".as_bytes();
let data1_len = data1.len();
let data2 = "remote blob data2".as_bytes();
let data2_len = data2.len();
ctx.client
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
.await?;
ctx.client
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
.await?;
ctx.client.delete_objects(&[path1, path2]).await?;
Ok(())
}
fn ensure_logging_ready() {
LOGGING_DONE.get_or_init(|| {
utils::logging::init(

View File

@@ -1,18 +1,19 @@
use crate::auth::{Claims, JwtAuth};
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
use anyhow::Context;
use anyhow::{anyhow, Context};
use hyper::header::{HeaderName, AUTHORIZATION};
use hyper::http::HeaderValue;
use hyper::Method;
use hyper::{header::CONTENT_TYPE, Body, Request, Response};
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
use tokio::task::JoinError;
use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future;
use std::net::TcpListener;
use std::str::FromStr;
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -347,6 +348,40 @@ pub fn check_permission_with(
}
}
///
/// Start listening for HTTP requests on given socket.
///
/// 'shutdown_future' can be used to stop. If the Future becomes
/// ready, we stop listening for new requests, and the function returns.
///
pub fn serve_thread_main<S>(
router_builder: RouterBuilder<hyper::Body, ApiError>,
listener: TcpListener,
shutdown_future: S,
) -> anyhow::Result<()>
where
S: Future<Output = ()> + Send + Sync,
{
info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
// Create a Service from the router above to handle incoming requests.
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
// Enter a single-threaded tokio runtime bound to the current thread
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let _guard = runtime.enter();
let server = Server::from_tcp(listener)?
.serve(service)
.with_graceful_shutdown(shutdown_future);
runtime.block_on(server)?;
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -417,6 +417,16 @@ where
// Also send zenith.signal file with extra bootstrap data.
//
async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
// Add neon_compute_spec_id.txt
if let Some(spec_id) = &self.timeline.compute_spec_id.lock().await.clone() {
self.ar
.append(
&new_tar_header("neon_compute_spec_id.txt", spec_id.len() as u64)?,
spec_id.as_bytes(),
)
.await?;
}
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {

View File

@@ -516,7 +516,7 @@ async fn collect_eviction_candidates(
if !tl.is_active() {
continue;
}
let info = tl.get_local_layers_for_disk_usage_eviction().await;
let info = tl.get_local_layers_for_disk_usage_eviction();
debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
tenant_candidates.extend(
info.resident_layers

View File

@@ -215,7 +215,7 @@ async fn build_timeline_info(
) -> anyhow::Result<TimelineInfo> {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
let mut info = build_timeline_info_common(timeline, ctx).await?;
let mut info = build_timeline_info_common(timeline, ctx)?;
if include_non_incremental_logical_size {
// XXX we should be using spawn_ondemand_logical_size_calculation here.
// Otherwise, if someone deletes the timeline / detaches the tenant while
@@ -233,7 +233,7 @@ async fn build_timeline_info(
Ok(info)
}
async fn build_timeline_info_common(
fn build_timeline_info_common(
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<TimelineInfo> {
@@ -264,7 +264,7 @@ async fn build_timeline_info_common(
None
}
};
let current_physical_size = Some(timeline.layer_size_sum().await);
let current_physical_size = Some(timeline.layer_size_sum());
let state = timeline.current_state();
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
@@ -330,7 +330,6 @@ async fn timeline_create_handler(
Ok(Some(new_timeline)) => {
// Created. Construct a TimelineInfo for it.
let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::CREATED, timeline_info)
}
@@ -592,7 +591,7 @@ async fn tenant_status(
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
for timeline in tenant.list_timelines().iter() {
current_physical_size += timeline.layer_size_sum().await;
current_physical_size += timeline.layer_size_sum();
}
let state = tenant.current_state();
@@ -702,7 +701,7 @@ async fn layer_map_info_handler(
check_permission(&request, Some(tenant_id))?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let layer_map_info = timeline.layer_map_info(reset).await;
let layer_map_info = timeline.layer_map_info(reset);
json_response(StatusCode::OK, layer_map_info)
}

View File

@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
{
pg_control = Some(control_file);
}
modification.flush().await?;
modification.flush()?;
}
}
// We're done importing all the data files.
modification.commit().await?;
modification.commit()?;
// We expect the Postgres server to be shut down cleanly.
let pg_control = pg_control.context("pg_control file not found")?;
@@ -148,17 +148,17 @@ async fn import_rel(
// because there is no guarantee about the order in which we are processing segments.
// ignore "relation already exists" error
//
// FIXME: Keep track of which relations we've already created?
// FIXME: use proper error type for this, instead of parsing the error message.
// Or better yet, keep track of which relations we've already created
// https://github.com/neondatabase/neon/issues/3309
if let Err(e) = modification
.put_rel_creation(rel, nblocks as u32, ctx)
.await
{
match e {
RelationError::AlreadyExists => {
debug!("Relation {} already exist. We must be extending it.", rel)
}
_ => return Err(e.into()),
if e.to_string().contains("already exists") {
debug!("relation {} already exists. we must be extending it", rel);
} else {
return Err(e);
}
}
@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
// We found the pg_control file.
pg_control = Some(res);
}
modification.flush().await?;
modification.flush()?;
}
tokio_tar::EntryType::Directory => {
debug!("directory {:?}", file_path);
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
// sanity check: ensure that pg_control is loaded
let _pg_control = pg_control.context("pg_control file not found")?;
modification.commit().await?;
modification.commit()?;
Ok(())
}
@@ -594,7 +594,7 @@ async fn import_file(
// zenith.signal is not necessarily the last file, that we handle
// but it is ok to call `finish_write()`, because final `modification.commit()`
// will update lsn once more to the final one.
let writer = modification.tline.writer().await;
let writer = modification.tline.writer();
writer.finish_write(prev_lsn);
debug!("imported zenith signal {}", prev_lsn);

View File

@@ -1,3 +1,4 @@
use metrics::core::{AtomicU64, GenericCounter};
use metrics::{
register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
@@ -94,19 +95,21 @@ static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
});
// Metrics collected on operations on the storage repository.
pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_getpage_reconstruct_seconds",
"Time spent in reconstruct_value (reconstruct a page from deltas)",
"Time spent in reconstruct_value",
&["tenant_id", "timeline_id"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});
pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_materialized_cache_hits_direct_total",
"Number of cache hits from materialized page cache without redo",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -121,10 +124,11 @@ static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_materialized_cache_hits_total",
"Number of cache hits from materialized page cache",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -748,7 +752,10 @@ impl StorageTimeMetrics {
pub struct TimelineMetrics {
tenant_id: String,
timeline_id: String,
pub reconstruct_time_histo: Histogram,
pub get_reconstruct_data_time_histo: Histogram,
pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
pub materialized_page_cache_hit_upon_request_counter: GenericCounter<AtomicU64>,
pub flush_time_histo: StorageTimeMetrics,
pub compact_time_histo: StorageTimeMetrics,
pub create_images_time_histo: StorageTimeMetrics,
@@ -776,9 +783,15 @@ impl TimelineMetrics {
) -> Self {
let tenant_id = tenant_id.to_string();
let timeline_id = timeline_id.to_string();
let reconstruct_time_histo = RECONSTRUCT_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let flush_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
let compact_time_histo =
@@ -820,18 +833,19 @@ impl TimelineMetrics {
let read_num_fs_layers = READ_NUM_FS_LAYERS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let evictions_with_low_residence_duration =
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
// TODO(chi): remove this once we remove Lazy for all metrics. Otherwise this will not appear in the exporter
// and integration test will error.
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
MATERIALIZED_PAGE_CACHE_HIT.get();
TimelineMetrics {
tenant_id,
timeline_id,
reconstruct_time_histo,
get_reconstruct_data_time_histo,
materialized_page_cache_hit_counter,
materialized_page_cache_hit_upon_request_counter,
flush_time_histo,
compact_time_histo,
create_images_time_histo,
@@ -858,7 +872,10 @@ impl Drop for TimelineMetrics {
fn drop(&mut self) {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);

View File

@@ -915,6 +915,27 @@ where
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("set_compute_spec_id ") {
let (_, params_raw) = query_string.split_at("set_compute_spec_id ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() != 3 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for set_compute_spec_id command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
let spec_id = params[2].to_string();
self.check_permission(Some(tenant_id))?;
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
*timeline.compute_spec_id.lock().await = Some(spec_id);
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
}
// return pair of prev_lsn and last_lsn
else if query_string.starts_with("get_last_record_rlsn ") {

View File

@@ -43,16 +43,6 @@ pub enum CalculateLogicalSizeError {
Other(#[from] anyhow::Error),
}
#[derive(Debug, thiserror::Error)]
pub enum RelationError {
#[error("Relation Already Exists")]
AlreadyExists,
#[error("invalid relnode")]
InvalidRelnode,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
///
/// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
/// and other special kinds of files, in a versioned key-value store. The
@@ -111,9 +101,9 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
if tag.relnode == 0 {
return Err(PageReconstructError::Other(
RelationError::InvalidRelnode.into(),
));
return Err(PageReconstructError::Other(anyhow::anyhow!(
"invalid relnode"
)));
}
let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
@@ -158,9 +148,9 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<BlockNumber, PageReconstructError> {
if tag.relnode == 0 {
return Err(PageReconstructError::Other(
RelationError::InvalidRelnode.into(),
));
return Err(PageReconstructError::Other(anyhow::anyhow!(
"invalid relnode"
)));
}
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
@@ -203,9 +193,9 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
if tag.relnode == 0 {
return Err(PageReconstructError::Other(
RelationError::InvalidRelnode.into(),
));
return Err(PageReconstructError::Other(anyhow::anyhow!(
"invalid relnode"
)));
}
// first try to lookup relation in cache
@@ -709,20 +699,6 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
#[cfg(test)]
pub fn init_empty_test_timeline(&mut self) -> anyhow::Result<()> {
self.init_empty()?;
self.put_control_file(bytes::Bytes::from_static(
b"control_file contents do not matter",
))
.context("put_control_file")?;
self.put_checkpoint(bytes::Bytes::from_static(
b"checkpoint_file contents do not matter",
))
.context("put_checkpoint_file")?;
Ok(())
}
/// Put a new page version that can be constructed from a WAL record
///
/// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
@@ -734,7 +710,7 @@ impl<'a> DatadirModification<'a> {
blknum: BlockNumber,
rec: NeonWalRecord,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
Ok(())
}
@@ -761,7 +737,7 @@ impl<'a> DatadirModification<'a> {
blknum: BlockNumber,
img: Bytes,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
Ok(())
}
@@ -885,38 +861,32 @@ impl<'a> DatadirModification<'a> {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> Result<(), RelationError> {
if rel.relnode == 0 {
return Err(RelationError::AlreadyExists);
}
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
.context("deserialize db")?;
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
// Didn't exist. Update dbdir
dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
let buf = DbDirectory::ser(&dbdir)?;
self.put(DBDIR_KEY, Value::Image(buf.into()));
// and create the RelDirectory
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
.context("deserialize db")?
RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
};
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
anyhow::bail!("rel {rel} already exists");
}
self.put(
rel_dir_key,
Value::Image(Bytes::from(
RelDirectory::ser(&rel_dir).context("serialize")?,
)),
Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
);
// Put size
@@ -941,7 +911,7 @@ impl<'a> DatadirModification<'a> {
nblocks: BlockNumber,
ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
let last_lsn = self.tline.get_last_record_lsn();
if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
let size_key = rel_size_to_key(rel);
@@ -972,7 +942,7 @@ impl<'a> DatadirModification<'a> {
nblocks: BlockNumber,
ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
// Put size
let size_key = rel_size_to_key(rel);
@@ -993,7 +963,7 @@ impl<'a> DatadirModification<'a> {
/// Drop a relation.
pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
// Remove it from the directory entry
let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
@@ -1138,7 +1108,7 @@ impl<'a> DatadirModification<'a> {
/// retains all the metadata, but data pages are flushed. That's again OK
/// for bulk import, where you are just loading data pages and won't try to
/// modify the same pages twice.
pub async fn flush(&mut self) -> anyhow::Result<()> {
pub fn flush(&mut self) -> anyhow::Result<()> {
// Unless we have accumulated a decent amount of changes, it's not worth it
// to scan through the pending_updates list.
let pending_nblocks = self.pending_nblocks;
@@ -1146,20 +1116,19 @@ impl<'a> DatadirModification<'a> {
return Ok(());
}
let writer = self.tline.writer().await;
let writer = self.tline.writer();
// Flush relation and SLRU data blocks, keep metadata.
let mut retained_pending_updates = HashMap::new();
for (key, value) in self.pending_updates.drain() {
if is_rel_block_key(key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, self.lsn, &value).await?;
let mut result: anyhow::Result<()> = Ok(());
self.pending_updates.retain(|&key, value| {
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
result = writer.put(key, self.lsn, value);
false
} else {
retained_pending_updates.insert(key, value);
true
}
}
self.pending_updates.extend(retained_pending_updates);
});
result?;
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1174,17 +1143,17 @@ impl<'a> DatadirModification<'a> {
/// underlying timeline.
/// All the modifications in this atomic update are stamped by the specified LSN.
///
pub async fn commit(&mut self) -> anyhow::Result<()> {
let writer = self.tline.writer().await;
pub fn commit(&mut self) -> anyhow::Result<()> {
let writer = self.tline.writer();
let lsn = self.lsn;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
for (key, value) in self.pending_updates.drain() {
writer.put(key, lsn, &value).await?;
writer.put(key, lsn, &value)?;
}
for key_range in self.pending_deletions.drain(..) {
writer.delete(key_range, lsn).await?;
writer.delete(key_range, lsn)?;
}
writer.finish_write(lsn);
@@ -1624,6 +1593,20 @@ fn is_slru_block_key(key: Key) -> bool {
&& key.field6 != 0xffffffff // and not SlruSegSize
}
#[cfg(test)]
pub fn create_test_timeline(
tenant: &crate::tenant::Tenant,
timeline_id: utils::id::TimelineId,
pg_version: u32,
ctx: &RequestContext,
) -> anyhow::Result<std::sync::Arc<Timeline>> {
let tline = tenant.create_test_timeline(timeline_id, Lsn(8), pg_version, ctx)?;
let mut m = tline.begin_modification(Lsn(8));
m.init_empty()?;
m.commit()?;
Ok(tline)
}
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {

File diff suppressed because it is too large Load Diff

View File

@@ -38,8 +38,8 @@ pub mod defaults {
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
}

View File

@@ -1,325 +0,0 @@
//! This module contains the encoding and decoding of the local manifest file.
//!
//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
//! records the state of the storage engine. It contains a snapshot of the
//! state and all operations proceeding that snapshot. The file begins with a
//! header recording MANIFEST version number. After that, it contains a snapshot.
//! The snapshot is followed by a list of operations. Each operation is a list
//! of records. Each record is either an addition or a removal of a layer.
//!
//! With MANIFEST, we can:
//!
//! 1. recover state quickly by reading the file, potentially boosting the
//! startup speed.
//! 2. ensure all operations are atomic and avoid corruption, solving issues
//! like redundant image layer and preparing us for future compaction
//! strategies.
//!
//! There is also a format for storing all layer files on S3, called
//! `index_part.json`. Compared with index_part, MANIFEST is an WAL which
//! records all operations as logs, and therefore we can easily replay the
//! operations when recovering from crash, while ensuring those operations
//! are atomic upon restart.
//!
//! Currently, this is not used in the system. Future refactors will ensure
//! the storage state will be recorded in this file, and the system can be
//! recovered from this file. This is tracked in
//! https://github.com/neondatabase/neon/issues/4418
use std::io::{self, Read, Write};
use crate::virtual_file::VirtualFile;
use anyhow::Result;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use crc32c::crc32c;
use serde::{Deserialize, Serialize};
use tracing::log::warn;
use utils::lsn::Lsn;
use super::storage_layer::PersistentLayerDesc;
pub struct Manifest {
file: VirtualFile,
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub struct Snapshot {
pub layers: Vec<PersistentLayerDesc>,
}
/// serde by default encode this in tagged enum, and therefore it will be something
/// like `{ "AddLayer": { ... } }`.
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub enum Record {
AddLayer(PersistentLayerDesc),
RemoveLayer(PersistentLayerDesc),
}
/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
const MANIFEST_VERSION: u64 = 1;
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub struct ManifestHeader {
magic_number: u64,
version: u64,
}
const MANIFEST_HEADER_LEN: usize = 16;
impl ManifestHeader {
fn encode(&self) -> BytesMut {
let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
buf.put_u64(self.magic_number);
buf.put_u64(self.version);
buf
}
fn decode(mut buf: &[u8]) -> Self {
assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
Self {
magic_number: buf.get_u64(),
version: buf.get_u64(),
}
}
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub enum Operation {
/// A snapshot of the current state.
///
/// Lsn field represents the LSN that is persisted to disk for this snapshot.
Snapshot(Snapshot, Lsn),
/// An atomic operation that changes the state.
///
/// Lsn field represents the LSN that is persisted to disk after the operation is done.
/// This will only change when new L0 is flushed to the disk.
Operation(Vec<Record>, Lsn),
}
struct RecordHeader {
size: u32,
checksum: u32,
}
const RECORD_HEADER_LEN: usize = 8;
impl RecordHeader {
fn encode(&self) -> BytesMut {
let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN);
buf.put_u32(self.size);
buf.put_u32(self.checksum);
buf
}
fn decode(mut buf: &[u8]) -> Self {
assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
Self {
size: buf.get_u32(),
checksum: buf.get_u32(),
}
}
}
#[derive(Debug, thiserror::Error)]
pub enum ManifestLoadError {
#[error("manifest header is corrupted")]
CorruptedManifestHeader,
#[error("unsupported manifest version: got {0}, expected {1}")]
UnsupportedVersion(u64, u64),
#[error("error when decoding record: {0}")]
DecodeRecord(serde_json::Error),
#[error("I/O error: {0}")]
Io(io::Error),
}
#[must_use = "Should check if the manifest is partially corrupted"]
pub struct ManifestPartiallyCorrupted(bool);
impl Manifest {
/// Create a new manifest by writing the manifest header and a snapshot record to the given file.
pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
let mut manifest = Self { file };
manifest.append_manifest_header(ManifestHeader {
magic_number: MANIFEST_MAGIC_NUMBER,
version: MANIFEST_VERSION,
})?;
manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
Ok(manifest)
}
/// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
/// the bool flag will be set to true and the user is responsible to reconstruct a new manifest and
/// backup the current one.
pub fn load(
mut file: VirtualFile,
) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
let mut buf = vec![];
file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
// Read manifest header
let mut buf = Bytes::from(buf);
if buf.remaining() < MANIFEST_HEADER_LEN {
return Err(ManifestLoadError::CorruptedManifestHeader);
}
let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
buf.advance(MANIFEST_HEADER_LEN);
if header.version != MANIFEST_VERSION {
return Err(ManifestLoadError::UnsupportedVersion(
header.version,
MANIFEST_VERSION,
));
}
// Read operations
let mut operations = Vec::new();
let corrupted = loop {
if buf.remaining() == 0 {
break false;
}
if buf.remaining() < RECORD_HEADER_LEN {
warn!("incomplete header when decoding manifest, could be corrupted");
break true;
}
let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
let size = size as usize;
buf.advance(RECORD_HEADER_LEN);
if buf.remaining() < size {
warn!("incomplete data when decoding manifest, could be corrupted");
break true;
}
let data = &buf[..size];
if crc32c(data) != checksum {
warn!("checksum mismatch when decoding manifest, could be corrupted");
break true;
}
// if the following decode fails, we cannot use the manifest or safely ignore any record.
operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
buf.advance(size);
};
Ok((
Self { file },
operations,
ManifestPartiallyCorrupted(corrupted),
))
}
fn append_data(&mut self, data: &[u8]) -> Result<()> {
if data.len() >= u32::MAX as usize {
panic!("data too large");
}
let header = RecordHeader {
size: data.len() as u32,
checksum: crc32c(data),
};
let header = header.encode();
self.file.write_all(&header)?;
self.file.write_all(data)?;
self.file.sync_all()?;
Ok(())
}
fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
let encoded = header.encode();
self.file.write_all(&encoded)?;
Ok(())
}
/// Add an operation to the manifest. The operation will be appended to the end of the file,
/// and the file will fsync.
pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
let encoded = Vec::from(serde_json::to_string(&operation)?);
self.append_data(&encoded)
}
}
#[cfg(test)]
mod tests {
use std::fs::OpenOptions;
use crate::repository::Key;
use super::*;
#[test]
fn test_read_manifest() {
let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
std::fs::create_dir_all(&testdir).unwrap();
let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
// Write a manifest with a snapshot and some operations
let snapshot = Snapshot {
layers: vec![layer1, layer2],
};
let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
manifest
.append_operation(Operation::Operation(
vec![Record::AddLayer(layer3.clone())],
Lsn::from(1),
))
.unwrap();
drop(manifest);
// Open the second time and write
let file = VirtualFile::open_with_options(
&testdir.join("MANIFEST"),
OpenOptions::new()
.read(true)
.write(true)
.create_new(false)
.truncate(false),
)
.unwrap();
let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 2);
assert_eq!(
&operations[0],
&Operation::Snapshot(snapshot.clone(), Lsn::from(0))
);
assert_eq!(
&operations[1],
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
);
manifest
.append_operation(Operation::Operation(
vec![
Record::RemoveLayer(layer3.clone()),
Record::AddLayer(layer4.clone()),
],
Lsn::from(2),
))
.unwrap();
drop(manifest);
// Open the third time and verify
let file = VirtualFile::open_with_options(
&testdir.join("MANIFEST"),
OpenOptions::new()
.read(true)
.write(true)
.create_new(false)
.truncate(false),
)
.unwrap();
let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 3);
assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
assert_eq!(
&operations[1],
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
);
assert_eq!(
&operations[2],
&Operation::Operation(
vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
Lsn::from(2)
)
);
}
}

View File

@@ -753,18 +753,22 @@ impl RemoteTimelineClient {
// Have a failpoint that can use the `pause` failpoint action.
// We don't want to block the executor thread, hence, spawn_blocking + await.
if cfg!(feature = "testing") {
tokio::task::spawn_blocking({
let current = tracing::Span::current();
move || {
let _entered = current.entered();
tracing::info!("at failpoint persist_deleted_index_part");
fail::fail_point!("persist_deleted_index_part");
}
})
.await
.expect("spawn_blocking");
}
#[cfg(feature = "testing")]
tokio::task::spawn_blocking({
let current = tracing::Span::current();
move || {
let _entered = current.entered();
tracing::info!(
"at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause"
);
fail::fail_point!(
"persist_index_part_with_deleted_flag_after_set_before_upload_pause"
);
}
})
.await
.expect("spawn_blocking");
upload::upload_index_part(
self.conf,
&self.storage_impl,
@@ -1388,12 +1392,7 @@ mod tests {
let harness = TenantHarness::create(test_name)?;
let (tenant, ctx) = runtime.block_on(harness.load());
// create an empty timeline directory
let _ = runtime.block_on(tenant.create_test_timeline(
TIMELINE_ID,
Lsn(8),
DEFAULT_PG_VERSION,
&ctx,
))?;
let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
std::fs::create_dir_all(remote_fs_dir)?;

View File

@@ -389,10 +389,10 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
}
/// Returned by [`Layer::iter`]
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
/// Returned by [`Layer::key_iter`]
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
/// range of LSNs.

View File

@@ -37,7 +37,6 @@ use crate::virtual_file::VirtualFile;
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
use once_cell::sync::OnceCell;
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
@@ -47,6 +46,7 @@ use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tracing::*;
use utils::{
@@ -184,7 +184,7 @@ pub struct DeltaLayer {
access_stats: LayerAccessStats,
inner: OnceCell<DeltaLayerInner>,
inner: RwLock<DeltaLayerInner>,
}
impl std::fmt::Debug for DeltaLayer {
@@ -201,17 +201,21 @@ impl std::fmt::Debug for DeltaLayer {
}
pub struct DeltaLayerInner {
/// If false, the fields below have not been loaded into memory yet.
loaded: bool,
// values copied from summary
index_start_blk: u32,
index_root_blk: u32,
/// Reader object for reading blocks from the file.
file: FileBlockReader<VirtualFile>,
/// Reader object for reading blocks from the file. (None if not loaded yet)
file: Option<FileBlockReader<VirtualFile>>,
}
impl std::fmt::Debug for DeltaLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("DeltaLayerInner")
.field("loaded", &self.loaded)
.field("index_start_blk", &self.index_start_blk)
.field("index_root_blk", &self.index_root_blk)
.finish()
@@ -242,7 +246,7 @@ impl Layer for DeltaLayer {
inner.index_start_blk, inner.index_root_blk
);
let file = &inner.file;
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
@@ -311,7 +315,7 @@ impl Layer for DeltaLayer {
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
// Scan the page versions backwards, starting from `lsn`.
let file = &inner.file;
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
@@ -496,22 +500,51 @@ impl DeltaLayer {
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
fn load(
&self,
access_kind: LayerAccessKind,
ctx: &RequestContext,
) -> Result<RwLockReadGuard<DeltaLayerInner>> {
self.access_stats
.record_access(access_kind, ctx.task_kind());
// Quick exit if already loaded
self.inner
.get_or_try_init(|| self.load_inner())
.with_context(|| format!("Failed to load delta layer {}", self.path().display()))
loop {
// Quick exit if already loaded
let inner = self.inner.read().unwrap();
if inner.loaded {
return Ok(inner);
}
// Need to open the file and load the metadata. Upgrade our lock to
// a write lock. (Or rather, release and re-lock in write mode.)
drop(inner);
let inner = self.inner.write().unwrap();
if !inner.loaded {
self.load_inner(inner).with_context(|| {
format!("Failed to load delta layer {}", self.path().display())
})?;
} else {
// Another thread loaded it while we were not holding the lock.
}
// We now have the file open and loaded. There's no function to do
// that in the std library RwLock, so we have to release and re-lock
// in read mode. (To be precise, the lock guard was moved in the
// above call to `load_inner`, so it's already been released). And
// while we do that, another thread could unload again, so we have
// to re-check and retry if that happens.
}
}
fn load_inner(&self) -> Result<DeltaLayerInner> {
fn load_inner(&self, mut inner: RwLockWriteGuard<DeltaLayerInner>) -> Result<()> {
let path = self.path();
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
// Open the file if it's not open already.
if inner.file.is_none() {
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
inner.file = Some(FileBlockReader::new(file));
}
let file = inner.file.as_mut().unwrap();
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -538,13 +571,13 @@ impl DeltaLayer {
}
}
inner.index_start_blk = actual_summary.index_start_blk;
inner.index_root_blk = actual_summary.index_root_blk;
debug!("loaded from {}", &path.display());
Ok(DeltaLayerInner {
file,
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
})
inner.loaded = true;
Ok(())
}
/// Create a DeltaLayer struct representing an existing file on disk.
@@ -566,7 +599,12 @@ impl DeltaLayer {
file_size,
),
access_stats,
inner: once_cell::sync::OnceCell::new(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
file: None,
index_start_blk: 0,
index_root_blk: 0,
}),
}
}
@@ -593,7 +631,12 @@ impl DeltaLayer {
metadata.len(),
),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: once_cell::sync::OnceCell::new(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
file: None,
index_start_blk: 0,
index_root_blk: 0,
}),
})
}
@@ -757,7 +800,12 @@ impl DeltaLayerWriterInner {
metadata.len(),
),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: once_cell::sync::OnceCell::new(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
file: None,
index_start_blk,
index_root_blk,
}),
};
// fsync the file
@@ -892,13 +940,13 @@ struct DeltaValueIter<'a> {
reader: BlockCursor<Adapter<'a>>,
}
struct Adapter<'a>(&'a DeltaLayerInner);
struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>);
impl<'a> BlockReader for Adapter<'a> {
type BlockLease = PageReadGuard<'static>;
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
self.0.file.read_blk(blknum)
self.0.file.as_ref().unwrap().read_blk(blknum)
}
}
@@ -911,8 +959,8 @@ impl<'a> Iterator for DeltaValueIter<'a> {
}
impl<'a> DeltaValueIter<'a> {
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
let file = &inner.file;
fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
@@ -985,8 +1033,8 @@ impl Iterator for DeltaKeyIter {
}
impl<'a> DeltaKeyIter {
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
let file = &inner.file;
fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
@@ -1026,21 +1074,3 @@ impl<'a> DeltaKeyIter {
Ok(iter)
}
}
#[cfg(test)]
mod test {
use super::DeltaKeyIter;
use super::DeltaLayer;
use super::DeltaValueIter;
// We will soon need the iters to be send in the compaction code.
// Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
// Cf https://github.com/neondatabase/neon/issues/4471
#[test]
fn is_send() {
fn assert_send<T: Send>() {}
assert_send::<DeltaLayer>();
assert_send::<DeltaValueIter>();
assert_send::<DeltaKeyIter>();
}
}

View File

@@ -304,7 +304,7 @@ impl InMemoryLayer {
Ok(())
}
pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
// TODO: Currently, we just leak the storage for any deleted keys
Ok(())

View File

@@ -9,12 +9,10 @@ use crate::{context::RequestContext, repository::Key};
use super::{DeltaFileName, ImageFileName, LayerFileName};
use serde::{Deserialize, Serialize};
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
/// a unified way to generate layer information like file name.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct PersistentLayerDesc {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
@@ -52,19 +50,6 @@ impl PersistentLayerDesc {
self.filename().file_name()
}
#[cfg(test)]
pub fn new_test(key_range: Range<Key>) -> Self {
Self {
tenant_id: TenantId::generate(),
timeline_id: TimelineId::generate(),
key_range,
lsn_range: Lsn(0)..Lsn(1),
is_delta: false,
is_incremental: false,
file_size: 0,
}
}
pub fn new_img(
tenant_id: TenantId,
timeline_id: TimelineId,

View File

@@ -15,7 +15,6 @@ use pageserver_api::models::{
TimelineState,
};
use remote_storage::GenericRemoteStorage;
use serde_with::serde_as;
use storage_broker::BrokerClientChannel;
use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
use tokio_util::sync::CancellationToken;
@@ -29,7 +28,7 @@ use std::ops::{Deref, Range};
use std::path::{Path, PathBuf};
use std::pin::pin;
use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
use std::sync::{Arc, Mutex, RwLock, Weak};
use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak};
use std::time::{Duration, Instant, SystemTime};
use crate::context::{DownloadBehavior, RequestContext};
@@ -48,10 +47,7 @@ use crate::tenant::{
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
use crate::metrics::{
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
RECONSTRUCT_TIME, UNEXPECTED_ONDEMAND_DOWNLOADS,
};
use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
@@ -88,14 +84,9 @@ use super::remote_timeline_client::RemoteTimelineClient;
use super::storage_layer::{DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(super) enum FlushLoopState {
enum FlushLoopState {
NotStarted,
Running {
#[cfg(test)]
expect_initdb_optimization: bool,
#[cfg(test)]
initdb_optimization_count: usize,
},
Running,
Exited,
}
@@ -129,7 +120,7 @@ pub struct Timeline {
pub pg_version: u32,
pub(crate) layers: tokio::sync::RwLock<LayerMap<dyn PersistentLayer>>,
pub(super) layers: RwLock<LayerMap<dyn PersistentLayer>>,
/// Set of key ranges which should be covered by image layers to
/// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
@@ -189,10 +180,10 @@ pub struct Timeline {
/// Locked automatically by [`TimelineWriter`] and checkpointer.
/// Must always be acquired before the layer map/individual layer lock
/// to avoid deadlock.
write_lock: tokio::sync::Mutex<()>,
write_lock: Mutex<()>,
/// Used to avoid multiple `flush_loop` tasks running
pub(super) flush_loop_state: Mutex<FlushLoopState>,
flush_loop_state: Mutex<FlushLoopState>,
/// layer_flush_start_tx can be used to wake up the layer-flushing task.
/// The value is a counter, incremented every time a new flush cycle is requested.
@@ -224,6 +215,10 @@ pub struct Timeline {
// though let's keep them both for better error visibility.
pub initdb_lsn: Lsn,
// Compute nodes can set this field after successful application
// of a new spec, in order to avoid reapplying it on next restart.
pub compute_spec_id: tokio::sync::Mutex<Option<String>>,
/// When did we last calculate the partitioning?
partitioning: Mutex<(KeyPartitioning, Lsn)>,
@@ -543,7 +538,9 @@ impl Timeline {
match cached_lsn.cmp(&lsn) {
Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
Ordering::Equal => {
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.inc();
self.metrics
.materialized_page_cache_hit_upon_request_counter
.inc();
return Ok(cached_img); // exact LSN match, return the image
}
Ordering::Greater => {
@@ -565,7 +562,8 @@ impl Timeline {
.await?;
timer.stop_and_record();
RECONSTRUCT_TIME
self.metrics
.reconstruct_time_histo
.observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
}
@@ -598,8 +596,8 @@ impl Timeline {
/// The sum of the file size of all historic layers in the layer map.
/// This method makes no distinction between local and remote layers.
/// Hence, the result **does not represent local filesystem usage**.
pub async fn layer_size_sum(&self) -> u64 {
let layer_map = self.layers.read().await;
pub fn layer_size_sum(&self) -> u64 {
let layer_map = self.layers.read().unwrap();
let mut size = 0;
for l in layer_map.iter_historic_layers() {
size += l.file_size();
@@ -690,7 +688,7 @@ impl Timeline {
/// Flush to disk all data that was written with the put_* functions
#[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))]
pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
self.freeze_inmem_layer(false).await;
self.freeze_inmem_layer(false);
self.flush_frozen_layers_and_wait().await
}
@@ -869,10 +867,10 @@ impl Timeline {
}
/// Mutate the timeline with a [`TimelineWriter`].
pub async fn writer(&self) -> TimelineWriter<'_> {
pub fn writer(&self) -> TimelineWriter<'_> {
TimelineWriter {
tl: self,
_write_guard: self.write_lock.lock().await,
_write_guard: self.write_lock.lock().unwrap(),
}
}
@@ -906,39 +904,37 @@ impl Timeline {
///
/// Also flush after a period of time without new data -- it helps
/// safekeepers to regard pageserver as caught up and suspend activity.
pub async fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
pub fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
let last_lsn = self.get_last_record_lsn();
let open_layer_size = {
let layers = self.layers.read().await;
let Some(open_layer) = layers.open_layer.as_ref() else {
return Ok(());
};
open_layer.size()?
};
let last_freeze_at = self.last_freeze_at.load();
let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
let distance = last_lsn.widening_sub(last_freeze_at);
// Checkpointing the open layer can be triggered by layer size or LSN range.
// S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
// we want to stay below that with a big margin. The LSN distance determines how
// much WAL the safekeepers need to store.
if distance >= self.get_checkpoint_distance().into()
|| open_layer_size > self.get_checkpoint_distance()
|| (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
{
info!(
"check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
distance,
open_layer_size,
last_freeze_ts.elapsed()
);
let layers = self.layers.read().unwrap();
if let Some(open_layer) = &layers.open_layer {
let open_layer_size = open_layer.size()?;
drop(layers);
let last_freeze_at = self.last_freeze_at.load();
let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
let distance = last_lsn.widening_sub(last_freeze_at);
// Checkpointing the open layer can be triggered by layer size or LSN range.
// S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
// we want to stay below that with a big margin. The LSN distance determines how
// much WAL the safekeepers need to store.
if distance >= self.get_checkpoint_distance().into()
|| open_layer_size > self.get_checkpoint_distance()
|| (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
{
info!(
"check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
distance,
open_layer_size,
last_freeze_ts.elapsed()
);
self.freeze_inmem_layer(true).await;
self.last_freeze_at.store(last_lsn);
*(self.last_freeze_ts.write().unwrap()) = Instant::now();
self.freeze_inmem_layer(true);
self.last_freeze_at.store(last_lsn);
*(self.last_freeze_ts.write().unwrap()) = Instant::now();
// Wake up the layer flusher
self.flush_frozen_layers();
// Wake up the layer flusher
self.flush_frozen_layers();
}
}
Ok(())
}
@@ -1039,8 +1035,8 @@ impl Timeline {
}
}
pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
let layer_map = self.layers.read().await;
pub fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
let layer_map = self.layers.read().unwrap();
let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
if let Some(open_layer) = &layer_map.open_layer {
in_memory_layers.push(open_layer.info());
@@ -1062,7 +1058,7 @@ impl Timeline {
#[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None) };
let Some(layer) = self.find_layer(layer_file_name) else { return Ok(None) };
let Some(remote_layer) = layer.downcast_remote_layer() else { return Ok(Some(false)) };
if self.remote_client.is_none() {
return Ok(Some(false));
@@ -1075,7 +1071,7 @@ impl Timeline {
/// Like [`evict_layer_batch`], but for just one layer.
/// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
let Some(local_layer) = self.find_layer(layer_file_name).await else { return Ok(None) };
let Some(local_layer) = self.find_layer(layer_file_name) else { return Ok(None) };
let remote_client = self
.remote_client
.as_ref()
@@ -1160,7 +1156,7 @@ impl Timeline {
}
// start the batch update
let mut layer_map = self.layers.write().await;
let mut layer_map = self.layers.write().unwrap();
let mut batch_updates = layer_map.batch_update();
let mut results = Vec::with_capacity(layers_to_evict.len());
@@ -1418,7 +1414,7 @@ impl Timeline {
timeline_id,
tenant_id,
pg_version,
layers: tokio::sync::RwLock::new(LayerMap::default()),
layers: RwLock::new(LayerMap::default()),
wanted_image_layers: Mutex::new(None),
walredo_mgr,
@@ -1453,7 +1449,7 @@ impl Timeline {
layer_flush_start_tx,
layer_flush_done_tx,
write_lock: tokio::sync::Mutex::new(()),
write_lock: Mutex::new(()),
layer_removal_cs: Default::default(),
gc_info: std::sync::RwLock::new(GcInfo {
@@ -1464,6 +1460,7 @@ impl Timeline {
latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
initdb_lsn: metadata.initdb_lsn(),
compute_spec_id: tokio::sync::Mutex::new(None),
current_logical_size: if disk_consistent_lsn.is_valid() {
// we're creating timeline data with some layer files existing locally,
@@ -1505,7 +1502,7 @@ impl Timeline {
let mut flush_loop_state = self.flush_loop_state.lock().unwrap();
match *flush_loop_state {
FlushLoopState::NotStarted => (),
FlushLoopState::Running { .. } => {
FlushLoopState::Running => {
info!(
"skipping attempt to start flush_loop twice {}/{}",
self.tenant_id, self.timeline_id
@@ -1525,12 +1522,6 @@ impl Timeline {
let self_clone = Arc::clone(self);
info!("spawning flush loop");
*flush_loop_state = FlushLoopState::Running {
#[cfg(test)]
expect_initdb_optimization: false,
#[cfg(test)]
initdb_optimization_count: 0,
};
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::LayerFlushTask,
@@ -1542,12 +1533,14 @@ impl Timeline {
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..}));
assert_eq!(*flush_loop_state, FlushLoopState::Running);
*flush_loop_state = FlushLoopState::Exited;
Ok(())
}
.instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id))
);
*flush_loop_state = FlushLoopState::Running;
}
/// Creates and starts the wal receiver.
@@ -1595,21 +1588,12 @@ impl Timeline {
));
}
///
/// Initialize with an empty layer map. Used when creating a new timeline.
///
pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
let mut layers = self.layers.try_write().expect(
"in the context where we call this function, no other task has access to the object",
);
layers.next_open_layer_at = Some(Lsn(start_lsn.0));
}
///
/// Scan the timeline directory to populate the layer map.
/// Returns all timeline-related files that were found and loaded.
///
pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
let mut layers = self.layers.write().await;
pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
let mut layers = self.layers.write().unwrap();
let mut updates = layers.batch_update();
let mut num_layers = 0;
@@ -1738,7 +1722,7 @@ impl Timeline {
// We're holding a layer map lock for a while but this
// method is only called during init so it's fine.
let mut layer_map = self.layers.write().await;
let mut layer_map = self.layers.write().unwrap();
let mut updates = layer_map.batch_update();
for remote_layer_name in &index_part.timeline_layers {
let local_layer = local_only_layers.remove(remote_layer_name);
@@ -1891,7 +1875,7 @@ impl Timeline {
let local_layers = self
.layers
.read()
.await
.unwrap()
.iter_historic_layers()
.map(|l| (l.filename(), l))
.collect::<HashMap<_, _>>();
@@ -2264,8 +2248,8 @@ impl Timeline {
}
}
async fn find_layer(&self, layer_file_name: &str) -> Option<Arc<dyn PersistentLayer>> {
for historic_layer in self.layers.read().await.iter_historic_layers() {
fn find_layer(&self, layer_file_name: &str) -> Option<Arc<dyn PersistentLayer>> {
for historic_layer in self.layers.read().unwrap().iter_historic_layers() {
let historic_layer_name = historic_layer.filename().file_name();
if layer_file_name == historic_layer_name {
return Some(historic_layer);
@@ -2388,7 +2372,7 @@ impl Timeline {
ValueReconstructResult::Continue => {
// If we reached an earlier cached page image, we're done.
if cont_lsn == cached_lsn + 1 {
MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
self.metrics.materialized_page_cache_hit_counter.inc_by(1);
return Ok(());
}
if prev_lsn <= cont_lsn {
@@ -2406,17 +2390,10 @@ impl Timeline {
}
ValueReconstructResult::Missing => {
return Err(layer_traversal_error(
if cfg!(test) {
format!(
"could not find data for key {} at LSN {}, for request at LSN {}\n{}",
key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
)
} else {
format!(
"could not find data for key {} at LSN {}, for request at LSN {}",
key, cont_lsn, request_lsn
)
},
format!(
"could not find data for key {} at LSN {}, for request at LSN {}",
key, cont_lsn, request_lsn
),
traversal_path,
));
}
@@ -2482,7 +2459,7 @@ impl Timeline {
#[allow(clippy::never_loop)] // see comment at bottom of this loop
'layer_map_search: loop {
let remote_layer = {
let layers = timeline.layers.read().await;
let layers = timeline.layers.read().unwrap();
// Check the open and frozen in-memory layers first, in order from newest
// to oldest.
@@ -2664,29 +2641,24 @@ impl Timeline {
///
/// Get a handle to the latest layer for appending.
///
async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
let mut layers = self.layers.write().await;
fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
let mut layers = self.layers.write().unwrap();
ensure!(lsn.is_aligned());
let last_record_lsn = self.get_last_record_lsn();
ensure!(
lsn > last_record_lsn,
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})\n{}",
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
lsn,
last_record_lsn,
std::backtrace::Backtrace::force_capture(),
);
// Do we have a layer open for writing already?
let layer;
if let Some(open_layer) = &layers.open_layer {
if open_layer.get_lsn_range().start > lsn {
bail!(
"unexpected open layer in the future: open layers starts at {}, write lsn {}",
open_layer.get_lsn_range().start,
lsn
);
bail!("unexpected open layer in the future");
}
layer = Arc::clone(open_layer);
@@ -2714,16 +2686,17 @@ impl Timeline {
Ok(layer)
}
async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
//info!("PUT: key {} at {}", key, lsn);
let layer = self.get_layer_for_write(lsn).await?;
let layer = self.get_layer_for_write(lsn)?;
layer.put_value(key, lsn, val)?;
Ok(())
}
async fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
let layer = self.get_layer_for_write(lsn).await?;
layer.put_tombstone(key_range, lsn).await?;
fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
let layer = self.get_layer_for_write(lsn)?;
layer.put_tombstone(key_range, lsn)?;
Ok(())
}
@@ -2734,15 +2707,15 @@ impl Timeline {
self.last_record_lsn.advance(new_lsn);
}
async fn freeze_inmem_layer(&self, write_lock_held: bool) {
fn freeze_inmem_layer(&self, write_lock_held: bool) {
// Freeze the current open in-memory layer. It will be written to disk on next
// iteration.
let _write_guard = if write_lock_held {
None
} else {
Some(self.write_lock.lock().await)
Some(self.write_lock.lock().unwrap())
};
let mut layers = self.layers.write().await;
let mut layers = self.layers.write().unwrap();
if let Some(open_layer) = &layers.open_layer {
let open_layer_rc = Arc::clone(open_layer);
// Does this layer need freezing?
@@ -2780,7 +2753,7 @@ impl Timeline {
let flush_counter = *layer_flush_start_rx.borrow();
let result = loop {
let layer_to_flush = {
let layers = self.layers.read().await;
let layers = self.layers.read().unwrap();
layers.frozen_layers.front().cloned()
// drop 'layers' lock to allow concurrent reads and writes
};
@@ -2813,7 +2786,7 @@ impl Timeline {
let mut my_flush_request = 0;
let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
if flush_loop_state != FlushLoopState::Running {
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
}
@@ -2863,18 +2836,6 @@ impl Timeline {
let lsn_range = frozen_layer.get_lsn_range();
let layer_paths_to_upload =
if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
#[cfg(test)]
match &mut *self.flush_loop_state.lock().unwrap() {
FlushLoopState::NotStarted | FlushLoopState::Exited => {
panic!("flush loop not running")
}
FlushLoopState::Running {
initdb_optimization_count,
..
} => {
*initdb_optimization_count += 1;
}
}
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
// require downloading anything during initial import.
let (partitioning, _lsn) = self
@@ -2883,20 +2844,17 @@ impl Timeline {
self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
.await?
} else {
#[cfg(test)]
match &mut *self.flush_loop_state.lock().unwrap() {
FlushLoopState::NotStarted | FlushLoopState::Exited => {
panic!("flush loop not running")
}
FlushLoopState::Running {
expect_initdb_optimization,
..
} => {
assert!(!*expect_initdb_optimization, "expected initdb optimization");
}
}
// normal case, write out a L0 delta layer file.
let (delta_path, metadata) = self.create_delta_layer(&frozen_layer).await?;
let this = self.clone();
let frozen_layer = frozen_layer.clone();
let span = tracing::info_span!("blocking");
let (delta_path, metadata) = tokio::task::spawn_blocking(move || {
let _g = span.entered();
this.create_delta_layer(&frozen_layer)
})
.await
.context("create_delta_layer spawn_blocking")
.and_then(|res| res)?;
HashMap::from([(delta_path, metadata)])
};
@@ -2905,7 +2863,7 @@ impl Timeline {
// The new on-disk layers are now in the layer map. We can remove the
// in-memory layer from the map now.
{
let mut layers = self.layers.write().await;
let mut layers = self.layers.write().unwrap();
let l = layers.frozen_layers.pop_front();
// Only one thread may call this function at a time (for this
@@ -2999,52 +2957,34 @@ impl Timeline {
}
// Write out the given frozen in-memory layer as a new L0 delta file
async fn create_delta_layer(
fn create_delta_layer(
self: &Arc<Self>,
frozen_layer: &Arc<InMemoryLayer>,
frozen_layer: &InMemoryLayer,
) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> {
let span = tracing::info_span!("blocking");
let new_delta: DeltaLayer = tokio::task::spawn_blocking({
let _g = span.entered();
let self_clone = Arc::clone(self);
let frozen_layer = Arc::clone(frozen_layer);
move || {
// Write it out
let new_delta = frozen_layer.write_to_disk()?;
let new_delta_path = new_delta.path();
// Write it out
let new_delta = frozen_layer.write_to_disk()?;
let new_delta_path = new_delta.path();
let new_delta_filename = new_delta.filename();
// Sync it to disk.
//
// We must also fsync the timeline dir to ensure the directory entries for
// new layer files are durable.
//
// NB: timeline dir must be synced _after_ the file contents are durable.
// So, two separate fsyncs are required, they mustn't be batched.
//
// TODO: If we're running inside 'flush_frozen_layers' and there are multiple
// files to flush, the fsync overhead can be reduces as follows:
// 1. write them all to temporary file names
// 2. fsync them
// 3. rename to the final name
// 4. fsync the parent directory.
// Note that (1),(2),(3) today happen inside write_to_disk().
par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?;
par_fsync::par_fsync(&[self_clone
.conf
.timeline_path(&self_clone.timeline_id, &self_clone.tenant_id)])
.context("fsync of timeline dir")?;
// Sync it to disk.
//
// We must also fsync the timeline dir to ensure the directory entries for
// new layer files are durable
//
// TODO: If we're running inside 'flush_frozen_layers' and there are multiple
// files to flush, it might be better to first write them all, and then fsync
// them all in parallel.
anyhow::Ok(new_delta)
}
})
.await
.context("spawn_blocking")??;
let new_delta_name = new_delta.filename();
let sz = new_delta.desc.file_size;
// First sync the delta layer. We still use par_fsync here to keep everything consistent. Feel free to replace
// this with a single fsync in future refactors.
par_fsync::par_fsync(&[new_delta_path.clone()]).context("fsync of delta layer")?;
// Then sync the parent directory.
par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
.context("fsync of timeline dir")?;
// Add it to the layer map
let l = Arc::new(new_delta);
let mut layers = self.layers.write().await;
let mut layers = self.layers.write().unwrap();
let mut batch_updates = layers.batch_update();
l.access_stats().record_residence_event(
&batch_updates,
@@ -3054,12 +2994,15 @@ impl Timeline {
batch_updates.insert_historic(l.layer_desc().clone(), l);
batch_updates.flush();
// update metrics
// update the timeline's physical size
let sz = new_delta_path.metadata()?.len();
self.metrics.resident_physical_size_gauge.add(sz);
// update metrics
self.metrics.num_persistent_files_created.inc_by(1);
self.metrics.persistent_bytes_written.inc_by(sz);
Ok((new_delta_name, LayerFileMetadata::new(sz)))
Ok((new_delta_filename, LayerFileMetadata::new(sz)))
}
async fn repartition(
@@ -3093,14 +3036,10 @@ impl Timeline {
}
// Is it time to create a new image layer for the given partition?
async fn time_for_new_image_layer(
&self,
partition: &KeySpace,
lsn: Lsn,
) -> anyhow::Result<bool> {
fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
let threshold = self.get_image_creation_threshold();
let layers = self.layers.read().await;
let layers = self.layers.read().unwrap();
let mut max_deltas = 0;
{
@@ -3195,7 +3134,7 @@ impl Timeline {
for partition in partitioning.parts.iter() {
let img_range = start..partition.ranges.last().unwrap().end;
start = img_range.end;
if force || self.time_for_new_image_layer(partition, lsn).await? {
if force || self.time_for_new_image_layer(partition, lsn)? {
let mut image_layer_writer = ImageLayerWriter::new(
self.conf,
self.timeline_id,
@@ -3278,7 +3217,7 @@ impl Timeline {
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
let mut layers = self.layers.write().await;
let mut layers = self.layers.write().unwrap();
let mut updates = layers.batch_update();
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
@@ -3334,159 +3273,21 @@ impl From<anyhow::Error> for CompactionError {
}
}
#[serde_as]
#[derive(serde::Serialize)]
struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration);
#[derive(Default)]
enum DurationRecorder {
#[default]
NotStarted,
Recorded(RecordedDuration, tokio::time::Instant),
}
impl DurationRecorder {
pub fn till_now(&self) -> DurationRecorder {
match self {
DurationRecorder::NotStarted => {
panic!("must only call on recorded measurements")
}
DurationRecorder::Recorded(_, ended) => {
let now = tokio::time::Instant::now();
DurationRecorder::Recorded(RecordedDuration(now - *ended), now)
}
}
}
pub fn into_recorded(self) -> Option<RecordedDuration> {
match self {
DurationRecorder::NotStarted => None,
DurationRecorder::Recorded(recorded, _) => Some(recorded),
}
}
}
#[derive(Default)]
struct CompactLevel0Phase1StatsBuilder {
version: Option<u64>,
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
first_read_lock_acquisition_micros: DurationRecorder,
get_level0_deltas_plus_drop_lock_micros: DurationRecorder,
level0_deltas_count: Option<usize>,
time_spent_between_locks: DurationRecorder,
second_read_lock_acquisition_micros: DurationRecorder,
second_read_lock_held_micros: DurationRecorder,
sort_holes_micros: DurationRecorder,
write_layer_files_micros: DurationRecorder,
new_deltas_count: Option<usize>,
new_deltas_size: Option<u64>,
}
#[serde_as]
#[derive(serde::Serialize)]
struct CompactLevel0Phase1Stats {
version: u64,
#[serde_as(as = "serde_with::DisplayFromStr")]
tenant_id: TenantId,
#[serde_as(as = "serde_with::DisplayFromStr")]
timeline_id: TimelineId,
first_read_lock_acquisition_micros: RecordedDuration,
get_level0_deltas_plus_drop_lock_micros: RecordedDuration,
level0_deltas_count: usize,
time_spent_between_locks: RecordedDuration,
second_read_lock_acquisition_micros: RecordedDuration,
second_read_lock_held_micros: RecordedDuration,
sort_holes_micros: RecordedDuration,
write_layer_files_micros: RecordedDuration,
new_deltas_count: usize,
new_deltas_size: u64,
}
impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
type Error = anyhow::Error;
fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result<Self, Self::Error> {
let CompactLevel0Phase1StatsBuilder {
version,
tenant_id,
timeline_id,
first_read_lock_acquisition_micros,
get_level0_deltas_plus_drop_lock_micros,
level0_deltas_count,
time_spent_between_locks,
second_read_lock_acquisition_micros,
second_read_lock_held_micros,
sort_holes_micros,
write_layer_files_micros,
new_deltas_count,
new_deltas_size,
} = value;
Ok(CompactLevel0Phase1Stats {
version: version.ok_or_else(|| anyhow::anyhow!("version not set"))?,
tenant_id: tenant_id.ok_or_else(|| anyhow::anyhow!("tenant_id not set"))?,
timeline_id: timeline_id.ok_or_else(|| anyhow::anyhow!("timeline_id not set"))?,
first_read_lock_acquisition_micros: first_read_lock_acquisition_micros
.into_recorded()
.ok_or_else(|| anyhow::anyhow!("first_read_lock_acquisition_micros not set"))?,
get_level0_deltas_plus_drop_lock_micros: get_level0_deltas_plus_drop_lock_micros
.into_recorded()
.ok_or_else(|| {
anyhow::anyhow!("get_level0_deltas_plus_drop_lock_micros not set")
})?,
level0_deltas_count: level0_deltas_count
.ok_or_else(|| anyhow::anyhow!("level0_deltas_count not set"))?,
time_spent_between_locks: time_spent_between_locks
.into_recorded()
.ok_or_else(|| anyhow::anyhow!("time_spent_between_locks not set"))?,
second_read_lock_acquisition_micros: second_read_lock_acquisition_micros
.into_recorded()
.ok_or_else(|| anyhow::anyhow!("second_read_lock_acquisition_micros not set"))?,
second_read_lock_held_micros: second_read_lock_held_micros
.into_recorded()
.ok_or_else(|| anyhow::anyhow!("second_read_lock_held_micros not set"))?,
sort_holes_micros: sort_holes_micros
.into_recorded()
.ok_or_else(|| anyhow::anyhow!("sort_holes_micros not set"))?,
write_layer_files_micros: write_layer_files_micros
.into_recorded()
.ok_or_else(|| anyhow::anyhow!("write_layer_files_micros not set"))?,
new_deltas_count: new_deltas_count
.ok_or_else(|| anyhow::anyhow!("new_deltas_count not set"))?,
new_deltas_size: new_deltas_size
.ok_or_else(|| anyhow::anyhow!("new_deltas_size not set"))?,
})
}
}
impl Timeline {
/// Level0 files first phase of compaction, explained in the [`compact_inner`] comment.
///
/// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
/// start of level0 files compaction, the on-demand download should be revisited as well.
async fn compact_level0_phase1(
fn compact_level0_phase1(
&self,
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
target_file_size: u64,
ctx: &RequestContext,
) -> Result<CompactLevel0Phase1Result, CompactionError> {
let mut stats = CompactLevel0Phase1StatsBuilder {
version: Some(1),
tenant_id: Some(self.tenant_id),
timeline_id: Some(self.timeline_id),
..Default::default()
};
let begin = tokio::time::Instant::now();
let layers = self.layers.read().await;
let now = tokio::time::Instant::now();
stats.first_read_lock_acquisition_micros =
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
let layers = self.layers.read().unwrap();
let mut level0_deltas = layers.get_level0_deltas()?;
drop(layers);
stats.level0_deltas_count = Some(level0_deltas.len());
stats.get_level0_deltas_plus_drop_lock_micros =
stats.first_read_lock_acquisition_micros.till_now();
// Only compact if enough layers have accumulated.
let threshold = self.get_compaction_threshold();
@@ -3607,9 +3408,7 @@ impl Timeline {
// Determine N largest holes where N is number of compacted layers.
let max_holes = deltas_to_compact.len();
let last_record_lsn = self.get_last_record_lsn();
stats.time_spent_between_locks = stats.get_level0_deltas_plus_drop_lock_micros.till_now();
let layers = self.layers.read().await; // Is'n it better to hold original layers lock till here?
stats.second_read_lock_acquisition_micros = stats.time_spent_between_locks.till_now();
let layers = self.layers.read().unwrap(); // Is'n it better to hold original layers lock till here?
let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
let min_hole_coverage_size = 3; // TODO: something more flexible?
@@ -3643,11 +3442,9 @@ impl Timeline {
prev = Some(next_key.next());
}
drop(layers);
stats.second_read_lock_held_micros = stats.second_read_lock_acquisition_micros.till_now();
let mut holes = heap.into_vec();
holes.sort_unstable_by_key(|hole| hole.key_range.start);
let mut next_hole = 0; // index of next hole in holes vector
stats.sort_holes_micros = stats.second_read_lock_held_micros.till_now();
// Merge the contents of all the input delta layers into a new set
// of delta layers, based on the current partitioning.
@@ -3807,26 +3604,8 @@ impl Timeline {
layer_paths.pop().unwrap();
}
stats.write_layer_files_micros = stats.sort_holes_micros.till_now();
stats.new_deltas_count = Some(new_layers.len());
stats.new_deltas_size = Some(new_layers.iter().map(|l| l.desc.file_size).sum());
drop(all_keys_iter); // So that deltas_to_compact is no longer borrowed
match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
.and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
{
Ok(stats_json) => {
info!(
stats_json = stats_json.as_str(),
"compact_level0_phase1 stats available"
)
}
Err(e) => {
warn!("compact_level0_phase1 stats failed to serialize: {:#}", e);
}
}
Ok(CompactLevel0Phase1Result {
new_layers,
deltas_to_compact,
@@ -3843,12 +3622,21 @@ impl Timeline {
target_file_size: u64,
ctx: &RequestContext,
) -> Result<(), CompactionError> {
let this = self.clone();
let ctx_inner = ctx.clone();
let layer_removal_cs_inner = layer_removal_cs.clone();
let span = tracing::info_span!("blocking");
let CompactLevel0Phase1Result {
new_layers,
deltas_to_compact,
} = self
.compact_level0_phase1(layer_removal_cs.clone(), target_file_size, ctx)
.await?;
} = tokio::task::spawn_blocking(move || {
let _g = span.entered();
this.compact_level0_phase1(layer_removal_cs_inner, target_file_size, &ctx_inner)
})
.await
.context("compact_level0_phase1 spawn_blocking")
.map_err(CompactionError::Other)
.and_then(|res| res)?;
if new_layers.is_empty() && deltas_to_compact.is_empty() {
// nothing to do
@@ -3866,7 +3654,7 @@ impl Timeline {
.context("wait for layer upload ops to complete")?;
}
let mut layers = self.layers.write().await;
let mut layers = self.layers.write().unwrap();
let mut updates = layers.batch_update();
let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
for l in new_layers {
@@ -3953,7 +3741,6 @@ impl Timeline {
/// for example. The caller should hold `Tenant::gc_cs` lock to ensure
/// that.
///
#[instrument(skip_all, fields(timline_id=%self.timeline_id))]
pub(super) async fn update_gc_info(
&self,
retain_lsns: Vec<Lsn>,
@@ -4126,7 +3913,7 @@ impl Timeline {
// 4. newer on-disk image layers cover the layer's whole key range
//
// TODO holding a write lock is too agressive and avoidable
let mut layers = self.layers.write().await;
let mut layers = self.layers.write().unwrap();
'outer: for l in layers.iter_historic_layers() {
result.layers_total += 1;
@@ -4426,7 +4213,7 @@ impl Timeline {
// Download complete. Replace the RemoteLayer with the corresponding
// Delta- or ImageLayer in the layer map.
let mut layers = self_clone.layers.write().await;
let mut layers = self_clone.layers.write().unwrap();
let mut updates = layers.batch_update();
let new_layer = remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size);
{
@@ -4584,7 +4371,7 @@ impl Timeline {
) {
let mut downloads = Vec::new();
{
let layers = self.layers.read().await;
let layers = self.layers.read().unwrap();
layers
.iter_historic_layers()
.filter_map(|l| l.downcast_remote_layer())
@@ -4686,8 +4473,8 @@ impl LocalLayerInfoForDiskUsageEviction {
}
impl Timeline {
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
let layers = self.layers.read().await;
pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
let layers = self.layers.read().unwrap();
let mut max_layer_size: Option<u64> = None;
let mut resident_layers = Vec::new();
@@ -4759,7 +4546,7 @@ fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageRecon
// but will cause large code changes.
pub struct TimelineWriter<'a> {
tl: &'a Timeline,
_write_guard: tokio::sync::MutexGuard<'a, ()>,
_write_guard: MutexGuard<'a, ()>,
}
impl Deref for TimelineWriter<'_> {
@@ -4775,12 +4562,12 @@ impl<'a> TimelineWriter<'a> {
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
pub async fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> {
self.tl.put_value(key, lsn, value).await
pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> {
self.tl.put_value(key, lsn, value)
}
pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
self.tl.put_tombstone(key_range, lsn).await
pub fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
self.tl.put_tombstone(key_range, lsn)
}
/// Track the end of the latest digested WAL record.
@@ -4800,14 +4587,6 @@ impl<'a> TimelineWriter<'a> {
}
}
// We need TimelineWriter to be send in upcoming conversion of
// Timeline::layers to tokio::sync::RwLock.
#[test]
fn is_send() {
fn _assert_send<T: Send>() {}
_assert_send::<TimelineWriter<'_>>();
}
/// Add a suffix to a layer file's name: .{num}.old
/// Uses the first available num (starts at 0)
fn rename_to_backup(path: &Path) -> anyhow::Result<()> {

View File

@@ -197,7 +197,7 @@ impl Timeline {
// We don't want to hold the layer map lock during eviction.
// So, we just need to deal with this.
let candidates: Vec<Arc<dyn PersistentLayer>> = {
let layers = self.layers.read().await;
let layers = self.layers.read().unwrap();
let mut candidates = Vec::new();
for hist_layer in layers.iter_historic_layers() {
if hist_layer.is_remote_layer() {

View File

@@ -1324,8 +1324,7 @@ mod tests {
async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx)
.await
.create_test_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
.expect("Failed to create an empty timeline for dummy wal connection manager");
ConnectionManagerState {

View File

@@ -304,15 +304,12 @@ pub(super) async fn handle_walreceiver_connection(
}
}
timeline
.check_checkpoint_distance()
.await
.with_context(|| {
format!(
"Failed to check checkpoint distance for timeline {}",
timeline.timeline_id
)
})?;
timeline.check_checkpoint_distance().with_context(|| {
format!(
"Failed to check checkpoint distance for timeline {}",
timeline.timeline_id
)
})?;
if let Some(last_lsn) = status_update {
let timeline_remote_consistent_lsn =

View File

@@ -25,7 +25,7 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
use anyhow::{Context, Result};
use anyhow::Result;
use bytes::{Buf, Bytes, BytesMut};
use tracing::*;
@@ -333,7 +333,7 @@ impl<'a> WalIngest<'a> {
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
modification.commit().await?;
modification.commit()?;
Ok(())
}
@@ -1082,10 +1082,7 @@ impl<'a> WalIngest<'a> {
.await?
{
// create it with 0 size initially, the logic below will extend it
modification
.put_rel_creation(rel, 0, ctx)
.await
.context("Relation Error")?;
modification.put_rel_creation(rel, 0, ctx).await?;
0
} else {
self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
@@ -1174,6 +1171,7 @@ impl<'a> WalIngest<'a> {
#[cfg(test)]
mod tests {
use super::*;
use crate::pgdatadir_mapping::create_test_timeline;
use crate::tenant::harness::*;
use crate::tenant::Timeline;
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
@@ -1202,7 +1200,7 @@ mod tests {
let mut m = tline.begin_modification(Lsn(0x10));
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
m.commit().await?;
m.commit()?;
let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;
Ok(walingest)
@@ -1211,9 +1209,7 @@ mod tests {
#[tokio::test]
async fn test_relsize() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
let mut walingest = init_walingest_test(&tline, &ctx).await?;
let mut m = tline.begin_modification(Lsn(0x20));
@@ -1221,22 +1217,22 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x30));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x40));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x50));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_current_logical_size(&tline, Lsn(0x50));
@@ -1322,7 +1318,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_current_logical_size(&tline, Lsn(0x60));
// Check reported size and contents after truncation
@@ -1364,7 +1360,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
@@ -1377,7 +1373,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
@@ -1402,7 +1398,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
@@ -1432,16 +1428,14 @@ mod tests {
#[tokio::test]
async fn test_drop_extend() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
let mut walingest = init_walingest_test(&tline, &ctx).await?;
let mut m = tline.begin_modification(Lsn(0x20));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
// Check that rel exists and size is correct
assert_eq!(
@@ -1460,7 +1454,7 @@ mod tests {
// Drop rel
let mut m = tline.begin_modification(Lsn(0x30));
walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
m.commit().await?;
m.commit()?;
// Check that rel is not visible anymore
assert_eq!(
@@ -1478,7 +1472,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
// Check that rel exists and size is correct
assert_eq!(
@@ -1503,9 +1497,7 @@ mod tests {
#[tokio::test]
async fn test_truncate_extend() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
let mut walingest = init_walingest_test(&tline, &ctx).await?;
// Create a 20 MB relation (the size is arbitrary)
@@ -1517,7 +1509,7 @@ mod tests {
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
.await?;
}
m.commit().await?;
m.commit()?;
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(
@@ -1562,7 +1554,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
.await?;
m.commit().await?;
m.commit()?;
// Check reported size and contents after truncation
assert_eq!(
@@ -1611,7 +1603,7 @@ mod tests {
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
.await?;
}
m.commit().await?;
m.commit()?;
assert_eq!(
tline
@@ -1645,9 +1637,7 @@ mod tests {
#[tokio::test]
async fn test_large_rel() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
let mut walingest = init_walingest_test(&tline, &ctx).await?;
let mut lsn = 0x10;
@@ -1658,7 +1648,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
.await?;
m.commit().await?;
m.commit()?;
}
assert_current_logical_size(&tline, Lsn(lsn));
@@ -1674,7 +1664,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE
@@ -1687,7 +1677,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE - 1
@@ -1703,7 +1693,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
size as BlockNumber

View File

@@ -8,7 +8,6 @@ OBJS = \
libpagestore.o \
libpqwalproposer.o \
neon.o \
access_stat.o \
pagestore_smgr.o \
relsize_cache.o \
walproposer.o \

View File

@@ -1,274 +0,0 @@
/*
* We want this statistic to rpresent current access patern mthis is why when
* (n_seq_accesses + n_rnd_accesses) > MAX_ACCESS_COUNTER then we divide both counters by two,
* so decreasng weight of historical data
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "common/hashfn.h"
#include "pagestore_client.h"
#include "storage/relfilenode.h"
#include "utils/guc.h"
/* Structure used to predict sequential access */
typedef struct AccessStatEntry {
RelFileNode relnode;
BlockNumber blkno; /* last accessed black number */
uint32 n_seq_accesses; /* number of sequential accesses (when block N+1 is accessed after block N) */
uint32 n_rnd_accesses; /* number of random accesses */
uint32 hash;
uint32 status;
uint64 access_count; /* total number of relation accesses since backend start */
dlist_node lru_node; /* LRU list node */
} AccessStatEntry;
#define SH_PREFIX as
#define SH_ELEMENT_TYPE AccessStatEntry
#define SH_KEY_TYPE RelFileNode
#define SH_KEY relnode
#define SH_STORE_HASH
#define SH_GET_HASH(tb, a) ((a)->hash)
#define SH_HASH_KEY(tb, key) hash_bytes( \
((const unsigned char *) &(key)), \
sizeof(RelFileNode) \
)
#define SH_EQUAL(tb, a, b) RelFileNodeEquals((a), (b))
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"
static as_hash *hash;
static dlist_head lru;
static int max_access_stat_size;
static int max_access_stat_count;
static double min_seq_access_ratio;
static int min_seq_access_count;
void access_stat_init(void)
{
MemoryContext memctx = AllocSetContextCreate(TopMemoryContext,
"NeonSMGR/access_stat",
ALLOCSET_DEFAULT_SIZES);
DefineCustomIntVariable("neon.max_access_stat_size",
"Maximal size of Neon relation access statistic hash",
NULL,
&max_access_stat_size,
1024,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.max_access_stat_count",
"Maximal value of relation access counter after which counters are divided by 2",
NULL,
&max_access_stat_count,
1024,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
DefineCustomRealVariable("neon.min_seq_access_ratio",
"Minimal seq/(rnd+seq) ratio to determine sequential access",
NULL,
&min_seq_access_ratio,
0.9,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.min_seq_access_count",
"Minimal access count to determine sequetial access",
NULL,
&min_seq_access_count,
10,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
hash = as_create(memctx, max_access_stat_size, NULL);
dlist_init(&lru);
}
bool is_sequential_access(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
{
bool is_seq_access = false;
if (forkNum == MAIN_FORKNUM /* prefetch makes sense only for main fork */
&& max_access_stat_size != 0)
{
AccessStatEntry* entry = as_lookup(hash, rnode);
if (entry == NULL)
{
bool found;
/* New item */
while (hash->members >= max_access_stat_size)
{
/* Hash overflow: find candidate for replacement */
AccessStatEntry* victim = dlist_container(AccessStatEntry, lru_node, dlist_pop_head_node(&lru));
as_delete_item(hash, victim);
}
entry = as_insert(hash, rnode, &found);
Assert(!found);
/* Set both counter to zero because we don't know whethr first access is sequential or random */
entry->n_seq_accesses = 0;
entry->n_rnd_accesses = 0;
entry->access_count = 1;
}
else
{
uint32 access_count = entry->n_seq_accesses + entry->n_rnd_accesses;
/*
* We want this function to represent most recent access pattern,
* so when number of accesses exceed threashold value `max_access_stat_count`
* we divide bother coutners by two devaluing old data
*/
if (access_count >= max_access_stat_count)
{
entry->n_seq_accesses >>= 1;
entry->n_rnd_accesses >>= 1;
}
if (entry->blkno+1 == blkno)
entry->n_seq_accesses += 1;
else
entry->n_rnd_accesses += 1;
entry->access_count += 1;
access_count = entry->n_seq_accesses + entry->n_rnd_accesses;
is_seq_access = access_count >= min_seq_access_count
&& (double)entry->n_seq_accesses / access_count >= min_seq_access_ratio;
/* Remove entry from LRU list tobe able to insert it to the end of this list */
dlist_delete(&entry->lru_node);
}
/* Place entry to the tail of LRU list */
dlist_push_tail(&lru, &entry->lru_node);
entry->blkno = blkno;
}
return is_seq_access;
}
/*
* Get relation access pattern
*/
PG_FUNCTION_INFO_V1(get_relation_access_statistics);
typedef struct
{
TupleDesc tupdesc;
dlist_node* curr;
} AccessStatContext;
#define NUM_ACCESS_STAT_COLUMNS 6
Datum
get_relation_access_statistics(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
Datum result;
MemoryContext oldcontext;
AccessStatContext *fctx; /* User function context. */
TupleDesc tupledesc;
TupleDesc expected_tupledesc;
HeapTuple tuple;
if (SRF_IS_FIRSTCALL())
{
funcctx = SRF_FIRSTCALL_INIT();
/* Switch context when allocating stuff to be used in later calls */
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
/* Create a user function context for cross-call persistence */
fctx = (AccessStatContext *) palloc(sizeof(AccessStatContext));
/*
* To smoothly support upgrades from version 1.0 of this extension
* transparently handle the (non-)existence of the pinning_backends
* column. We unfortunately have to get the result type for that... -
* we can't use the result type determined by the function definition
* without potentially crashing when somebody uses the old (or even
* wrong) function definition though.
*/
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
if (expected_tupledesc->natts != NUM_ACCESS_STAT_COLUMNS)
elog(ERROR, "incorrect number of output arguments");
/* Construct a tuple descriptor for the result rows. */
tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
TupleDescInitEntry(tupledesc, (AttrNumber) 1, "relfilenode",
OIDOID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 2, "reltablespace",
OIDOID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reldatabase",
OIDOID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 4, "seqaccess",
INT4OID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 5, "rndaccess",
INT4OID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 6, "accesscnt",
INT8OID, -1, 0);
fctx->tupdesc = BlessTupleDesc(tupledesc);
fctx->curr = dlist_is_empty(&lru) ? NULL : dlist_tail_node(&lru);
/* Set max calls and remember the user function context. */
funcctx->max_calls = hash->members;
funcctx->user_fctx = fctx;
/* Return to original context when allocating transient memory */
MemoryContextSwitchTo(oldcontext);
}
funcctx = SRF_PERCALL_SETUP();
/* Get the saved state */
fctx = funcctx->user_fctx;
if (fctx->curr)
{
AccessStatEntry* entry = dlist_container(AccessStatEntry, lru_node, fctx->curr);
Datum values[NUM_ACCESS_STAT_COLUMNS];
bool nulls[NUM_ACCESS_STAT_COLUMNS] = {
false, false, false, false, false, false
};
values[0] = ObjectIdGetDatum(entry->relnode.relNode);
values[1] = ObjectIdGetDatum(entry->relnode.spcNode);
values[2] = ObjectIdGetDatum(entry->relnode.dbNode);
values[3] = Int32GetDatum(entry->n_seq_accesses);
values[4] = Int32GetDatum(entry->n_rnd_accesses);
values[5] = Int64GetDatum(entry->access_count);
/* Build and return the tuple. */
tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
result = HeapTupleGetDatum(tuple);
fctx->curr = dlist_has_prev(&lru, fctx->curr) ? dlist_prev_node(&lru, fctx->curr) : NULL;
SRF_RETURN_NEXT(funcctx, result);
}
else
SRF_RETURN_DONE(funcctx);
}

View File

@@ -58,7 +58,6 @@ char *neon_auth_token;
int n_unflushed_requests = 0;
int flush_every_n_requests = 8;
int readahead_buffer_size = 128;
int readahead_distance = 10;
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
@@ -453,18 +452,6 @@ pg_init_libpagestore(void)
PGC_USERSET,
0, /* no flags required */
NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL);
DefineCustomIntVariable("neon.readahead_distance",
"Number of read-ahead blocks",
NULL,
&readahead_distance,
10,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
relsize_hash_init();
@@ -489,5 +476,4 @@ pg_init_libpagestore(void)
redo_read_buffer_filter = neon_redo_read_buffer_filter;
}
lfc_init();
access_stat_init();
}

View File

@@ -27,18 +27,8 @@ RETURNS SETOF RECORD
AS 'MODULE_PATHNAME', 'local_cache_pages'
LANGUAGE C PARALLEL SAFE;
CREATE FUNCTION get_relation_access_statistics()
RETURNS SETOF RECORD
AS 'MODULE_PATHNAME', 'get_relation_access_statistics'
LANGUAGE C PARALLEL SAFE;
-- Create a view for convenient access.
CREATE VIEW local_cache AS
SELECT relname,P.* FROM local_cache_pages() AS P
SELECT P.* FROM local_cache_pages() AS P
(pageoffs int8, relfilenode oid, reltablespace oid, reldatabase oid,
relforknumber int2, relblocknumber int8, accesscount int4) JOIN pg_class pc ON (P.relfilenode = pc.relfilenode);
CREATE VIEW relation_access_statistics AS
SELECT relname,P.* FROM get_relation_access_statistics() AS P
(relfilenode oid, reltablespace oid, reldatabase oid,
seqaccess int4, rndaccess int4, access_count int8) JOIN pg_class pc ON (P.relfilenode = pc.relfilenode);
relforknumber int2, relblocknumber int8, accesscount int4);

View File

@@ -157,7 +157,6 @@ extern page_server_api * page_server;
extern char *page_server_connstring;
extern int flush_every_n_requests;
extern int readahead_buffer_size;
extern int readahead_distance;
extern bool seqscan_prefetch_enabled;
extern int seqscan_prefetch_distance;
extern char *neon_timeline;
@@ -211,8 +210,5 @@ extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumbe
extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
extern void lfc_init(void);
/* Access statistic */
extern void access_stat_init(void);
extern bool is_sequential_access(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
#endif

View File

@@ -1881,7 +1881,6 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
XLogWaitForReplayOf(request_lsn);
/*
* Try to find prefetched page in the list of received pages.
*/
@@ -2004,10 +2003,6 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/* If it is expected to be sequential access then initiate prefetch of next block */
if (is_sequential_access(reln->smgr_rnode.node, forkNum, blkno))
neon_prefetch(reln, forkNum, blkno + readahead_distance);
/* Try to read from local file cache */
if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer))
{

View File

@@ -257,7 +257,7 @@ nwp_register_gucs(void)
"Walproposer reconnects to offline safekeepers once in this interval.",
NULL,
&wal_acceptor_reconnect_timeout,
1000, 0, INT_MAX, /* default, min, max */
5000, 0, INT_MAX, /* default, min, max */
PGC_SIGHUP, /* context */
GUC_UNIT_MS, /* flags */
NULL, NULL, NULL);

56
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry and should not be changed by hand.
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -855,31 +855,35 @@ files = [
[[package]]
name = "cryptography"
version = "41.0.0"
version = "39.0.1"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
category = "main"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.6"
files = [
{file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:3c5ef25d060c80d6d9f7f9892e1d41bb1c79b78ce74805b8cb4aa373cb7d5ec8"},
{file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8362565b3835ceacf4dc8f3b56471a2289cf51ac80946f9087e66dc283a810e0"},
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3680248309d340fda9611498a5319b0193a8dbdb73586a1acf8109d06f25b92d"},
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84a165379cb9d411d58ed739e4af3396e544eac190805a54ba2e0322feb55c46"},
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:4ab14d567f7bbe7f1cdff1c53d5324ed4d3fc8bd17c481b395db224fb405c237"},
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9f65e842cb02550fac96536edb1d17f24c0a338fd84eaf582be25926e993dde4"},
{file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:b7f2f5c525a642cecad24ee8670443ba27ac1fab81bba4cc24c7b6b41f2d0c75"},
{file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:7d92f0248d38faa411d17f4107fc0bce0c42cae0b0ba5415505df72d751bf62d"},
{file = "cryptography-41.0.0-cp37-abi3-win32.whl", hash = "sha256:34d405ea69a8b34566ba3dfb0521379b210ea5d560fafedf9f800a9a94a41928"},
{file = "cryptography-41.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:344c6de9f8bda3c425b3a41b319522ba3208551b70c2ae00099c205f0d9fd3be"},
{file = "cryptography-41.0.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:88ff107f211ea696455ea8d911389f6d2b276aabf3231bf72c8853d22db755c5"},
{file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b846d59a8d5a9ba87e2c3d757ca019fa576793e8758174d3868aecb88d6fc8eb"},
{file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f5d0bf9b252f30a31664b6f64432b4730bb7038339bd18b1fafe129cfc2be9be"},
{file = "cryptography-41.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5c1f7293c31ebc72163a9a0df246f890d65f66b4a40d9ec80081969ba8c78cc9"},
{file = "cryptography-41.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bf8fc66012ca857d62f6a347007e166ed59c0bc150cefa49f28376ebe7d992a2"},
{file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a4fc68d1c5b951cfb72dfd54702afdbbf0fb7acdc9b7dc4301bbf2225a27714d"},
{file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14754bcdae909d66ff24b7b5f166d69340ccc6cb15731670435efd5719294895"},
{file = "cryptography-41.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0ddaee209d1cf1f180f1efa338a68c4621154de0afaef92b89486f5f96047c55"},
{file = "cryptography-41.0.0.tar.gz", hash = "sha256:6b71f64beeea341c9b4f963b48ee3b62d62d57ba93eb120e1196b31dc1025e78"},
{file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965"},
{file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106"},
{file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c"},
{file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4"},
{file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"},
{file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"},
{file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"},
{file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef"},
{file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885"},
{file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a"},
{file = "cryptography-39.0.1.tar.gz", hash = "sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695"},
]
[package.dependencies]
@@ -888,12 +892,12 @@ cffi = ">=1.12"
[package.extras]
docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
nox = ["nox"]
pep8test = ["black", "check-sdist", "mypy", "ruff"]
sdist = ["build"]
pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"]
sdist = ["setuptools-rust (>=0.11.4)"]
ssh = ["bcrypt (>=3.1.5)"]
test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"]
test-randomorder = ["pytest-randomly"]
tox = ["tox"]
[[package]]
name = "docker"

View File

@@ -1,6 +1,5 @@
use futures::pin_mut;
use futures::StreamExt;
use futures::TryFutureExt;
use hyper::body::HttpBody;
use hyper::http::HeaderName;
use hyper::http::HeaderValue;
@@ -12,13 +11,8 @@ use serde_json::Value;
use tokio_postgres::types::Kind;
use tokio_postgres::types::Type;
use tokio_postgres::Row;
use tracing::error;
use tracing::info;
use tracing::instrument;
use url::Url;
use crate::proxy::invalidate_cache;
use crate::proxy::NUM_RETRIES_WAKE_COMPUTE;
use crate::{auth, config::ProxyConfig, console};
#[derive(serde::Deserialize)]
@@ -96,17 +90,10 @@ fn json_array_to_pg_array(value: &Value) -> Result<Option<String>, serde_json::E
}
}
struct ConnInfo {
username: String,
dbname: String,
hostname: String,
password: String,
}
fn get_conn_info(
headers: &HeaderMap,
sni_hostname: Option<String>,
) -> Result<ConnInfo, anyhow::Error> {
) -> Result<(String, String, String, String), anyhow::Error> {
let connection_string = headers
.get("Neon-Connection-String")
.ok_or(anyhow::anyhow!("missing connection string"))?
@@ -159,12 +146,12 @@ fn get_conn_info(
}
}
Ok(ConnInfo {
username: username.to_owned(),
dbname: dbname.to_owned(),
hostname: hostname.to_owned(),
password: password.to_owned(),
})
Ok((
username.to_owned(),
dbname.to_owned(),
hostname.to_owned(),
password.to_owned(),
))
}
// TODO: return different http error codes
@@ -177,10 +164,10 @@ pub async fn handle(
// Determine the destination and connection params
//
let headers = request.headers();
let conn_info = get_conn_info(headers, sni_hostname)?;
let (username, dbname, hostname, password) = get_conn_info(headers, sni_hostname)?;
let credential_params = StartupMessageParams::new([
("user", &conn_info.username),
("database", &conn_info.dbname),
("user", &username),
("database", &dbname),
("application_name", APP_NAME),
]);
@@ -199,20 +186,21 @@ pub async fn handle(
let creds = config
.auth_backend
.as_ref()
.map(|_| {
auth::ClientCredentials::parse(
&credential_params,
Some(&conn_info.hostname),
common_names,
)
})
.map(|_| auth::ClientCredentials::parse(&credential_params, Some(&hostname), common_names))
.transpose()?;
let extra = console::ConsoleReqExtra {
session_id: uuid::Uuid::new_v4(),
application_name: Some(APP_NAME),
};
let mut node_info = creds.wake_compute(&extra).await?.expect("msg");
let node = creds.wake_compute(&extra).await?.expect("msg");
let conf = node.value.config;
let port = *conf.get_ports().first().expect("no port");
let host = match conf.get_hosts().first().expect("no host") {
tokio_postgres::config::Host::Tcp(host) => host,
tokio_postgres::config::Host::Unix(_) => {
return Err(anyhow::anyhow!("unix socket is not supported"));
}
};
let request_content_length = match request.body().size_hint().upper() {
Some(v) => v,
@@ -232,10 +220,28 @@ pub async fn handle(
let QueryData { query, params } = serde_json::from_slice(&body)?;
let query_params = json_to_pg_text(params)?;
//
// Connenct to the destination
//
let (client, connection) = tokio_postgres::Config::new()
.host(host)
.port(port)
.user(&username)
.password(&password)
.dbname(&dbname)
.max_backend_message_size(MAX_RESPONSE_SIZE)
.connect(tokio_postgres::NoTls)
.await?;
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
//
// Now execute the query and return the result
//
let client = connect_to_compute(&mut node_info, &extra, &creds, &conn_info).await?;
let row_stream = client.query_raw_txt(query, query_params).await?;
// Manually drain the stream into a vector to leave row_stream hanging
@@ -274,11 +280,6 @@ pub async fn handle(
json!({
"name": Value::String(c.name().to_owned()),
"dataTypeID": Value::Number(c.type_().oid().into()),
"tableID": c.table_oid(),
"columnID": c.column_id(),
"dataTypeSize": c.type_size(),
"dataTypeModifier": c.type_modifier(),
"format": "text",
})
})
.collect::<Vec<_>>()
@@ -302,70 +303,6 @@ pub async fn handle(
}))
}
/// This function is a copy of `connect_to_compute` from `src/proxy.rs` with
/// the difference that it uses `tokio_postgres` for the connection.
#[instrument(skip_all)]
async fn connect_to_compute(
node_info: &mut console::CachedNodeInfo,
extra: &console::ConsoleReqExtra<'_>,
creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
conn_info: &ConnInfo,
) -> anyhow::Result<tokio_postgres::Client> {
let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE;
loop {
match connect_to_compute_once(node_info, conn_info).await {
Err(e) if num_retries > 0 => {
info!("compute node's state has changed; requesting a wake-up");
match creds.wake_compute(extra).await? {
// Update `node_info` and try one more time.
Some(new) => {
*node_info = new;
}
// Link auth doesn't work that way, so we just exit.
None => return Err(e),
}
}
other => return other,
}
num_retries -= 1;
info!("retrying after wake-up ({num_retries} attempts left)");
}
}
async fn connect_to_compute_once(
node_info: &console::CachedNodeInfo,
conn_info: &ConnInfo,
) -> anyhow::Result<tokio_postgres::Client> {
let mut config = (*node_info.config).clone();
let (client, connection) = config
.user(&conn_info.username)
.password(&conn_info.password)
.dbname(&conn_info.dbname)
.max_backend_message_size(MAX_RESPONSE_SIZE)
.connect(tokio_postgres::NoTls)
.inspect_err(|e: &tokio_postgres::Error| {
error!(
"failed to connect to compute node hosts={:?} ports={:?}: {}",
node_info.config.get_hosts(),
node_info.config.get_ports(),
e
);
invalidate_cache(node_info)
})
.await?;
tokio::spawn(async move {
if let Err(e) = connection.await {
error!("connection error: {}", e);
}
});
Ok(client)
}
//
// Convert postgres row with text-encoded values to JSON object
//

View File

@@ -26,6 +26,7 @@ use tls_listener::TlsListener;
use tokio::{
io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf},
net::TcpListener,
select,
};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, warn, Instrument};
@@ -192,9 +193,14 @@ async fn ws_handler(
// TODO: that deserves a refactor as now this function also handles http json client besides websockets.
// Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
let result = sql_over_http::handle(config, request, sni_hostname)
.instrument(info_span!("sql-over-http"))
.await;
let result = select! {
_ = tokio::time::sleep(std::time::Duration::from_secs(10)) => {
Err(anyhow::anyhow!("Query timed out"))
}
response = sql_over_http::handle(config, request, sni_hostname) => {
response
}
};
let status_code = match result {
Ok(_) => StatusCode::OK,
Err(_) => StatusCode::BAD_REQUEST,

View File

@@ -22,7 +22,7 @@ use tracing::{error, info, warn};
use utils::measured_stream::MeasuredStream;
/// Number of times we should retry the `/proxy_wake_compute` http request.
pub const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";
@@ -283,35 +283,34 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
}
/// If we couldn't connect, a cached connection info might be to blame
/// (e.g. the compute node's address might've changed at the wrong time).
/// Invalidate the cache entry (if any) to prevent subsequent errors.
#[tracing::instrument(name = "invalidate_cache", skip_all)]
pub fn invalidate_cache(node_info: &console::CachedNodeInfo) {
let is_cached = node_info.cached();
if is_cached {
warn!("invalidating stalled compute node info cache entry");
node_info.invalidate();
}
let label = match is_cached {
true => "compute_cached",
false => "compute_uncached",
};
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
}
/// Try to connect to the compute node once.
#[tracing::instrument(name = "connect_once", skip_all)]
async fn connect_to_compute_once(
node_info: &console::CachedNodeInfo,
) -> Result<PostgresConnection, compute::ConnectionError> {
// If we couldn't connect, a cached connection info might be to blame
// (e.g. the compute node's address might've changed at the wrong time).
// Invalidate the cache entry (if any) to prevent subsequent errors.
let invalidate_cache = |_: &compute::ConnectionError| {
let is_cached = node_info.cached();
if is_cached {
warn!("invalidating stalled compute node info cache entry");
node_info.invalidate();
}
let label = match is_cached {
true => "compute_cached",
false => "compute_uncached",
};
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
};
let allow_self_signed_compute = node_info.allow_self_signed_compute;
node_info
.config
.connect(allow_self_signed_compute)
.inspect_err(|_: &compute::ConnectionError| invalidate_cache(node_info))
.inspect_err(invalidate_cache)
.await
}

View File

@@ -3,19 +3,15 @@
//
use anyhow::{bail, Context, Result};
use clap::Parser;
use futures::future::BoxFuture;
use futures::stream::FuturesUnordered;
use futures::{FutureExt, StreamExt};
use remote_storage::RemoteStorageConfig;
use tokio::runtime::Handle;
use tokio::signal::unix::{signal, SignalKind};
use tokio::task::JoinError;
use toml_edit::Document;
use utils::signals::ShutdownSignals;
use std::fs::{self, File};
use std::io::{ErrorKind, Write};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::thread;
use std::time::Duration;
use storage_broker::Uri;
use tokio::sync::mpsc;
@@ -24,21 +20,22 @@ use tracing::*;
use utils::pid_file;
use metrics::set_build_info_metric;
use safekeeper::broker;
use safekeeper::control_file;
use safekeeper::defaults::{
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
DEFAULT_PG_LISTEN_ADDR,
};
use safekeeper::http;
use safekeeper::remove_wal;
use safekeeper::wal_backup;
use safekeeper::wal_service;
use safekeeper::GlobalTimelines;
use safekeeper::SafeKeeperConf;
use safekeeper::{broker, WAL_SERVICE_RUNTIME};
use safekeeper::{control_file, BROKER_RUNTIME};
use safekeeper::{http, WAL_REMOVER_RUNTIME};
use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
use safekeeper::{wal_backup, HTTP_RUNTIME};
use storage_broker::DEFAULT_ENDPOINT;
use utils::auth::JwtAuth;
use utils::{
http::endpoint,
id::NodeId,
logging::{self, LogFormat},
project_git_version,
@@ -107,6 +104,10 @@ struct Args {
/// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes
#[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)]
max_offloader_lag: u64,
/// Number of threads for wal backup runtime, by default number of cores
/// available to the system.
#[arg(long)]
wal_backup_threads: Option<usize>,
/// Number of max parallel WAL segments to be offloaded to remote storage.
#[arg(long, default_value = "5")]
wal_backup_parallel_jobs: usize,
@@ -120,14 +121,9 @@ struct Args {
/// Format for logging, either 'plain' or 'json'.
#[arg(long, default_value = "plain")]
log_format: String,
/// Run everything in single threaded current thread runtime, might be
/// useful for debugging.
#[arg(long)]
current_thread_runtime: bool,
}
#[tokio::main(flavor = "current_thread")]
async fn main() -> anyhow::Result<()> {
fn main() -> anyhow::Result<()> {
let args = Args::parse();
if let Some(addr) = args.dump_control_file {
@@ -187,10 +183,10 @@ async fn main() -> anyhow::Result<()> {
heartbeat_timeout: args.heartbeat_timeout,
remote_storage: args.remote_storage,
max_offloader_lag_bytes: args.max_offloader_lag,
backup_runtime_threads: args.wal_backup_threads,
wal_backup_enabled: !args.disable_wal_backup,
backup_parallel_jobs: args.wal_backup_parallel_jobs,
auth,
current_thread_runtime: args.current_thread_runtime,
};
// initialize sentry if SENTRY_DSN is provided
@@ -198,14 +194,10 @@ async fn main() -> anyhow::Result<()> {
Some(GIT_VERSION.into()),
&[("node_id", &conf.my_id.to_string())],
);
start_safekeeper(conf).await
start_safekeeper(conf)
}
/// Result of joining any of main tasks: upper error means task failed to
/// complete, e.g. panicked, inner is error produced by task itself.
type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
// Prevent running multiple safekeepers on the same directory
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
let lock_file =
@@ -216,18 +208,14 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
// we need to release the lock file only when the current process is gone
std::mem::forget(lock_file);
info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
e
})?;
info!(
"starting safekeeper HTTP service on {}",
conf.listen_http_addr
);
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
info!("starting safekeeper on {}", conf.listen_pg_addr);
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
e
})?;
@@ -236,88 +224,71 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
metrics::register_internal(Box::new(timeline_collector))?;
let mut threads = vec![];
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
// Load all timelines from disk to memory.
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
// Keep handles to main tasks to die if any of them disappears.
let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
FuturesUnordered::new();
let conf_ = conf.clone();
threads.push(
thread::Builder::new()
.name("http_endpoint_thread".into())
.spawn(|| {
let router = http::make_router(conf_);
endpoint::serve_thread_main(
router,
http_listener,
std::future::pending(), // never shut down
)
.unwrap();
})?,
);
let conf_cloned = conf.clone();
let safekeeper_thread = thread::Builder::new()
.name("WAL service thread".into())
.spawn(|| wal_service::thread_main(conf_cloned, pg_listener))
.unwrap();
threads.push(safekeeper_thread);
let conf_ = conf.clone();
// Run everything in current thread rt, if asked.
if conf.current_thread_runtime {
info!("running in current thread runtime");
}
let current_thread_rt = conf
.current_thread_runtime
.then(|| Handle::try_current().expect("no runtime in main"));
let wal_service_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
.spawn(wal_service::task_main(conf_, pg_listener))
// wrap with task name for error reporting
.map(|res| ("WAL service main".to_owned(), res));
tasks_handles.push(Box::pin(wal_service_handle));
threads.push(
thread::Builder::new()
.name("broker thread".into())
.spawn(|| {
broker::thread_main(conf_);
})?,
);
let conf_ = conf.clone();
let http_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| HTTP_RUNTIME.handle())
.spawn(http::task_main(conf_, http_listener))
.map(|res| ("HTTP service main".to_owned(), res));
tasks_handles.push(Box::pin(http_handle));
threads.push(
thread::Builder::new()
.name("WAL removal thread".into())
.spawn(|| {
remove_wal::thread_main(conf_);
})?,
);
let conf_ = conf.clone();
let broker_task_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| BROKER_RUNTIME.handle())
.spawn(broker::task_main(conf_).instrument(info_span!("broker")))
.map(|res| ("broker main".to_owned(), res));
tasks_handles.push(Box::pin(broker_task_handle));
let conf_ = conf.clone();
let wal_remover_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
.spawn(remove_wal::task_main(conf_))
.map(|res| ("WAL remover".to_owned(), res));
tasks_handles.push(Box::pin(wal_remover_handle));
let conf_ = conf.clone();
let wal_backup_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
.spawn(wal_backup::wal_backup_launcher_task_main(
conf_,
wal_backup_launcher_rx,
))
.map(|res| ("WAL backup launcher".to_owned(), res));
tasks_handles.push(Box::pin(wal_backup_handle));
threads.push(
thread::Builder::new()
.name("WAL backup launcher thread".into())
.spawn(move || {
wal_backup::wal_backup_launcher_thread_main(conf, wal_backup_launcher_rx);
})?,
);
set_build_info_metric(GIT_VERSION);
// TODO: put more thoughts into handling of failed threads
// We should catch & die if they are in trouble.
// TODO: update tokio-stream, convert to real async Stream with
// SignalStream, map it to obtain missing signal name, combine streams into
// single stream we can easily sit on.
let mut sigquit_stream = signal(SignalKind::quit())?;
let mut sigint_stream = signal(SignalKind::interrupt())?;
let mut sigterm_stream = signal(SignalKind::terminate())?;
tokio::select! {
Some((task_name, res)) = tasks_handles.next()=> {
error!("{} task failed: {:?}, exiting", task_name, res);
std::process::exit(1);
}
// On any shutdown signal, log receival and exit. Additionally, handling
// SIGQUIT prevents coredump.
_ = sigquit_stream.recv() => info!("received SIGQUIT, terminating"),
_ = sigint_stream.recv() => info!("received SIGINT, terminating"),
_ = sigterm_stream.recv() => info!("received SIGTERM, terminating")
};
std::process::exit(0);
// On any shutdown signal, log receival and exit. Additionally, handling
// SIGQUIT prevents coredump.
ShutdownSignals::handle(|signal| {
info!("received {}, terminating", signal.name());
std::process::exit(0);
})
}
/// Determine safekeeper id.

View File

@@ -8,7 +8,7 @@ use anyhow::Error;
use anyhow::Result;
use storage_broker::parse_proto_ttid;
use storage_broker::proto::broker_service_client::BrokerServiceClient;
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
use storage_broker::Request;
@@ -16,7 +16,7 @@ use storage_broker::Request;
use std::time::Duration;
use std::time::Instant;
use tokio::task::JoinHandle;
use tokio::time::sleep;
use tokio::{runtime, time::sleep};
use tracing::*;
use crate::metrics::BROKER_ITERATION_TIMELINES;
@@ -29,10 +29,23 @@ use crate::SafeKeeperConf;
const RETRY_INTERVAL_MSEC: u64 = 1000;
const PUSH_INTERVAL_MSEC: u64 = 1000;
pub fn thread_main(conf: SafeKeeperConf) {
let runtime = runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let _enter = info_span!("broker").entered();
info!("started, broker endpoint {:?}", conf.broker_endpoint);
runtime.block_on(async {
main_loop(conf).await;
});
}
/// Push once in a while data about all active timelines to the broker.
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
let mut client =
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
let mut client = BrokerServiceClient::connect(conf.broker_endpoint.clone()).await?;
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
let outbound = async_stream::stream! {
@@ -42,27 +55,20 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
// sensitive and there is no risk of deadlock as we don't await while
// lock is held.
let now = Instant::now();
let all_tlis = GlobalTimelines::get_all();
let mut n_pushed_tlis = 0;
for tli in &all_tlis {
// filtering alternative futures::stream::iter(all_tlis)
// .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::<Vec<_>>().await;
// doesn't look better, and I'm not sure how to do that without collect.
if !tli.is_active().await {
continue;
}
let sk_info = tli.get_safekeeper_info(&conf).await;
let mut active_tlis = GlobalTimelines::get_all();
active_tlis.retain(|tli| tli.is_active());
for tli in &active_tlis {
let sk_info = tli.get_safekeeper_info(&conf);
yield sk_info;
BROKER_PUSHED_UPDATES.inc();
n_pushed_tlis += 1;
}
let elapsed = now.elapsed();
BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64());
BROKER_ITERATION_TIMELINES.observe(n_pushed_tlis as f64);
BROKER_ITERATION_TIMELINES.observe(active_tlis.len() as f64);
if elapsed > push_interval / 2 {
info!("broker push is too long, pushed {} timeline updates to broker in {:?}", n_pushed_tlis, elapsed);
info!("broker push is too long, pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed);
}
sleep(push_interval).await;
@@ -119,13 +125,10 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
bail!("end of stream");
}
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
info!("started, broker endpoint {:?}", conf.broker_endpoint);
async fn main_loop(conf: SafeKeeperConf) {
let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
let mut push_handle: Option<JoinHandle<Result<(), Error>>> = None;
let mut pull_handle: Option<JoinHandle<Result<(), Error>>> = None;
// Selecting on JoinHandles requires some squats; is there a better way to
// reap tasks individually?

View File

@@ -2,10 +2,9 @@
use anyhow::{bail, ensure, Context, Result};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use tokio::fs::{self, File};
use tokio::io::AsyncWriteExt;
use std::io::Read;
use std::fs::{self, File, OpenOptions};
use std::io::{Read, Write};
use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::time::Instant;
@@ -27,10 +26,9 @@ pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
/// Storage should keep actual state inside of it. It should implement Deref
/// trait to access state fields and have persist method for updating that state.
#[async_trait::async_trait]
pub trait Storage: Deref<Target = SafeKeeperState> {
/// Persist safekeeper state on disk and update internal state.
async fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;
fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;
/// Timestamp of last persist.
fn last_persist_at(&self) -> Instant;
@@ -84,7 +82,7 @@ impl FileStorage {
/// Check the magic/version in the on-disk data and deserialize it, if possible.
fn deser_sk_state(buf: &mut &[u8]) -> Result<SafeKeeperState> {
// Read the version independent part
let magic = ReadBytesExt::read_u32::<LittleEndian>(buf)?;
let magic = buf.read_u32::<LittleEndian>()?;
if magic != SK_MAGIC {
bail!(
"bad control file magic: {:X}, expected {:X}",
@@ -92,7 +90,7 @@ impl FileStorage {
SK_MAGIC
);
}
let version = ReadBytesExt::read_u32::<LittleEndian>(buf)?;
let version = buf.read_u32::<LittleEndian>()?;
if version == SK_FORMAT_VERSION {
let res = SafeKeeperState::des(buf)?;
return Ok(res);
@@ -112,7 +110,7 @@ impl FileStorage {
/// Read in the control file.
pub fn load_control_file<P: AsRef<Path>>(control_file_path: P) -> Result<SafeKeeperState> {
let mut control_file = std::fs::OpenOptions::new()
let mut control_file = OpenOptions::new()
.read(true)
.write(true)
.open(&control_file_path)
@@ -161,31 +159,30 @@ impl Deref for FileStorage {
}
}
#[async_trait::async_trait]
impl Storage for FileStorage {
/// persists state durably to underlying storage
/// for description see https://lwn.net/Articles/457667/
async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer();
// write data to safekeeper.control.partial
let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL);
let mut control_partial = File::create(&control_partial_path).await.with_context(|| {
let mut control_partial = File::create(&control_partial_path).with_context(|| {
format!(
"failed to create partial control file at: {}",
&control_partial_path.display()
)
})?;
let mut buf: Vec<u8> = Vec::new();
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
buf.write_u32::<LittleEndian>(SK_MAGIC)?;
buf.write_u32::<LittleEndian>(SK_FORMAT_VERSION)?;
s.ser_into(&mut buf)?;
// calculate checksum before resize
let checksum = crc32c::crc32c(&buf);
buf.extend_from_slice(&checksum.to_le_bytes());
control_partial.write_all(&buf).await.with_context(|| {
control_partial.write_all(&buf).with_context(|| {
format!(
"failed to write safekeeper state into control file at: {}",
control_partial_path.display()
@@ -194,7 +191,7 @@ impl Storage for FileStorage {
// fsync the file
if !self.conf.no_sync {
control_partial.sync_all().await.with_context(|| {
control_partial.sync_all().with_context(|| {
format!(
"failed to sync partial control file at {}",
control_partial_path.display()
@@ -205,22 +202,21 @@ impl Storage for FileStorage {
let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
// rename should be atomic
fs::rename(&control_partial_path, &control_path).await?;
fs::rename(&control_partial_path, &control_path)?;
// this sync is not required by any standard but postgres does this (see durable_rename)
if !self.conf.no_sync {
let new_f = File::open(&control_path).await?;
new_f.sync_all().await.with_context(|| {
format!(
"failed to sync control file at: {}",
&control_path.display()
)
})?;
File::open(&control_path)
.and_then(|f| f.sync_all())
.with_context(|| {
format!(
"failed to sync control file at: {}",
&control_path.display()
)
})?;
// fsync the directory (linux specific)
let tli_dir = File::open(&self.timeline_dir).await?;
tli_dir
.sync_all()
.await
File::open(&self.timeline_dir)
.and_then(|f| f.sync_all())
.context("failed to sync control file directory")?;
}
@@ -240,6 +236,7 @@ mod test {
use super::*;
use crate::{safekeeper::SafeKeeperState, SafeKeeperConf};
use anyhow::Result;
use std::fs;
use utils::{id::TenantTimelineId, lsn::Lsn};
fn stub_conf() -> SafeKeeperConf {
@@ -250,75 +247,59 @@ mod test {
}
}
async fn load_from_control_file(
fn load_from_control_file(
conf: &SafeKeeperConf,
ttid: &TenantTimelineId,
) -> Result<(FileStorage, SafeKeeperState)> {
fs::create_dir_all(conf.timeline_dir(ttid))
.await
.expect("failed to create timeline dir");
fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
Ok((
FileStorage::restore_new(ttid, conf)?,
FileStorage::load_control_file_conf(conf, ttid)?,
))
}
async fn create(
fn create(
conf: &SafeKeeperConf,
ttid: &TenantTimelineId,
) -> Result<(FileStorage, SafeKeeperState)> {
fs::create_dir_all(conf.timeline_dir(ttid))
.await
.expect("failed to create timeline dir");
fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
let state = SafeKeeperState::empty();
let storage = FileStorage::create_new(ttid, conf, state.clone())?;
Ok((storage, state))
}
#[tokio::test]
async fn test_read_write_safekeeper_state() {
#[test]
fn test_read_write_safekeeper_state() {
let conf = stub_conf();
let ttid = TenantTimelineId::generate();
{
let (mut storage, mut state) =
create(&conf, &ttid).await.expect("failed to create state");
let (mut storage, mut state) = create(&conf, &ttid).expect("failed to create state");
// change something
state.commit_lsn = Lsn(42);
storage
.persist(&state)
.await
.expect("failed to persist state");
storage.persist(&state).expect("failed to persist state");
}
let (_, state) = load_from_control_file(&conf, &ttid)
.await
.expect("failed to read state");
let (_, state) = load_from_control_file(&conf, &ttid).expect("failed to read state");
assert_eq!(state.commit_lsn, Lsn(42));
}
#[tokio::test]
async fn test_safekeeper_state_checksum_mismatch() {
#[test]
fn test_safekeeper_state_checksum_mismatch() {
let conf = stub_conf();
let ttid = TenantTimelineId::generate();
{
let (mut storage, mut state) =
create(&conf, &ttid).await.expect("failed to read state");
let (mut storage, mut state) = create(&conf, &ttid).expect("failed to read state");
// change something
state.commit_lsn = Lsn(42);
storage
.persist(&state)
.await
.expect("failed to persist state");
storage.persist(&state).expect("failed to persist state");
}
let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME);
let mut data = fs::read(&control_path).await.unwrap();
let mut data = fs::read(&control_path).unwrap();
data[0] += 1; // change the first byte of the file to fail checksum validation
fs::write(&control_path, &data)
.await
.expect("failed to write control file");
fs::write(&control_path, &data).expect("failed to write control file");
match load_from_control_file(&conf, &ttid).await {
match load_from_control_file(&conf, &ttid) {
Err(err) => assert!(err
.to_string()
.contains("safekeeper control file checksum mismatch")),

View File

@@ -121,7 +121,7 @@ pub struct FileInfo {
}
/// Build debug dump response, using the provided [`Args`] filters.
pub async fn build(args: Args) -> Result<Response> {
pub fn build(args: Args) -> Result<Response> {
let start_time = Utc::now();
let timelines_count = GlobalTimelines::timelines_count();
@@ -155,7 +155,7 @@ pub async fn build(args: Args) -> Result<Response> {
}
let control_file = if args.dump_control_file {
let mut state = tli.get_state().await.1;
let mut state = tli.get_state().1;
if !args.dump_term_history {
state.acceptor_state.term_history = TermHistory(vec![]);
}
@@ -165,7 +165,7 @@ pub async fn build(args: Args) -> Result<Response> {
};
let memory = if args.dump_memory {
Some(tli.memory_dump().await)
Some(tli.memory_dump())
} else {
None
};

View File

@@ -256,14 +256,14 @@ impl SafekeeperPostgresHandler {
let lsn = if self.is_walproposer_recovery() {
// walproposer should get all local WAL until flush_lsn
tli.get_flush_lsn().await
tli.get_flush_lsn()
} else {
// other clients shouldn't get any uncommitted WAL
tli.get_state().await.0.commit_lsn
tli.get_state().0.commit_lsn
}
.to_string();
let sysid = tli.get_state().await.1.server.system_id.to_string();
let sysid = tli.get_state().1.server.system_id.to_string();
let lsn_bytes = lsn.as_bytes();
let tli = PG_TLI.to_string();
let tli_bytes = tli.as_bytes();

View File

@@ -2,18 +2,3 @@ pub mod routes;
pub use routes::make_router;
pub use safekeeper_api::models;
use crate::SafeKeeperConf;
pub async fn task_main(
conf: SafeKeeperConf,
http_listener: std::net::TcpListener,
) -> anyhow::Result<()> {
let router = make_router(conf)
.build()
.map_err(|err| anyhow::anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?;
server.serve(service).await?;
Ok(()) // unreachable
}

View File

@@ -13,7 +13,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use tokio::fs::File;
use tokio::io::AsyncReadExt;
use utils::http::endpoint::request_span;
use tokio::task::JoinError;
use crate::safekeeper::ServerInfo;
use crate::safekeeper::Term;
@@ -116,8 +116,8 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
check_permission(&request, Some(ttid.tenant_id))?;
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let (inmem, state) = tli.get_state().await;
let flush_lsn = tli.get_flush_lsn().await;
let (inmem, state) = tli.get_state();
let flush_lsn = tli.get_flush_lsn();
let epoch = state.acceptor_state.get_epoch(flush_lsn);
let term_history = state
@@ -232,11 +232,13 @@ async fn timeline_delete_force_handler(
);
check_permission(&request, Some(ttid.tenant_id))?;
ensure_no_body(&mut request).await?;
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
// error handling here when we're able to.
let resp = GlobalTimelines::delete_force(&ttid)
.await
.map_err(ApiError::InternalServerError)?;
let resp = tokio::task::spawn_blocking(move || {
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
// error handling here when we're able to.
GlobalTimelines::delete_force(&ttid).map_err(ApiError::InternalServerError)
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
json_response(StatusCode::OK, resp)
}
@@ -248,11 +250,14 @@ async fn tenant_delete_force_handler(
let tenant_id = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
ensure_no_body(&mut request).await?;
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
// Using an `InternalServerError` should be fixed when the types support it
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id)
.await
.map_err(ApiError::InternalServerError)?;
let delete_info = tokio::task::spawn_blocking(move || {
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
// Using an `InternalServerError` should be fixed when the types support it
GlobalTimelines::delete_force_all_for_tenant(&tenant_id)
.map_err(ApiError::InternalServerError)
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
json_response(
StatusCode::OK,
delete_info
@@ -348,9 +353,11 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
timeline_id,
};
let resp = debug_dump::build(args)
.await
.map_err(ApiError::InternalServerError)?;
let resp = tokio::task::spawn_blocking(move || {
debug_dump::build(args).map_err(ApiError::InternalServerError)
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
// TODO: use streaming response
json_response(StatusCode::OK, resp)
@@ -379,32 +386,29 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
router
.data(Arc::new(conf))
.data(auth)
.get("/v1/status", |r| request_span(r, status_handler))
.get("/v1/status", status_handler)
// Will be used in the future instead of implicit timeline creation
.post("/v1/tenant/timeline", |r| {
request_span(r, timeline_create_handler)
})
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
request_span(r, timeline_status_handler)
})
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
request_span(r, timeline_delete_force_handler)
})
.delete("/v1/tenant/:tenant_id", |r| {
request_span(r, tenant_delete_force_handler)
})
.post("/v1/pull_timeline", |r| {
request_span(r, timeline_pull_handler)
})
.post("/v1/tenant/timeline", timeline_create_handler)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_status_handler,
)
.delete(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_delete_force_handler,
)
.delete("/v1/tenant/:tenant_id", tenant_delete_force_handler)
.post("/v1/pull_timeline", timeline_pull_handler)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename",
|r| request_span(r, timeline_files_handler),
timeline_files_handler,
)
// for tests
.post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
request_span(r, record_safekeeper_info)
})
.get("/v1/debug_dump", |r| request_span(r, dump_debug_handler))
.post(
"/v1/record_safekeeper_info/:tenant_id/:timeline_id",
record_safekeeper_info,
)
.get("/v1/debug_dump", dump_debug_handler)
}
#[cfg(test)]

View File

@@ -73,12 +73,12 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
// if send_proposer_elected is true, we need to update local history
if append_request.send_proposer_elected {
send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn).await?;
send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn)?;
}
let inserted_wal = append_logical_message(&tli, append_request).await?;
let inserted_wal = append_logical_message(&tli, append_request)?;
let response = AppendResult {
state: tli.get_state().await.1,
state: tli.get_state().1,
inserted_wal,
};
let response_data = serde_json::to_vec(&response)
@@ -114,9 +114,9 @@ async fn prepare_safekeeper(
.await
}
async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
// add new term to existing history
let history = tli.get_state().await.1.acceptor_state.term_history;
let history = tli.get_state().1.acceptor_state.term_history;
let history = history.up_to(lsn.checked_sub(1u64).unwrap());
let mut history_entries = history.0;
history_entries.push(TermSwitchEntry { term, lsn });
@@ -129,7 +129,7 @@ async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> any
timeline_start_lsn: lsn,
});
tli.process_msg(&proposer_elected_request).await?;
tli.process_msg(&proposer_elected_request)?;
Ok(())
}
@@ -142,12 +142,12 @@ pub struct InsertedWAL {
/// Extend local WAL with new LogicalMessage record. To do that,
/// create AppendRequest with new WAL and pass it to safekeeper.
pub async fn append_logical_message(
pub fn append_logical_message(
tli: &Arc<Timeline>,
msg: &AppendLogicalMessage,
) -> anyhow::Result<InsertedWAL> {
let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
let sk_state = tli.get_state().await.1;
let sk_state = tli.get_state().1;
let begin_lsn = msg.begin_lsn;
let end_lsn = begin_lsn + wal_data.len() as u64;
@@ -171,7 +171,7 @@ pub async fn append_logical_message(
wal_data: Bytes::from(wal_data),
});
let response = tli.process_msg(&append_request).await?;
let response = tli.process_msg(&append_request)?;
let append_response = match response {
Some(AcceptorProposerMessage::AppendResponse(resp)) => resp,

View File

@@ -1,6 +1,4 @@
use once_cell::sync::Lazy;
use remote_storage::RemoteStorageConfig;
use tokio::runtime::Runtime;
use std::path::PathBuf;
use std::time::Duration;
@@ -38,6 +36,7 @@ pub mod defaults {
DEFAULT_PG_LISTEN_PORT,
};
pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8;
pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms";
pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
}
@@ -61,10 +60,10 @@ pub struct SafeKeeperConf {
pub heartbeat_timeout: Duration,
pub remote_storage: Option<RemoteStorageConfig>,
pub max_offloader_lag_bytes: u64,
pub backup_runtime_threads: Option<usize>,
pub backup_parallel_jobs: usize,
pub wal_backup_enabled: bool,
pub auth: Option<Arc<JwtAuth>>,
pub current_thread_runtime: bool,
}
impl SafeKeeperConf {
@@ -93,64 +92,12 @@ impl SafeKeeperConf {
.parse()
.expect("failed to parse default broker endpoint"),
broker_keepalive_interval: Duration::from_secs(5),
backup_runtime_threads: None,
wal_backup_enabled: true,
backup_parallel_jobs: 1,
auth: None,
heartbeat_timeout: Duration::new(5, 0),
max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
current_thread_runtime: false,
}
}
}
// Tokio runtimes.
pub static WAL_SERVICE_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("WAL service worker")
.enable_all()
.build()
.expect("Failed to create WAL service runtime")
});
pub static HTTP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("HTTP worker")
.enable_all()
.build()
.expect("Failed to create WAL service runtime")
});
pub static BROKER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("broker worker")
.worker_threads(2) // there are only 2 tasks, having more threads doesn't make sense
.enable_all()
.build()
.expect("Failed to create broker runtime")
});
pub static WAL_REMOVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("WAL remover")
.worker_threads(1)
.enable_all()
.build()
.expect("Failed to create broker runtime")
});
pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("WAL backup worker")
.enable_all()
.build()
.expect("Failed to create WAL backup runtime")
});
pub static METRICS_SHIFTER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("metric shifter")
.worker_threads(1)
.enable_all()
.build()
.expect("Failed to create broker runtime")
});

View File

@@ -7,7 +7,6 @@ use std::{
use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS};
use anyhow::Result;
use futures::Future;
use metrics::{
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
proto::MetricFamily,
@@ -293,17 +292,14 @@ impl WalStorageMetrics {
}
}
/// Accepts async function that returns empty anyhow result, and returns the duration of its execution.
pub async fn time_io_closure<E: Into<anyhow::Error>>(
closure: impl Future<Output = Result<(), E>>,
) -> Result<f64> {
/// Accepts a closure that returns a result, and returns the duration of the closure.
pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result<f64> {
let start = std::time::Instant::now();
closure.await.map_err(|e| e.into())?;
closure()?;
Ok(start.elapsed().as_secs_f64())
}
/// Metrics for a single timeline.
#[derive(Clone)]
pub struct FullTimelineInfo {
pub ttid: TenantTimelineId,
pub ps_feedback: PageserverFeedback,
@@ -579,19 +575,13 @@ impl Collector for TimelineCollector {
let timelines = GlobalTimelines::get_all();
let timelines_count = timelines.len();
// Prometheus Collector is sync, and data is stored under async lock. To
// bridge the gap with a crutch, collect data in spawned thread with
// local tokio runtime.
let infos = std::thread::spawn(|| {
let rt = tokio::runtime::Builder::new_current_thread()
.build()
.expect("failed to create rt");
rt.block_on(collect_timeline_metrics())
})
.join()
.expect("collect_timeline_metrics thread panicked");
for arc_tli in timelines {
let tli = arc_tli.info_for_metrics();
if tli.is_none() {
continue;
}
let tli = tli.unwrap();
for tli in &infos {
let tenant_id = tli.ttid.tenant_id.to_string();
let timeline_id = tli.ttid.timeline_id.to_string();
let labels = &[tenant_id.as_str(), timeline_id.as_str()];
@@ -692,15 +682,3 @@ impl Collector for TimelineCollector {
mfs
}
}
async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
let mut res = vec![];
let timelines = GlobalTimelines::get_all();
for tli in timelines {
if let Some(info) = tli.info_for_metrics().await {
res.push(info);
}
}
res
}

View File

@@ -231,7 +231,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
info!(
"Loaded timeline {}, flush_lsn={}",
ttid,
tli.get_flush_lsn().await
tli.get_flush_lsn()
);
Ok(Response {

View File

@@ -18,14 +18,15 @@ use postgres_backend::QueryError;
use pq_proto::BeMessage;
use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
use std::thread::JoinHandle;
use tokio::io::AsyncRead;
use tokio::io::AsyncWrite;
use tokio::sync::mpsc::channel;
use tokio::sync::mpsc::error::TryRecvError;
use tokio::sync::mpsc::Receiver;
use tokio::sync::mpsc::Sender;
use tokio::task;
use tokio::task::JoinHandle;
use tokio::task::spawn_blocking;
use tokio::time::Duration;
use tokio::time::Instant;
use tracing::*;
@@ -96,7 +97,7 @@ impl SafekeeperPostgresHandler {
Err(res.expect_err("no error with WalAcceptor not spawn"))
}
Some(handle) => {
let wal_acceptor_res = handle.await;
let wal_acceptor_res = handle.join();
// If there was any network error, return it.
res?;
@@ -106,7 +107,7 @@ impl SafekeeperPostgresHandler {
Ok(Ok(_)) => Ok(()), // can't happen currently; would be if we add graceful termination
Ok(Err(e)) => Err(CopyStreamHandlerEnd::Other(e.context("WAL acceptor"))),
Err(_) => Err(CopyStreamHandlerEnd::Other(anyhow!(
"WalAcceptor task panicked",
"WalAcceptor thread panicked",
))),
}
}
@@ -153,12 +154,10 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
}
};
*self.acceptor_handle = Some(WalAcceptor::spawn(
tli.clone(),
msg_rx,
reply_tx,
self.conn_id,
));
*self.acceptor_handle = Some(
WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, self.conn_id)
.context("spawn WalAcceptor thread")?,
);
// Forward all messages to WalAcceptor
read_network_loop(self.pgb_reader, msg_tx, next_msg).await
@@ -227,19 +226,28 @@ impl WalAcceptor {
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
conn_id: ConnectionId,
) -> JoinHandle<anyhow::Result<()>> {
task::spawn(async move {
let mut wa = WalAcceptor {
tli,
msg_rx,
reply_tx,
};
) -> anyhow::Result<JoinHandle<anyhow::Result<()>>> {
let thread_name = format!("WAL acceptor {}", tli.ttid);
thread::Builder::new()
.name(thread_name)
.spawn(move || -> anyhow::Result<()> {
let mut wa = WalAcceptor {
tli,
msg_rx,
reply_tx,
};
let span_ttid = wa.tli.ttid; // satisfy borrow checker
wa.run()
.instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid))
.await
})
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let span_ttid = wa.tli.ttid; // satisfy borrow checker
runtime.block_on(
wa.run()
.instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid)),
)
})
.map_err(anyhow::Error::from)
}
/// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed;
@@ -273,7 +281,7 @@ impl WalAcceptor {
while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg {
let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
if let Some(reply) = self.tli.process_msg(&noflush_msg).await? {
if let Some(reply) = self.tli.process_msg(&noflush_msg)? {
if self.reply_tx.send(reply).await.is_err() {
return Ok(()); // chan closed, streaming terminated
}
@@ -292,12 +300,10 @@ impl WalAcceptor {
}
// flush all written WAL to the disk
self.tli
.process_msg(&ProposerAcceptorMessage::FlushWAL)
.await?
self.tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?
} else {
// process message other than AppendRequest
self.tli.process_msg(&next_msg).await?
self.tli.process_msg(&next_msg)?
};
if let Some(reply) = reply_msg {
@@ -320,8 +326,8 @@ impl Drop for ComputeConnectionGuard {
let tli = self.timeline.clone();
// tokio forbids to call blocking_send inside the runtime, and see
// comments in on_compute_disconnect why we call blocking_send.
tokio::spawn(async move {
if let Err(e) = tli.on_compute_disconnect().await {
spawn_blocking(move || {
if let Err(e) = tli.on_compute_disconnect() {
error!("failed to unregister compute connection: {}", e);
}
});

View File

@@ -1,36 +1,29 @@
//! Thread removing old WAL.
use std::time::Duration;
use std::{thread, time::Duration};
use tokio::time::sleep;
use tracing::*;
use crate::{GlobalTimelines, SafeKeeperConf};
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
pub fn thread_main(conf: SafeKeeperConf) {
let wal_removal_interval = Duration::from_millis(5000);
loop {
let tlis = GlobalTimelines::get_all();
for tli in &tlis {
if !tli.is_active().await {
if !tli.is_active() {
continue;
}
let ttid = tli.ttid;
if let Err(e) = tli
.maybe_persist_control_file()
.instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id))
.await
{
let _enter =
info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id).entered();
if let Err(e) = tli.maybe_pesist_control_file() {
warn!("failed to persist control file: {e}");
}
if let Err(e) = tli
.remove_old_wal(conf.wal_backup_enabled)
.instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id))
.await
{
error!("failed to remove WAL: {}", e);
if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) {
warn!("failed to remove WAL: {}", e);
}
}
sleep(wal_removal_interval).await;
thread::sleep(wal_removal_interval)
}
}

View File

@@ -568,27 +568,25 @@ where
/// Process message from proposer and possibly form reply. Concurrent
/// callers must exclude each other.
pub async fn process_msg(
pub fn process_msg(
&mut self,
msg: &ProposerAcceptorMessage,
) -> Result<Option<AcceptorProposerMessage>> {
match msg {
ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg).await,
ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg).await,
ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg).await,
ProposerAcceptorMessage::AppendRequest(msg) => {
self.handle_append_request(msg, true).await
}
ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg),
ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg),
ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg),
ProposerAcceptorMessage::AppendRequest(msg) => self.handle_append_request(msg, true),
ProposerAcceptorMessage::NoFlushAppendRequest(msg) => {
self.handle_append_request(msg, false).await
self.handle_append_request(msg, false)
}
ProposerAcceptorMessage::FlushWAL => self.handle_flush().await,
ProposerAcceptorMessage::FlushWAL => self.handle_flush(),
}
}
/// Handle initial message from proposer: check its sanity and send my
/// current term.
async fn handle_greeting(
fn handle_greeting(
&mut self,
msg: &ProposerGreeting,
) -> Result<Option<AcceptorProposerMessage>> {
@@ -651,7 +649,7 @@ where
if msg.pg_version != UNKNOWN_SERVER_VERSION {
state.server.pg_version = msg.pg_version;
}
self.state.persist(&state).await?;
self.state.persist(&state)?;
}
info!(
@@ -666,7 +664,7 @@ where
}
/// Give vote for the given term, if we haven't done that previously.
async fn handle_vote_request(
fn handle_vote_request(
&mut self,
msg: &VoteRequest,
) -> Result<Option<AcceptorProposerMessage>> {
@@ -680,7 +678,7 @@ where
// handle_elected instead. Currently not a big deal, as proposer is the
// only source of WAL; with peer2peer recovery it would be more
// important.
self.wal_store.flush_wal().await?;
self.wal_store.flush_wal()?;
// initialize with refusal
let mut resp = VoteResponse {
term: self.state.acceptor_state.term,
@@ -694,7 +692,7 @@ where
let mut state = self.state.clone();
state.acceptor_state.term = msg.term;
// persist vote before sending it out
self.state.persist(&state).await?;
self.state.persist(&state)?;
resp.term = self.state.acceptor_state.term;
resp.vote_given = true as u64;
@@ -717,15 +715,12 @@ where
ar
}
async fn handle_elected(
&mut self,
msg: &ProposerElected,
) -> Result<Option<AcceptorProposerMessage>> {
fn handle_elected(&mut self, msg: &ProposerElected) -> Result<Option<AcceptorProposerMessage>> {
info!("received ProposerElected {:?}", msg);
if self.state.acceptor_state.term < msg.term {
let mut state = self.state.clone();
state.acceptor_state.term = msg.term;
self.state.persist(&state).await?;
self.state.persist(&state)?;
}
// If our term is higher, ignore the message (next feedback will inform the compute)
@@ -755,7 +750,7 @@ where
// intersection of our history and history from msg
// truncate wal, update the LSNs
self.wal_store.truncate_wal(msg.start_streaming_at).await?;
self.wal_store.truncate_wal(msg.start_streaming_at)?;
// and now adopt term history from proposer
{
@@ -789,7 +784,7 @@ where
self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn);
state.acceptor_state.term_history = msg.term_history.clone();
self.persist_control_file(state).await?;
self.persist_control_file(state)?;
}
info!("start receiving WAL since {:?}", msg.start_streaming_at);
@@ -801,7 +796,7 @@ where
///
/// Note: it is assumed that 'WAL we have is from the right term' check has
/// already been done outside.
async fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> {
fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> {
// Both peers and walproposer communicate this value, we might already
// have a fresher (higher) version.
candidate = max(candidate, self.inmem.commit_lsn);
@@ -823,32 +818,29 @@ where
// that we receive new epoch_start_lsn, and we still need to sync
// control file in this case.
if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn {
self.persist_control_file(self.state.clone()).await?;
self.persist_control_file(self.state.clone())?;
}
Ok(())
}
/// Persist control file to disk, called only after timeline creation (bootstrap).
pub async fn persist(&mut self) -> Result<()> {
self.persist_control_file(self.state.clone()).await
pub fn persist(&mut self) -> Result<()> {
self.persist_control_file(self.state.clone())
}
/// Persist in-memory state to the disk, taking other data from state.
async fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> {
fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> {
state.commit_lsn = self.inmem.commit_lsn;
state.backup_lsn = self.inmem.backup_lsn;
state.peer_horizon_lsn = self.inmem.peer_horizon_lsn;
state.proposer_uuid = self.inmem.proposer_uuid;
self.state.persist(&state).await
self.state.persist(&state)
}
/// Persist control file if there is something to save and enough time
/// passed after the last save.
pub async fn maybe_persist_control_file(
&mut self,
inmem_remote_consistent_lsn: Lsn,
) -> Result<()> {
pub fn maybe_persist_control_file(&mut self, inmem_remote_consistent_lsn: Lsn) -> Result<()> {
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
if self.state.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
return Ok(());
@@ -860,7 +852,7 @@ where
if need_persist {
let mut state = self.state.clone();
state.remote_consistent_lsn = inmem_remote_consistent_lsn;
self.persist_control_file(state).await?;
self.persist_control_file(state)?;
trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
}
Ok(())
@@ -868,7 +860,7 @@ where
/// Handle request to append WAL.
#[allow(clippy::comparison_chain)]
async fn handle_append_request(
fn handle_append_request(
&mut self,
msg: &AppendRequest,
require_flush: bool,
@@ -891,19 +883,17 @@ where
// do the job
if !msg.wal_data.is_empty() {
self.wal_store
.write_wal(msg.h.begin_lsn, &msg.wal_data)
.await?;
self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?;
}
// flush wal to the disk, if required
if require_flush {
self.wal_store.flush_wal().await?;
self.wal_store.flush_wal()?;
}
// Update commit_lsn.
if msg.h.commit_lsn != Lsn(0) {
self.update_commit_lsn(msg.h.commit_lsn).await?;
self.update_commit_lsn(msg.h.commit_lsn)?;
}
// Value calculated by walproposer can always lag:
// - safekeepers can forget inmem value and send to proposer lower
@@ -919,7 +909,7 @@ where
if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64)
< self.inmem.peer_horizon_lsn
{
self.persist_control_file(self.state.clone()).await?;
self.persist_control_file(self.state.clone())?;
}
trace!(
@@ -941,15 +931,15 @@ where
}
/// Flush WAL to disk. Return AppendResponse with latest LSNs.
async fn handle_flush(&mut self) -> Result<Option<AcceptorProposerMessage>> {
self.wal_store.flush_wal().await?;
fn handle_flush(&mut self) -> Result<Option<AcceptorProposerMessage>> {
self.wal_store.flush_wal()?;
Ok(Some(AcceptorProposerMessage::AppendResponse(
self.append_response(),
)))
}
/// Update timeline state with peer safekeeper data.
pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
pub fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
let mut sync_control_file = false;
if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) {
@@ -957,7 +947,7 @@ where
// commit_lsn if our history matches (is part of) history of advanced
// commit_lsn provider.
if sk_info.last_log_term == self.get_epoch() {
self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?;
self.update_commit_lsn(Lsn(sk_info.commit_lsn))?;
}
}
@@ -983,7 +973,7 @@ where
// Note: we could make remote_consistent_lsn update in cf common by
// storing Arc to walsenders in Safekeeper.
state.remote_consistent_lsn = new_remote_consistent_lsn;
self.persist_control_file(state).await?;
self.persist_control_file(state)?;
}
Ok(())
}
@@ -1007,7 +997,6 @@ where
#[cfg(test)]
mod tests {
use futures::future::BoxFuture;
use postgres_ffi::WAL_SEGMENT_SIZE;
use super::*;
@@ -1019,9 +1008,8 @@ mod tests {
persisted_state: SafeKeeperState,
}
#[async_trait::async_trait]
impl control_file::Storage for InMemoryState {
async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
self.persisted_state = s.clone();
Ok(())
}
@@ -1051,28 +1039,27 @@ mod tests {
lsn: Lsn,
}
#[async_trait::async_trait]
impl wal_storage::Storage for DummyWalStore {
fn flush_lsn(&self) -> Lsn {
self.lsn
}
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
self.lsn = startpos + buf.len() as u64;
Ok(())
}
async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
self.lsn = end_pos;
Ok(())
}
async fn flush_wal(&mut self) -> Result<()> {
fn flush_wal(&mut self) -> Result<()> {
Ok(())
}
fn remove_up_to(&self, _segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> {
Box::pin(async { Ok(()) })
fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
Box::new(move |_segno_up_to: XLogSegNo| Ok(()))
}
fn get_metrics(&self) -> crate::metrics::WalStorageMetrics {
@@ -1080,8 +1067,8 @@ mod tests {
}
}
#[tokio::test]
async fn test_voting() {
#[test]
fn test_voting() {
let storage = InMemoryState {
persisted_state: test_sk_state(),
};
@@ -1090,7 +1077,7 @@ mod tests {
// check voting for 1 is ok
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 });
let mut vote_resp = sk.process_msg(&vote_request).await;
let mut vote_resp = sk.process_msg(&vote_request);
match vote_resp.unwrap() {
Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0),
r => panic!("unexpected response: {:?}", r),
@@ -1105,15 +1092,15 @@ mod tests {
sk = SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap();
// and ensure voting second time for 1 is not ok
vote_resp = sk.process_msg(&vote_request).await;
vote_resp = sk.process_msg(&vote_request);
match vote_resp.unwrap() {
Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0),
r => panic!("unexpected response: {:?}", r),
}
}
#[tokio::test]
async fn test_epoch_switch() {
#[test]
fn test_epoch_switch() {
let storage = InMemoryState {
persisted_state: test_sk_state(),
};
@@ -1145,13 +1132,10 @@ mod tests {
timeline_start_lsn: Lsn(0),
};
sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
.await
.unwrap();
// check that AppendRequest before epochStartLsn doesn't switch epoch
let resp = sk
.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
.await;
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
assert!(resp.is_ok());
assert_eq!(sk.get_epoch(), 0);
@@ -1162,11 +1146,9 @@ mod tests {
h: ar_hdr,
wal_data: Bytes::from_static(b"b"),
};
let resp = sk
.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
.await;
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
assert!(resp.is_ok());
sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
sk.wal_store.truncate_wal(Lsn(3)).unwrap(); // imitate the complete record at 3 %)
assert_eq!(sk.get_epoch(), 1);
}
}

View File

@@ -396,7 +396,7 @@ impl SafekeeperPostgresHandler {
// on this safekeeper itself. That's ok as (old) proposer will never be
// able to commit such WAL.
let stop_pos: Option<Lsn> = if self.is_walproposer_recovery() {
let wal_end = tli.get_flush_lsn().await;
let wal_end = tli.get_flush_lsn();
Some(wal_end)
} else {
None
@@ -418,7 +418,7 @@ impl SafekeeperPostgresHandler {
// switch to copy
pgb.write_message(&BeMessage::CopyBothResponse).await?;
let (_, persisted_state) = tli.get_state().await;
let (_, persisted_state) = tli.get_state();
let wal_reader = WalReader::new(
self.conf.workdir.clone(),
self.conf.timeline_dir(&tli.ttid),
@@ -562,7 +562,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
.walsenders
.get_ws_remote_consistent_lsn(self.ws_guard.id)
{
if self.tli.should_walsender_stop(remote_consistent_lsn).await {
if self.tli.should_walsender_stop(remote_consistent_lsn) {
// Terminate if there is nothing more to send.
return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
"ending streaming to {:?} at {}, receiver is caughtup and there is no computes",

View File

@@ -2,13 +2,12 @@
//! to glue together SafeKeeper and all other background services.
use anyhow::{anyhow, bail, Result};
use parking_lot::{Mutex, MutexGuard};
use postgres_ffi::XLogSegNo;
use tokio::fs;
use std::cmp::max;
use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::{Mutex, MutexGuard};
use tokio::{
sync::{mpsc::Sender, watch},
time::Instant,
@@ -287,9 +286,8 @@ pub struct Timeline {
commit_lsn_watch_tx: watch::Sender<Lsn>,
commit_lsn_watch_rx: watch::Receiver<Lsn>,
/// Safekeeper and other state, that should remain consistent and
/// synchronized with the disk. This is tokio mutex as we write WAL to disk
/// while holding it, ensuring that consensus checks are in order.
/// Safekeeper and other state, that should remain consistent and synchronized
/// with the disk.
mutex: Mutex<SharedState>,
walsenders: Arc<WalSenders>,
@@ -363,8 +361,8 @@ impl Timeline {
///
/// Bootstrap is transactional, so if it fails, created files will be deleted,
/// and state on disk should remain unchanged.
pub async fn bootstrap(&self, shared_state: &mut MutexGuard<'_, SharedState>) -> Result<()> {
match fs::metadata(&self.timeline_dir).await {
pub fn bootstrap(&self, shared_state: &mut MutexGuard<SharedState>) -> Result<()> {
match std::fs::metadata(&self.timeline_dir) {
Ok(_) => {
// Timeline directory exists on disk, we should leave state unchanged
// and return error.
@@ -377,51 +375,53 @@ impl Timeline {
}
// Create timeline directory.
fs::create_dir_all(&self.timeline_dir).await?;
std::fs::create_dir_all(&self.timeline_dir)?;
// Write timeline to disk and TODO: start background tasks.
if let Err(e) = shared_state.sk.persist().await {
// Bootstrap failed, cancel timeline and remove timeline directory.
self.cancel(shared_state);
match || -> Result<()> {
shared_state.sk.persist()?;
// TODO: add more initialization steps here
self.update_status(shared_state);
Ok(())
}() {
Ok(_) => Ok(()),
Err(e) => {
// Bootstrap failed, cancel timeline and remove timeline directory.
self.cancel(shared_state);
if let Err(fs_err) = fs::remove_dir_all(&self.timeline_dir).await {
warn!(
"failed to remove timeline {} directory after bootstrap failure: {}",
self.ttid, fs_err
);
if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) {
warn!(
"failed to remove timeline {} directory after bootstrap failure: {}",
self.ttid, fs_err
);
}
Err(e)
}
return Err(e);
}
// TODO: add more initialization steps here
self.update_status(shared_state);
Ok(())
}
/// Delete timeline from disk completely, by removing timeline directory. Background
/// timeline activities will stop eventually.
pub async fn delete_from_disk(
pub fn delete_from_disk(
&self,
shared_state: &mut MutexGuard<'_, SharedState>,
shared_state: &mut MutexGuard<SharedState>,
) -> Result<(bool, bool)> {
let was_active = shared_state.active;
self.cancel(shared_state);
let dir_existed = delete_dir(&self.timeline_dir).await?;
let dir_existed = delete_dir(&self.timeline_dir)?;
Ok((dir_existed, was_active))
}
/// Cancel timeline to prevent further usage. Background tasks will stop
/// eventually after receiving cancellation signal.
///
/// Note that we can't notify backup launcher here while holding
/// shared_state lock, as this is a potential deadlock: caller is
/// responsible for that. Generally we should probably make WAL backup tasks
/// to shut down on their own, checking once in a while whether it is the
/// time.
fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) {
fn cancel(&self, shared_state: &mut MutexGuard<SharedState>) {
info!("timeline {} is cancelled", self.ttid);
let _ = self.cancellation_tx.send(true);
let res = self.wal_backup_launcher_tx.blocking_send(self.ttid);
if let Err(e) = res {
error!("Failed to send stop signal to wal_backup_launcher: {}", e);
}
// Close associated FDs. Nobody will be able to touch timeline data once
// it is cancelled, so WAL storage won't be opened again.
shared_state.sk.wal_store.close();
@@ -433,8 +433,8 @@ impl Timeline {
}
/// Take a writing mutual exclusive lock on timeline shared_state.
pub async fn write_shared_state(&self) -> MutexGuard<SharedState> {
self.mutex.lock().await
pub fn write_shared_state(&self) -> MutexGuard<SharedState> {
self.mutex.lock()
}
fn update_status(&self, shared_state: &mut SharedState) -> bool {
@@ -450,7 +450,7 @@ impl Timeline {
let is_wal_backup_action_pending: bool;
{
let mut shared_state = self.write_shared_state().await;
let mut shared_state = self.write_shared_state();
shared_state.num_computes += 1;
is_wal_backup_action_pending = self.update_status(&mut shared_state);
}
@@ -464,17 +464,22 @@ impl Timeline {
/// De-register compute connection, shutting down timeline activity if
/// pageserver doesn't need catchup.
pub async fn on_compute_disconnect(&self) -> Result<()> {
pub fn on_compute_disconnect(&self) -> Result<()> {
let is_wal_backup_action_pending: bool;
{
let mut shared_state = self.write_shared_state().await;
let mut shared_state = self.write_shared_state();
shared_state.num_computes -= 1;
is_wal_backup_action_pending = self.update_status(&mut shared_state);
}
// Wake up wal backup launcher, if it is time to stop the offloading.
if is_wal_backup_action_pending {
// Can fail only if channel to a static thread got closed, which is not normal at all.
self.wal_backup_launcher_tx.send(self.ttid).await?;
//
// Note: this is blocking_send because on_compute_disconnect is called in Drop, there is
// no async Drop and we use current thread runtimes. With current thread rt spawning
// task in drop impl is racy, as thread along with runtime might finish before the task.
// This should be switched send.await when/if we go to full async.
self.wal_backup_launcher_tx.blocking_send(self.ttid)?;
}
Ok(())
}
@@ -484,11 +489,11 @@ impl Timeline {
/// computes. While there might be nothing to stream already, we learn about
/// remote_consistent_lsn update through replication feedback, and we want
/// to stop pushing to the broker if pageserver is fully caughtup.
pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
pub fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
if self.is_cancelled() {
return true;
}
let shared_state = self.write_shared_state().await;
let shared_state = self.write_shared_state();
if shared_state.num_computes == 0 {
return shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn;
@@ -498,12 +503,12 @@ impl Timeline {
/// Returns whether s3 offloading is required and sets current status as
/// matching it.
pub async fn wal_backup_attend(&self) -> bool {
pub fn wal_backup_attend(&self) -> bool {
if self.is_cancelled() {
return false;
}
self.write_shared_state().await.wal_backup_attend()
self.write_shared_state().wal_backup_attend()
}
/// Returns commit_lsn watch channel.
@@ -512,7 +517,7 @@ impl Timeline {
}
/// Pass arrived message to the safekeeper.
pub async fn process_msg(
pub fn process_msg(
&self,
msg: &ProposerAcceptorMessage,
) -> Result<Option<AcceptorProposerMessage>> {
@@ -523,8 +528,8 @@ impl Timeline {
let mut rmsg: Option<AcceptorProposerMessage>;
let commit_lsn: Lsn;
{
let mut shared_state = self.write_shared_state().await;
rmsg = shared_state.sk.process_msg(msg).await?;
let mut shared_state = self.write_shared_state();
rmsg = shared_state.sk.process_msg(msg)?;
// if this is AppendResponse, fill in proper pageserver and hot
// standby feedback.
@@ -541,37 +546,37 @@ impl Timeline {
}
/// Returns wal_seg_size.
pub async fn get_wal_seg_size(&self) -> usize {
self.write_shared_state().await.get_wal_seg_size()
pub fn get_wal_seg_size(&self) -> usize {
self.write_shared_state().get_wal_seg_size()
}
/// Returns true only if the timeline is loaded and active.
pub async fn is_active(&self) -> bool {
pub fn is_active(&self) -> bool {
if self.is_cancelled() {
return false;
}
self.write_shared_state().await.active
self.write_shared_state().active
}
/// Returns state of the timeline.
pub async fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) {
let state = self.write_shared_state().await;
pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) {
let state = self.write_shared_state();
(state.sk.inmem.clone(), state.sk.state.clone())
}
/// Returns latest backup_lsn.
pub async fn get_wal_backup_lsn(&self) -> Lsn {
self.write_shared_state().await.sk.inmem.backup_lsn
pub fn get_wal_backup_lsn(&self) -> Lsn {
self.write_shared_state().sk.inmem.backup_lsn
}
/// Sets backup_lsn to the given value.
pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
let mut state = self.write_shared_state().await;
let mut state = self.write_shared_state();
state.sk.inmem.backup_lsn = max(state.sk.inmem.backup_lsn, backup_lsn);
// we should check whether to shut down offloader, but this will be done
// soon by peer communication anyway.
@@ -579,8 +584,8 @@ impl Timeline {
}
/// Get safekeeper info for broadcasting to broker and other peers.
pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
let shared_state = self.write_shared_state().await;
pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
let shared_state = self.write_shared_state();
shared_state.get_safekeeper_info(
&self.ttid,
conf,
@@ -599,8 +604,8 @@ impl Timeline {
let is_wal_backup_action_pending: bool;
let commit_lsn: Lsn;
{
let mut shared_state = self.write_shared_state().await;
shared_state.sk.record_safekeeper_info(&sk_info).await?;
let mut shared_state = self.write_shared_state();
shared_state.sk.record_safekeeper_info(&sk_info)?;
let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
shared_state.peers_info.upsert(&peer_info);
is_wal_backup_action_pending = self.update_status(&mut shared_state);
@@ -617,8 +622,8 @@ impl Timeline {
/// Get our latest view of alive peers status on the timeline.
/// We pass our own info through the broker as well, so when we don't have connection
/// to the broker returned vec is empty.
pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
let shared_state = self.write_shared_state().await;
pub fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
let shared_state = self.write_shared_state();
let now = Instant::now();
shared_state
.peers_info
@@ -635,34 +640,34 @@ impl Timeline {
}
/// Returns flush_lsn.
pub async fn get_flush_lsn(&self) -> Lsn {
self.write_shared_state().await.sk.wal_store.flush_lsn()
pub fn get_flush_lsn(&self) -> Lsn {
self.write_shared_state().sk.wal_store.flush_lsn()
}
/// Delete WAL segments from disk that are no longer needed. This is determined
/// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
let horizon_segno: XLogSegNo;
let remover = {
let shared_state = self.write_shared_state().await;
let remover: Box<dyn Fn(u64) -> Result<(), anyhow::Error>>;
{
let shared_state = self.write_shared_state();
horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled);
remover = shared_state.sk.wal_store.remove_up_to();
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
return Ok(()); // nothing to do
return Ok(());
}
let remover = shared_state.sk.wal_store.remove_up_to(horizon_segno - 1);
// release the lock before removing
remover
};
}
// delete old WAL files
remover.await?;
remover(horizon_segno - 1)?;
// update last_removed_segno
let mut shared_state = self.write_shared_state().await;
let mut shared_state = self.write_shared_state();
shared_state.last_removed_segno = horizon_segno;
Ok(())
}
@@ -671,24 +676,22 @@ impl Timeline {
/// passed after the last save. This helps to keep remote_consistent_lsn up
/// to date so that storage nodes restart doesn't cause many pageserver ->
/// safekeeper reconnections.
pub async fn maybe_persist_control_file(&self) -> Result<()> {
pub fn maybe_pesist_control_file(&self) -> Result<()> {
let remote_consistent_lsn = self.walsenders.get_remote_consistent_lsn();
self.write_shared_state()
.await
.sk
.maybe_persist_control_file(remote_consistent_lsn)
.await
}
/// Gather timeline data for metrics. If the timeline is not active, returns
/// None, we do not collect these.
pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
/// Returns full timeline info, required for the metrics. If the timeline is
/// not active, returns None instead.
pub fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
if self.is_cancelled() {
return None;
}
let ps_feedback = self.walsenders.get_ps_feedback();
let state = self.write_shared_state().await;
let state = self.write_shared_state();
if state.active {
Some(FullTimelineInfo {
ttid: self.ttid,
@@ -710,8 +713,8 @@ impl Timeline {
}
/// Returns in-memory timeline state to build a full debug dump.
pub async fn memory_dump(&self) -> debug_dump::Memory {
let state = self.write_shared_state().await;
pub fn memory_dump(&self) -> debug_dump::Memory {
let state = self.write_shared_state();
let (write_lsn, write_record_lsn, flush_lsn, file_open) =
state.sk.wal_store.internal_state();
@@ -735,8 +738,8 @@ impl Timeline {
}
/// Deletes directory and it's contents. Returns false if directory does not exist.
async fn delete_dir(path: &PathBuf) -> Result<bool> {
match fs::remove_dir_all(path).await {
fn delete_dir(path: &PathBuf) -> Result<bool> {
match std::fs::remove_dir_all(path) {
Ok(_) => Ok(true),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false),
Err(e) => Err(e.into()),

View File

@@ -113,17 +113,9 @@ impl GlobalTimelines {
Ok(())
}
/// Loads all timelines for the given tenant to memory. Returns fs::read_dir
/// errors if any.
///
/// Note: This function (and all reading/loading below) is sync because
/// timelines are loaded while holding GlobalTimelinesState lock. Which is
/// fine as this is called only from single threaded main runtime on boot,
/// but clippy complains anyway, and suppressing that isn't trivial as async
/// is the keyword, ha. That only other user is pull_timeline.rs for which
/// being blocked is not that bad, and we can do spawn_blocking.
/// Loads all timelines for the given tenant to memory. Returns fs::read_dir errors if any.
fn load_tenant_timelines(
state: &mut MutexGuard<'_, GlobalTimelinesState>,
state: &mut MutexGuard<GlobalTimelinesState>,
tenant_id: TenantId,
) -> Result<()> {
let timelines_dir = state.get_conf().tenant_dir(&tenant_id);
@@ -228,7 +220,7 @@ impl GlobalTimelines {
// Take a lock and finish the initialization holding this mutex. No other threads
// can interfere with creation after we will insert timeline into the map.
{
let mut shared_state = timeline.write_shared_state().await;
let mut shared_state = timeline.write_shared_state();
// We can get a race condition here in case of concurrent create calls, but only
// in theory. create() will return valid timeline on the next try.
@@ -240,7 +232,7 @@ impl GlobalTimelines {
// Write the new timeline to the disk and start background workers.
// Bootstrap is transactional, so if it fails, the timeline will be deleted,
// and the state on disk should remain unchanged.
if let Err(e) = timeline.bootstrap(&mut shared_state).await {
if let Err(e) = timeline.bootstrap(&mut shared_state) {
// Note: the most likely reason for bootstrap failure is that the timeline
// directory already exists on disk. This happens when timeline is corrupted
// and wasn't loaded from disk on startup because of that. We want to preserve
@@ -302,16 +294,15 @@ impl GlobalTimelines {
}
/// Cancels timeline, then deletes the corresponding data directory.
pub async fn delete_force(ttid: &TenantTimelineId) -> Result<TimelineDeleteForceResult> {
pub fn delete_force(ttid: &TenantTimelineId) -> Result<TimelineDeleteForceResult> {
let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
match tli_res {
Ok(timeline) => {
// Take a lock and finish the deletion holding this mutex.
let mut shared_state = timeline.write_shared_state().await;
let mut shared_state = timeline.write_shared_state();
info!("deleting timeline {}", ttid);
let (dir_existed, was_active) =
timeline.delete_from_disk(&mut shared_state).await?;
let (dir_existed, was_active) = timeline.delete_from_disk(&mut shared_state)?;
// Remove timeline from the map.
// FIXME: re-enable it once we fix the issue with recreation of deleted timelines
@@ -344,7 +335,7 @@ impl GlobalTimelines {
/// the tenant had, `true` if a timeline was active. There may be a race if new timelines are
/// created simultaneously. In that case the function will return error and the caller should
/// retry tenant deletion again later.
pub async fn delete_force_all_for_tenant(
pub fn delete_force_all_for_tenant(
tenant_id: &TenantId,
) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
info!("deleting all timelines for tenant {}", tenant_id);
@@ -354,7 +345,7 @@ impl GlobalTimelines {
let mut deleted = HashMap::new();
for tli in &to_delete {
match Self::delete_force(&tli.ttid).await {
match Self::delete_force(&tli.ttid) {
Ok(result) => {
deleted.insert(tli.ttid, result);
}

View File

@@ -17,6 +17,7 @@ use postgres_ffi::XLogFileName;
use postgres_ffi::{XLogSegNo, PG_TLI};
use remote_storage::{GenericRemoteStorage, RemotePath};
use tokio::fs::File;
use tokio::runtime::Builder;
use tokio::select;
use tokio::sync::mpsc::{self, Receiver, Sender};
@@ -35,16 +36,30 @@ use once_cell::sync::OnceCell;
const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
pub fn wal_backup_launcher_thread_main(
conf: SafeKeeperConf,
wal_backup_launcher_rx: Receiver<TenantTimelineId>,
) {
let mut builder = Builder::new_multi_thread();
if let Some(num_threads) = conf.backup_runtime_threads {
builder.worker_threads(num_threads);
}
let rt = builder
.enable_all()
.build()
.expect("failed to create wal backup runtime");
rt.block_on(async {
wal_backup_launcher_main_loop(conf, wal_backup_launcher_rx).await;
});
}
/// Check whether wal backup is required for timeline. If yes, mark that launcher is
/// aware of current status and return the timeline.
async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
match GlobalTimelines::get(ttid).ok() {
Some(tli) => {
tli.wal_backup_attend().await;
Some(tli)
}
None => None,
}
fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
GlobalTimelines::get(ttid)
.ok()
.filter(|tli| tli.wal_backup_attend())
}
struct WalBackupTaskHandle {
@@ -128,8 +143,8 @@ async fn update_task(
ttid: TenantTimelineId,
entry: &mut WalBackupTimelineEntry,
) {
let alive_peers = entry.timeline.get_peers(conf).await;
let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await;
let alive_peers = entry.timeline.get_peers(conf);
let wal_backup_lsn = entry.timeline.get_wal_backup_lsn();
let (offloader, election_dbg_str) =
determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf);
let elected_me = Some(conf.my_id) == offloader;
@@ -168,10 +183,10 @@ const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
/// tasks. Having this in separate task simplifies locking, allows to reap
/// panics and separate elections from offloading itself.
pub async fn wal_backup_launcher_task_main(
async fn wal_backup_launcher_main_loop(
conf: SafeKeeperConf,
mut wal_backup_launcher_rx: Receiver<TenantTimelineId>,
) -> anyhow::Result<()> {
) {
info!(
"WAL backup launcher started, remote config {:?}",
conf.remote_storage
@@ -199,7 +214,7 @@ pub async fn wal_backup_launcher_task_main(
if conf.remote_storage.is_none() || !conf.wal_backup_enabled {
continue; /* just drain the channel and do nothing */
}
let timeline = is_wal_backup_required(ttid).await;
let timeline = is_wal_backup_required(ttid);
// do we need to do anything at all?
if timeline.is_some() != tasks.contains_key(&ttid) {
if let Some(timeline) = timeline {
@@ -254,7 +269,7 @@ async fn backup_task_main(
let tli = res.unwrap();
let mut wb = WalBackupTask {
wal_seg_size: tli.get_wal_seg_size().await,
wal_seg_size: tli.get_wal_seg_size(),
commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
timeline: tli,
timeline_dir,
@@ -311,7 +326,7 @@ impl WalBackupTask {
continue; /* nothing to do, common case as we wake up on every commit_lsn bump */
}
// Perhaps peers advanced the position, check shmem value.
backup_lsn = self.timeline.get_wal_backup_lsn().await;
backup_lsn = self.timeline.get_wal_backup_lsn();
if backup_lsn.segment_number(self.wal_seg_size)
>= commit_lsn.segment_number(self.wal_seg_size)
{
@@ -387,7 +402,6 @@ pub async fn backup_lsn_range(
let new_backup_lsn = segment.end_lsn;
timeline
.set_wal_backup_lsn(new_backup_lsn)
.await
.context("setting wal_backup_lsn")?;
*backup_lsn = new_backup_lsn;
} else {

View File

@@ -4,7 +4,7 @@
//!
use anyhow::{Context, Result};
use postgres_backend::QueryError;
use std::{future, time::Duration};
use std::{future, thread, time::Duration};
use tokio::net::TcpStream;
use tokio_io_timeout::TimeoutReader;
use tracing::*;
@@ -16,82 +16,104 @@ use crate::SafeKeeperConf;
use postgres_backend::{AuthType, PostgresBackend};
/// Accept incoming TCP connections and spawn them into a background thread.
pub async fn task_main(
conf: SafeKeeperConf,
pg_listener: std::net::TcpListener,
) -> anyhow::Result<()> {
// Tokio's from_std won't do this for us, per its comment.
pg_listener.set_nonblocking(true)?;
pub fn thread_main(conf: SafeKeeperConf, pg_listener: std::net::TcpListener) {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.context("create runtime")
// todo catch error in main thread
.expect("failed to create runtime");
let listener = tokio::net::TcpListener::from_std(pg_listener)?;
let mut connection_count: ConnectionCount = 0;
runtime
.block_on(async move {
// Tokio's from_std won't do this for us, per its comment.
pg_listener.set_nonblocking(true)?;
let listener = tokio::net::TcpListener::from_std(pg_listener)?;
let mut connection_count: ConnectionCount = 0;
loop {
let (socket, peer_addr) = listener.accept().await.context("accept")?;
debug!("accepted connection from {}", peer_addr);
let conf = conf.clone();
let conn_id = issue_connection_id(&mut connection_count);
loop {
match listener.accept().await {
Ok((socket, peer_addr)) => {
debug!("accepted connection from {}", peer_addr);
let conf = conf.clone();
let conn_id = issue_connection_id(&mut connection_count);
tokio::spawn(async move {
if let Err(err) = handle_socket(socket, conf, conn_id)
.instrument(info_span!("", cid = %conn_id))
.await
{
error!("connection handler exited: {}", err);
let _ = thread::Builder::new()
.name("WAL service thread".into())
.spawn(move || {
if let Err(err) = handle_socket(socket, conf, conn_id) {
error!("connection handler exited: {}", err);
}
})
.unwrap();
}
Err(e) => error!("Failed to accept connection: {}", e),
}
}
});
}
#[allow(unreachable_code)] // hint compiler the closure return type
Ok::<(), anyhow::Error>(())
})
.expect("listener failed")
}
/// This is run by `task_main` above, inside a background thread.
/// This is run by `thread_main` above, inside a background thread.
///
async fn handle_socket(
fn handle_socket(
socket: TcpStream,
conf: SafeKeeperConf,
conn_id: ConnectionId,
) -> Result<(), QueryError> {
let _enter = info_span!("", cid = %conn_id).entered();
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
socket.set_nodelay(true)?;
let peer_addr = socket.peer_addr()?;
// Set timeout on reading from the socket. It prevents hanged up connection
// if client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by
// default, and tokio doesn't provide ability to set it out of the box.
let mut socket = TimeoutReader::new(socket);
let wal_service_timeout = Duration::from_secs(60 * 10);
socket.set_timeout(Some(wal_service_timeout));
// pin! is here because TimeoutReader (due to storing sleep future inside)
// is not Unpin, and all pgbackend/framed/tokio dependencies require stream
// to be Unpin. Which is reasonable, as indeed something like TimeoutReader
// shouldn't be moved.
tokio::pin!(socket);
// TimeoutReader wants async runtime during creation.
runtime.block_on(async move {
// Set timeout on reading from the socket. It prevents hanged up connection
// if client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by
// default, and tokio doesn't provide ability to set it out of the box.
let mut socket = TimeoutReader::new(socket);
let wal_service_timeout = Duration::from_secs(60 * 10);
socket.set_timeout(Some(wal_service_timeout));
// pin! is here because TimeoutReader (due to storing sleep future inside)
// is not Unpin, and all pgbackend/framed/tokio dependencies require stream
// to be Unpin. Which is reasonable, as indeed something like TimeoutReader
// shouldn't be moved.
tokio::pin!(socket);
let traffic_metrics = TrafficMetrics::new();
if let Some(current_az) = conf.availability_zone.as_deref() {
traffic_metrics.set_sk_az(current_az);
}
let traffic_metrics = TrafficMetrics::new();
if let Some(current_az) = conf.availability_zone.as_deref() {
traffic_metrics.set_sk_az(current_az);
}
let socket = MeasuredStream::new(
socket,
|cnt| {
traffic_metrics.observe_read(cnt);
},
|cnt| {
traffic_metrics.observe_write(cnt);
},
);
let socket = MeasuredStream::new(
socket,
|cnt| {
traffic_metrics.observe_read(cnt);
},
|cnt| {
traffic_metrics.observe_write(cnt);
},
);
let auth_type = match conf.auth {
None => AuthType::Trust,
Some(_) => AuthType::NeonJWT,
};
let mut conn_handler =
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
// libpq protocol between safekeeper and walproposer / pageserver
// We don't use shutdown.
pgbackend
.run(&mut conn_handler, future::pending::<()>)
.await
let auth_type = match conf.auth {
None => AuthType::Trust,
Some(_) => AuthType::NeonJWT,
};
let mut conn_handler =
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
// libpq protocol between safekeeper and walproposer / pageserver
// We don't use shutdown.
pgbackend
.run(&mut conn_handler, future::pending::<()>)
.await
})
}
/// Unique WAL service connection ids are logged in spans for observability.

View File

@@ -8,47 +8,54 @@
//! Note that last file has `.partial` suffix, that's different from postgres.
use anyhow::{bail, Context, Result};
use bytes::Bytes;
use futures::future::BoxFuture;
use remote_storage::RemotePath;
use std::io::{self, Seek, SeekFrom};
use std::pin::Pin;
use tokio::io::AsyncRead;
use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName};
use postgres_ffi::{XLogSegNo, PG_TLI};
use remote_storage::RemotePath;
use std::cmp::{max, min};
use std::io::{self, SeekFrom};
use bytes::Bytes;
use std::fs::{self, remove_file, File, OpenOptions};
use std::io::Write;
use std::path::{Path, PathBuf};
use std::pin::Pin;
use tokio::fs::{self, remove_file, File, OpenOptions};
use tokio::io::{AsyncRead, AsyncWriteExt};
use tokio::io::{AsyncReadExt, AsyncSeekExt};
use tracing::*;
use utils::{id::TenantTimelineId, lsn::Lsn};
use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
use crate::safekeeper::SafeKeeperState;
use crate::wal_backup::read_object;
use crate::SafeKeeperConf;
use postgres_ffi::waldecoder::WalStreamDecoder;
use postgres_ffi::XLogFileName;
use postgres_ffi::XLOG_BLCKSZ;
use pq_proto::SystemId;
use utils::{id::TenantTimelineId, lsn::Lsn};
#[async_trait::async_trait]
use postgres_ffi::waldecoder::WalStreamDecoder;
use pq_proto::SystemId;
use tokio::io::{AsyncReadExt, AsyncSeekExt};
pub trait Storage {
/// LSN of last durably stored WAL record.
fn flush_lsn(&self) -> Lsn;
/// Write piece of WAL from buf to disk, but not necessarily sync it.
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>;
fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>;
/// Truncate WAL at specified LSN, which must be the end of WAL record.
async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>;
fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>;
/// Durably store WAL on disk, up to the last written WAL record.
async fn flush_wal(&mut self) -> Result<()>;
fn flush_wal(&mut self) -> Result<()>;
/// Remove all segments <= given segno. Returns function doing that as we
/// want to perform it without timeline lock.
fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>>;
/// Remove all segments <= given segno. Returns closure as we want to do
/// that without timeline lock.
fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>>;
/// Release resources associated with the storage -- technically, close FDs.
/// Currently we don't remove timelines until restart (#3146), so need to
@@ -98,22 +105,6 @@ pub struct PhysicalStorage {
/// - points to write_lsn, so no seek is needed for writing
/// - doesn't point to the end of the segment
file: Option<File>,
/// When false, we have just initialized storage using the LSN from find_end_of_wal().
/// In this case, [`write_lsn`] can be less than actually written WAL on disk. In particular,
/// there can be a case with unexpected .partial file.
///
/// Imagine the following:
/// - 000000010000000000000001
/// - it was fully written, but the last record is split between 2 segments
/// - after restart, find_end_of_wal() returned 0/1FFFFF0, which is in the end of this segment
/// - write_lsn, write_record_lsn and flush_record_lsn were initialized to 0/1FFFFF0
/// - 000000010000000000000002.partial
/// - it has only 1 byte written, which is not enough to make a full WAL record
///
/// Partial segment 002 has no WAL records, and it will be removed by the next truncate_wal().
/// This flag will be set to true after the first truncate_wal() call.
is_truncated_after_restart: bool,
}
impl PhysicalStorage {
@@ -173,7 +164,6 @@ impl PhysicalStorage {
flush_record_lsn: flush_lsn,
decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000),
file: None,
is_truncated_after_restart: false,
})
}
@@ -188,37 +178,33 @@ impl PhysicalStorage {
}
/// Call fdatasync if config requires so.
async fn fdatasync_file(&mut self, file: &mut File) -> Result<()> {
fn fdatasync_file(&mut self, file: &mut File) -> Result<()> {
if !self.conf.no_sync {
self.metrics
.observe_flush_seconds(time_io_closure(file.sync_data()).await?);
.observe_flush_seconds(time_io_closure(|| Ok(file.sync_data()?))?);
}
Ok(())
}
/// Call fsync if config requires so.
async fn fsync_file(&mut self, file: &mut File) -> Result<()> {
fn fsync_file(&mut self, file: &mut File) -> Result<()> {
if !self.conf.no_sync {
self.metrics
.observe_flush_seconds(time_io_closure(file.sync_all()).await?);
.observe_flush_seconds(time_io_closure(|| Ok(file.sync_all()?))?);
}
Ok(())
}
/// Open or create WAL segment file. Caller must call seek to the wanted position.
/// Returns `file` and `is_partial`.
async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> {
fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> {
let (wal_file_path, wal_file_partial_path) =
wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
// Try to open already completed segment
if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path).await {
if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
Ok((file, false))
} else if let Ok(file) = OpenOptions::new()
.write(true)
.open(&wal_file_partial_path)
.await
{
} else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) {
// Try to open existing partial file
Ok((file, true))
} else {
@@ -227,36 +213,35 @@ impl PhysicalStorage {
.create(true)
.write(true)
.open(&wal_file_partial_path)
.await
.with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?;
write_zeroes(&mut file, self.wal_seg_size).await?;
self.fsync_file(&mut file).await?;
write_zeroes(&mut file, self.wal_seg_size)?;
self.fsync_file(&mut file)?;
Ok((file, true))
}
}
/// Write WAL bytes, which are known to be located in a single WAL segment.
async fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> {
fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> {
let mut file = if let Some(file) = self.file.take() {
file
} else {
let (mut file, is_partial) = self.open_or_create(segno).await?;
let (mut file, is_partial) = self.open_or_create(segno)?;
assert!(is_partial, "unexpected write into non-partial segment file");
file.seek(SeekFrom::Start(xlogoff as u64)).await?;
file.seek(SeekFrom::Start(xlogoff as u64))?;
file
};
file.write_all(buf).await?;
file.write_all(buf)?;
if xlogoff + buf.len() == self.wal_seg_size {
// If we reached the end of a WAL segment, flush and close it.
self.fdatasync_file(&mut file).await?;
self.fdatasync_file(&mut file)?;
// Rename partial file to completed file
let (wal_file_path, wal_file_partial_path) =
wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
fs::rename(wal_file_partial_path, wal_file_path).await?;
fs::rename(wal_file_partial_path, wal_file_path)?;
} else {
// otherwise, file can be reused later
self.file = Some(file);
@@ -270,11 +255,11 @@ impl PhysicalStorage {
/// be flushed separately later.
///
/// Updates `write_lsn`.
async fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> {
fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> {
if self.write_lsn != pos {
// need to flush the file before discarding it
if let Some(mut file) = self.file.take() {
self.fdatasync_file(&mut file).await?;
self.fdatasync_file(&mut file)?;
}
self.write_lsn = pos;
@@ -292,8 +277,7 @@ impl PhysicalStorage {
buf.len()
};
self.write_in_segment(segno, xlogoff, &buf[..bytes_write])
.await?;
self.write_in_segment(segno, xlogoff, &buf[..bytes_write])?;
self.write_lsn += bytes_write as u64;
buf = &buf[bytes_write..];
}
@@ -302,7 +286,6 @@ impl PhysicalStorage {
}
}
#[async_trait::async_trait]
impl Storage for PhysicalStorage {
/// flush_lsn returns LSN of last durably stored WAL record.
fn flush_lsn(&self) -> Lsn {
@@ -310,7 +293,7 @@ impl Storage for PhysicalStorage {
}
/// Write WAL to disk.
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
// Disallow any non-sequential writes, which can result in gaps or overwrites.
// If we need to move the pointer, use truncate_wal() instead.
if self.write_lsn > startpos {
@@ -328,7 +311,7 @@ impl Storage for PhysicalStorage {
);
}
let write_seconds = time_io_closure(self.write_exact(startpos, buf)).await?;
let write_seconds = time_io_closure(|| self.write_exact(startpos, buf))?;
// WAL is written, updating write metrics
self.metrics.observe_write_seconds(write_seconds);
self.metrics.observe_write_bytes(buf.len());
@@ -357,14 +340,14 @@ impl Storage for PhysicalStorage {
Ok(())
}
async fn flush_wal(&mut self) -> Result<()> {
fn flush_wal(&mut self) -> Result<()> {
if self.flush_record_lsn == self.write_record_lsn {
// no need to do extra flush
return Ok(());
}
if let Some(mut unflushed_file) = self.file.take() {
self.fdatasync_file(&mut unflushed_file).await?;
self.fdatasync_file(&mut unflushed_file)?;
self.file = Some(unflushed_file);
} else {
// We have unflushed data (write_lsn != flush_lsn), but no file.
@@ -386,7 +369,7 @@ impl Storage for PhysicalStorage {
/// Truncate written WAL by removing all WAL segments after the given LSN.
/// end_pos must point to the end of the WAL record.
async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
// Streaming must not create a hole, so truncate cannot be called on non-written lsn
if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
bail!(
@@ -398,51 +381,47 @@ impl Storage for PhysicalStorage {
// Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on
// disk (this happens on each connect).
if self.is_truncated_after_restart
&& end_pos == self.write_lsn
&& end_pos == self.flush_record_lsn
{
if end_pos == self.write_lsn {
return Ok(());
}
// Close previously opened file, if any
if let Some(mut unflushed_file) = self.file.take() {
self.fdatasync_file(&mut unflushed_file).await?;
self.fdatasync_file(&mut unflushed_file)?;
}
let xlogoff = end_pos.segment_offset(self.wal_seg_size);
let segno = end_pos.segment_number(self.wal_seg_size);
// Remove all segments after the given LSN.
remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno).await?;
remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno)?;
let (mut file, is_partial) = self.open_or_create(segno).await?;
let (mut file, is_partial) = self.open_or_create(segno)?;
// Fill end with zeroes
file.seek(SeekFrom::Start(xlogoff as u64)).await?;
write_zeroes(&mut file, self.wal_seg_size - xlogoff).await?;
self.fdatasync_file(&mut file).await?;
file.seek(SeekFrom::Start(xlogoff as u64))?;
write_zeroes(&mut file, self.wal_seg_size - xlogoff)?;
self.fdatasync_file(&mut file)?;
if !is_partial {
// Make segment partial once again
let (wal_file_path, wal_file_partial_path) =
wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
fs::rename(wal_file_path, wal_file_partial_path).await?;
fs::rename(wal_file_path, wal_file_partial_path)?;
}
// Update LSNs
self.write_lsn = end_pos;
self.write_record_lsn = end_pos;
self.flush_record_lsn = end_pos;
self.is_truncated_after_restart = true;
Ok(())
}
fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> {
fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
let timeline_dir = self.timeline_dir.clone();
let wal_seg_size = self.wal_seg_size;
Box::pin(async move {
remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to).await
Box::new(move |segno_up_to: XLogSegNo| {
remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to)
})
}
@@ -457,7 +436,7 @@ impl Storage for PhysicalStorage {
}
/// Remove all WAL segments in timeline_dir that match the given predicate.
async fn remove_segments_from_disk(
fn remove_segments_from_disk(
timeline_dir: &Path,
wal_seg_size: usize,
remove_predicate: impl Fn(XLogSegNo) -> bool,
@@ -466,8 +445,8 @@ async fn remove_segments_from_disk(
let mut min_removed = u64::MAX;
let mut max_removed = u64::MIN;
let mut entries = fs::read_dir(timeline_dir).await?;
while let Some(entry) = entries.next_entry().await? {
for entry in fs::read_dir(timeline_dir)? {
let entry = entry?;
let entry_path = entry.path();
let fname = entry_path.file_name().unwrap();
@@ -478,7 +457,7 @@ async fn remove_segments_from_disk(
}
let (segno, _) = XLogFromFileName(fname_str, wal_seg_size);
if remove_predicate(segno) {
remove_file(entry_path).await?;
remove_file(entry_path)?;
n_removed += 1;
min_removed = min(min_removed, segno);
max_removed = max(max_removed, segno);
@@ -710,12 +689,12 @@ impl WalReader {
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
/// Helper for filling file with zeroes.
async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
while count >= XLOG_BLCKSZ {
file.write_all(ZERO_BLOCK).await?;
file.write_all(ZERO_BLOCK)?;
count -= XLOG_BLCKSZ;
}
file.write_all(&ZERO_BLOCK[0..count]).await?;
file.write_all(&ZERO_BLOCK[0..count])?;
Ok(())
}

191
scripts/comment-test-report.js Executable file → Normal file
View File

@@ -1,5 +1,3 @@
#! /usr/bin/env node
//
// The script parses Allure reports and posts a comment with a summary of the test results to the PR or to the latest commit in the branch.
//
@@ -21,7 +19,7 @@
// })
//
// Equivalent of Python's defaultdict.
// Analog of Python's defaultdict.
//
// const dm = new DefaultMap(() => new DefaultMap(() => []))
// dm["firstKey"]["secondKey"].push("value")
@@ -34,7 +32,34 @@ class DefaultMap extends Map {
}
}
const parseReportJson = async ({ reportJsonUrl, fetch }) => {
module.exports = async ({ github, context, fetch, report }) => {
// Marker to find the comment in the subsequent runs
const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
// If we run the script in the PR or in the branch (main/release/...)
const isPullRequest = !!context.payload.pull_request
// Latest commit in PR or in the branch
const commitSha = isPullRequest ? context.payload.pull_request.head.sha : context.sha
// Let users know that the comment is updated automatically
const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results<br>${commitSha} at ${new Date().toISOString()} :recycle:</sub></div>`
// GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
const githubActionsBotId = 41898282
// Commend body itself
let commentBody = `${startMarker}\n`
// Common parameters for GitHub API requests
const ownerRepoParams = {
owner: context.repo.owner,
repo: context.repo.repo,
}
const {reportUrl, reportJsonUrl} = report
if (!reportUrl || !reportJsonUrl) {
commentBody += `#### No tests were run or test report is not available\n`
commentBody += autoupdateNotice
return
}
const suites = await (await fetch(reportJsonUrl)).json()
// Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
@@ -58,7 +83,7 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => {
let buildType, pgVersion
const match = test.name.match(/[\[-](?<buildType>debug|release)-pg(?<pgVersion>\d+)[-\]]/)?.groups
if (match) {
({ buildType, pgVersion } = match)
({buildType, pgVersion} = match)
} else {
// It's ok, we embed BUILD_TYPE and Postgres Version into the test name only for regress suite and do not for other suites (like performance).
console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`)
@@ -98,68 +123,37 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => {
}
}
return {
failedTests,
failedTestsCount,
passedTests,
passedTestsCount,
skippedTests,
skippedTestsCount,
flakyTests,
flakyTestsCount,
retriedTests,
pgVersions,
}
}
const reportSummary = async (params) => {
const {
failedTests,
failedTestsCount,
passedTests,
passedTestsCount,
skippedTests,
skippedTestsCount,
flakyTests,
flakyTestsCount,
retriedTests,
pgVersions,
reportUrl,
} = params
let summary = ""
const totalTestsCount = failedTestsCount + passedTestsCount + skippedTestsCount
summary += `### ${totalTestsCount} tests run: ${passedTestsCount} passed, ${failedTestsCount} failed, ${skippedTestsCount} skipped ([full report](${reportUrl}))\n___\n`
commentBody += `### ${totalTestsCount} tests run: ${passedTestsCount} passed, ${failedTestsCount} failed, ${skippedTestsCount} skipped ([full report](${reportUrl}))\n___\n`
// Print test resuls from the newest to the oldest Postgres version for release and debug builds.
for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
if (Object.keys(failedTests[pgVersion]).length > 0) {
summary += `#### Failures on Posgres ${pgVersion}\n\n`
commentBody += `#### Failures on Posgres ${pgVersion}\n\n`
for (const [testName, tests] of Object.entries(failedTests[pgVersion])) {
const links = []
for (const test of tests) {
const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
links.push(`[${test.buildType}](${allureLink})`)
}
summary += `- \`${testName}\`: ${links.join(", ")}\n`
commentBody += `- \`${testName}\`: ${links.join(", ")}\n`
}
const testsToRerun = Object.values(failedTests[pgVersion]).map(x => x[0].name)
const command = `DEFAULT_PG_VERSION=${pgVersion} scripts/pytest -k "${testsToRerun.join(" or ")}"`
summary += "```\n"
summary += `# Run failed on Postgres ${pgVersion} tests locally:\n`
summary += `${command}\n`
summary += "```\n"
commentBody += "```\n"
commentBody += `# Run failed on Postgres ${pgVersion} tests locally:\n`
commentBody += `${command}\n`
commentBody += "```\n"
}
}
if (flakyTestsCount > 0) {
summary += `<details>\n<summary>Flaky tests (${flakyTestsCount})</summary>\n\n`
commentBody += `<details>\n<summary>Flaky tests (${flakyTestsCount})</summary>\n\n`
for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
if (Object.keys(flakyTests[pgVersion]).length > 0) {
summary += `#### Postgres ${pgVersion}\n\n`
commentBody += `#### Postgres ${pgVersion}\n\n`
for (const [testName, tests] of Object.entries(flakyTests[pgVersion])) {
const links = []
for (const test of tests) {
@@ -167,57 +161,11 @@ const reportSummary = async (params) => {
const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
links.push(`[${status} ${test.buildType}](${allureLink})`)
}
summary += `- \`${testName}\`: ${links.join(", ")}\n`
commentBody += `- \`${testName}\`: ${links.join(", ")}\n`
}
}
}
summary += "\n</details>\n"
}
return summary
}
module.exports = async ({ github, context, fetch, report }) => {
// Marker to find the comment in the subsequent runs
const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
// If we run the script in the PR or in the branch (main/release/...)
const isPullRequest = !!context.payload.pull_request
// Latest commit in PR or in the branch
const commitSha = isPullRequest ? context.payload.pull_request.head.sha : context.sha
// Let users know that the comment is updated automatically
const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results<br>${commitSha} at ${new Date().toISOString()} :recycle:</sub></div>`
// GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
const githubActionsBotId = 41898282
// Commend body itself
let commentBody = `${startMarker}\n`
// Common parameters for GitHub API requests
const ownerRepoParams = {
owner: context.repo.owner,
repo: context.repo.repo,
}
const {reportUrl, reportJsonUrl} = report
if (!reportUrl || !reportJsonUrl) {
commentBody += `#### No tests were run or test report is not available\n`
commentBody += autoupdateNotice
return
}
try {
const parsed = await parseReportJson({ reportJsonUrl, fetch })
commentBody += await reportSummary({ ...parsed, reportUrl })
} catch (error) {
commentBody += `### [full report](${reportUrl})\n___\n`
commentBody += `#### Failed to create a summary for the test run: \n`
commentBody += "```\n"
commentBody += `${error.stack}\n`
commentBody += "```\n"
commentBody += "\nTo reproduce and debug the error locally run:\n"
commentBody += "```\n"
commentBody += `scripts/comment-test-report.js ${reportJsonUrl}`
commentBody += "\n```\n"
commentBody += "\n</details>\n"
}
commentBody += autoupdateNotice
@@ -259,60 +207,3 @@ module.exports = async ({ github, context, fetch, report }) => {
})
}
}
// Equivalent of Python's `if __name__ == "__main__":`
// https://nodejs.org/docs/latest/api/modules.html#accessing-the-main-module
if (require.main === module) {
// Poor man's argument parsing: we expect the third argument is a JSON URL (0: node binary, 1: this script, 2: JSON url)
if (process.argv.length !== 3) {
console.error(`Unexpected number of arguments\nUsage: node ${process.argv[1]} <jsonUrl>`)
process.exit(1)
}
const jsonUrl = process.argv[2]
try {
new URL(jsonUrl)
} catch (error) {
console.error(`Invalid URL: ${jsonUrl}\nUsage: node ${process.argv[1]} <jsonUrl>`)
process.exit(1)
}
const htmlUrl = jsonUrl.replace("/data/suites.json", "/index.html")
const githubMock = {
rest: {
issues: {
createComment: console.log,
listComments: async () => ({ data: [] }),
updateComment: console.log
},
repos: {
createCommitComment: console.log,
listCommentsForCommit: async () => ({ data: [] }),
updateCommitComment: console.log
}
}
}
const contextMock = {
repo: {
owner: 'testOwner',
repo: 'testRepo'
},
payload: {
number: 42,
pull_request: null,
},
sha: '0000000000000000000000000000000000000000',
}
module.exports({
github: githubMock,
context: contextMock,
fetch: fetch,
report: {
reportUrl: htmlUrl,
reportJsonUrl: jsonUrl,
}
})
}

View File

@@ -1,14 +1,12 @@
#!/usr/bin/env python3
import argparse
import json
import logging
import os
import sys
from contextlib import contextmanager
from datetime import datetime
from pathlib import Path
import backoff
import psycopg2
import psycopg2.extras
@@ -37,20 +35,9 @@ def get_connection_cursor():
connstr = os.getenv("DATABASE_URL")
if not connstr:
err("DATABASE_URL environment variable is not set")
@backoff.on_exception(backoff.expo, psycopg2.OperationalError, max_time=150)
def connect(connstr):
conn = psycopg2.connect(connstr, connect_timeout=30)
conn.autocommit = True
return conn
conn = connect(connstr)
try:
with psycopg2.connect(connstr, connect_timeout=30) as conn:
with conn.cursor() as cur:
yield cur
finally:
if conn is not None:
conn.close()
def create_table(cur):
@@ -128,7 +115,6 @@ def main():
parser.add_argument(
"--ingest",
type=Path,
required=True,
help="Path to perf test result file, or directory with perf test result files",
)
parser.add_argument("--initdb", action="store_true", help="Initialuze database")
@@ -154,5 +140,4 @@ def main():
if __name__ == "__main__":
logging.getLogger("backoff").addHandler(logging.StreamHandler())
main()

View File

@@ -1,13 +1,11 @@
#!/usr/bin/env python3
import argparse
import logging
import os
import re
import sys
from contextlib import contextmanager
from pathlib import Path
import backoff
import psycopg2
CREATE_TABLE = """
@@ -31,20 +29,9 @@ def get_connection_cursor():
connstr = os.getenv("DATABASE_URL")
if not connstr:
err("DATABASE_URL environment variable is not set")
@backoff.on_exception(backoff.expo, psycopg2.OperationalError, max_time=150)
def connect(connstr):
conn = psycopg2.connect(connstr, connect_timeout=30)
conn.autocommit = True
return conn
conn = connect(connstr)
try:
with psycopg2.connect(connstr, connect_timeout=30) as conn:
with conn.cursor() as cur:
yield cur
finally:
if conn is not None:
conn.close()
def create_table(cur):
@@ -114,5 +101,4 @@ def main():
if __name__ == "__main__":
logging.getLogger("backoff").addHandler(logging.StreamHandler())
main()

View File

@@ -32,7 +32,6 @@ pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}");
pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms";
pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_millis(5000);
// BrokerServiceClient charged with tonic provided Channel transport; helps to
// avoid depending on tonic directly in user crates.
@@ -59,8 +58,7 @@ where
}
tonic_endpoint = tonic_endpoint
.http2_keep_alive_interval(keepalive_interval)
.keep_alive_while_idle(true)
.connect_timeout(DEFAULT_CONNECT_TIMEOUT);
.keep_alive_while_idle(true);
// keep_alive_timeout is 20s by default on both client and server side
let channel = tonic_endpoint.connect_lazy();
Ok(BrokerClientChannel::new(channel))

View File

@@ -1,6 +1,6 @@
import json
from pathlib import Path
from typing import Any, List, MutableMapping, cast
from typing import List
import pytest
from _pytest.config import Config
@@ -56,15 +56,3 @@ def pytest_collection_modifyitems(config: Config, items: List[pytest.Item]):
# Rerun 3 times = 1 original run + 2 reruns
log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times")
item.add_marker(pytest.mark.flaky(reruns=2))
# pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns),
# we can workaround it by setting `timeout_func_only` to True[1].
# Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2],
# but we still can do it using pytest marker.
#
# - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99
# - [2] https://github.com/pytest-dev/pytest-timeout/issues/142
timeout_marker = item.get_closest_marker("timeout")
if timeout_marker is not None:
kwargs = cast(MutableMapping[str, Any], timeout_marker.kwargs)
kwargs["func_only"] = True

View File

@@ -57,16 +57,14 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
"libmetrics_launch_timestamp",
"libmetrics_build_info",
"libmetrics_tracing_event_count_total",
"pageserver_materialized_cache_hits_total",
"pageserver_materialized_cache_hits_direct_total",
"pageserver_getpage_reconstruct_seconds_bucket",
"pageserver_getpage_reconstruct_seconds_count",
"pageserver_getpage_reconstruct_seconds_sum",
)
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_current_logical_size",
"pageserver_resident_physical_size",
"pageserver_getpage_reconstruct_seconds_bucket",
"pageserver_getpage_reconstruct_seconds_count",
"pageserver_getpage_reconstruct_seconds_sum",
"pageserver_getpage_get_reconstruct_data_seconds_bucket",
"pageserver_getpage_get_reconstruct_data_seconds_count",
"pageserver_getpage_get_reconstruct_data_seconds_sum",
@@ -75,6 +73,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_io_operations_seconds_count",
"pageserver_io_operations_seconds_sum",
"pageserver_last_record_lsn",
"pageserver_materialized_cache_hits_total",
"pageserver_materialized_cache_hits_direct_total",
"pageserver_read_num_fs_layers_bucket",
"pageserver_read_num_fs_layers_count",
"pageserver_read_num_fs_layers_sum",

View File

@@ -1631,8 +1631,6 @@ class NeonPageserver(PgProtocol):
r".*ERROR.*ancestor timeline \S+ is being stopped",
# this is expected given our collaborative shutdown approach for the UploadQueue
".*Compaction failed, retrying in .*: queue is in state Stopped.*",
# Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
".*Error processing HTTP request: NotFound: Timeline .* was not found",
]
def start(
@@ -2914,6 +2912,7 @@ SKIP_FILES = frozenset(
"pg_internal.init",
"pg.log",
"zenith.signal",
"neon_compute_spec_id.txt",
"pg_hba.conf",
"postgresql.conf",
"postmaster.opts",

View File

@@ -342,11 +342,6 @@ class PageserverHttpClient(requests.Session):
return res_json
def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs):
"""
Note that deletion is not instant, it is scheduled and performed mostly in the background.
So if you need to wait for it to complete use `timeline_delete_wait_completed`.
For longer description consult with pageserver openapi spec.
"""
res = self.delete(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", **kwargs
)

View File

@@ -193,30 +193,19 @@ def wait_for_upload_queue_empty(
time.sleep(0.2)
def wait_timeline_detail_404(
pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
):
last_exc = None
for _ in range(2):
time.sleep(0.250)
try:
data = pageserver_http.timeline_detail(tenant_id, timeline_id)
log.error(f"detail {data}")
except PageserverApiException as e:
log.debug(e)
if e.status_code == 404:
return
last_exc = e
raise last_exc or RuntimeError(f"Timeline wasnt deleted in time, state: {data['state']}")
def timeline_delete_wait_completed(
def assert_timeline_detail_404(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
timeline_id: TimelineId,
**delete_args,
):
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id)
"""Asserts that timeline_detail returns 404, or dumps the detail."""
try:
data = pageserver_http.timeline_detail(tenant_id, timeline_id)
log.error(f"detail {data}")
except PageserverApiException as e:
log.error(e)
if e.status_code == 404:
return
else:
raise
raise Exception("detail succeeded (it should return 404)")

View File

@@ -15,11 +15,7 @@ from fixtures.neon_fixtures import (
PortDistributor,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.pg_version import PgVersion
from fixtures.types import Lsn
from pytest import FixtureRequest
@@ -421,7 +417,7 @@ def check_neon_works(
)
shutil.rmtree(repo_dir / "local_fs_remote_storage")
timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id)
pageserver_http.timeline_delete(tenant_id, timeline_id)
pageserver_http.timeline_create(pg_version, tenant_id, timeline_id)
pg_bin.run(
["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"]

View File

@@ -14,11 +14,7 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
)
from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import subprocess_capture
@@ -155,7 +151,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
".*files not bound to index_file.json, proceeding with their deletion.*"
)
timeline_delete_wait_completed(client, tenant, timeline)
client.timeline_delete(tenant, timeline)
# Importing correct backup works
import_tar(base_tar, wal_tar)

View File

@@ -24,13 +24,7 @@ def test_basic_eviction(
test_name="test_download_remote_layers_api",
)
env = neon_env_builder.init_start(
initial_tenant_conf={
# disable gc and compaction background loops because they perform on-demand downloads
"gc_period": "0s",
"compaction_period": "0s",
}
)
env = neon_env_builder.init_start()
client = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
@@ -53,11 +47,6 @@ def test_basic_eviction(
client.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload(client, tenant_id, timeline_id, current_lsn)
# disable compute & sks to avoid on-demand downloads by walreceiver / getpage
endpoint.stop()
for sk in env.safekeepers:
sk.stop()
timeline_path = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
initial_local_layers = sorted(
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))

View File

@@ -163,6 +163,7 @@ def test_forward_params_to_client(static_proxy: NeonProxy):
assert conn.get_parameter_status(name) == value
@pytest.mark.timeout(5)
def test_close_on_connections_exit(static_proxy: NeonProxy):
# Open two connections, send SIGTERM, then ensure that proxy doesn't exit
# until after connections close.

View File

@@ -20,7 +20,7 @@ from fixtures.neon_fixtures import (
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
assert_timeline_detail_404,
wait_for_last_record_lsn,
wait_for_upload,
wait_until_tenant_active,
@@ -597,11 +597,14 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
env.pageserver.allowed_errors.append(
".* ERROR .*Error processing HTTP request: InternalServerError\\(timeline is Stopping"
)
client.timeline_delete(tenant_id, timeline_id)
env.pageserver.allowed_errors.append(f".*Timeline {tenant_id}/{timeline_id} was not found.*")
env.pageserver.allowed_errors.append(
".*files not bound to index_file.json, proceeding with their deletion.*"
)
timeline_delete_wait_completed(client, tenant_id, timeline_id)
wait_until(2, 0.5, lambda: assert_timeline_detail_404(client, tenant_id, timeline_id))
assert not timeline_path.exists()

View File

@@ -11,7 +11,6 @@ from fixtures.neon_fixtures import (
wait_for_wal_insert_lsn,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import timeline_delete_wait_completed
from fixtures.pg_version import PgVersion, xfail_on_postgres
from fixtures.types import Lsn, TenantId, TimelineId
@@ -629,12 +628,12 @@ def test_get_tenant_size_with_multiple_branches(
size_debug_file_before.write(size_debug)
# teardown, delete branches, and the size should be going down
timeline_delete_wait_completed(http_client, tenant_id, first_branch_timeline_id)
http_client.timeline_delete(tenant_id, first_branch_timeline_id)
size_after_deleting_first = http_client.tenant_size(tenant_id)
assert size_after_deleting_first < size_after_thinning_branch
timeline_delete_wait_completed(http_client, tenant_id, second_branch_timeline_id)
http_client.timeline_delete(tenant_id, second_branch_timeline_id)
size_after_deleting_second = http_client.tenant_size(tenant_id)
assert size_after_deleting_second < size_after_deleting_first

View File

@@ -1,10 +1,6 @@
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.pageserver.utils import (
assert_tenant_state,
timeline_delete_wait_completed,
wait_until_tenant_active,
)
from fixtures.pageserver.utils import assert_tenant_state, wait_until_tenant_active
from fixtures.types import TenantId, TimelineId
from fixtures.utils import wait_until
@@ -28,7 +24,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
def delete_all_timelines(tenant: TenantId):
timelines = [TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)]
for t in timelines:
timeline_delete_wait_completed(client, tenant, t)
client.timeline_delete(tenant, t)
# Create tenant, start compute
tenant, _ = env.neon_cli.create_tenant()

View File

@@ -21,7 +21,6 @@ from fixtures.neon_fixtures import (
RemoteStorageKind,
available_remote_storages,
)
from fixtures.pageserver.utils import timeline_delete_wait_completed
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
from prometheus_client.samples import Sample
@@ -214,7 +213,7 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
# Test (a subset of) pageserver global metrics
for metric in PAGESERVER_GLOBAL_METRICS:
ps_samples = ps_metrics.query_all(metric, {})
assert len(ps_samples) > 0, f"expected at least one sample for {metric}"
assert len(ps_samples) > 0
for sample in ps_samples:
labels = ",".join([f'{key}="{value}"' for key, value in sample.labels.items()])
log.info(f"{sample.name}{{{labels}}} {sample.value}")
@@ -319,10 +318,9 @@ def test_pageserver_with_empty_tenants(
client.tenant_create(tenant_with_empty_timelines)
temp_timelines = client.timeline_list(tenant_with_empty_timelines)
for temp_timeline in temp_timelines:
timeline_delete_wait_completed(
client, tenant_with_empty_timelines, TimelineId(temp_timeline["timeline_id"])
client.timeline_delete(
tenant_with_empty_timelines, TimelineId(temp_timeline["timeline_id"])
)
files_in_timelines_dir = sum(
1
for _p in Path.iterdir(

View File

@@ -17,10 +17,9 @@ from fixtures.neon_fixtures import (
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
assert_timeline_detail_404,
wait_for_last_record_lsn,
wait_for_upload,
wait_timeline_detail_404,
wait_until_tenant_active,
wait_until_timeline_state,
)
@@ -84,7 +83,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
wait_until(
number_of_iterations=3,
interval=0.2,
func=lambda: timeline_delete_wait_completed(ps_http, env.initial_tenant, leaf_timeline_id),
func=lambda: ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id),
)
assert not timeline_path.exists()
@@ -95,15 +94,15 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found",
) as exc:
ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
assert exc.value.status_code == 404
wait_until(
number_of_iterations=3,
interval=0.2,
func=lambda: timeline_delete_wait_completed(
ps_http, env.initial_tenant, parent_timeline_id
),
)
# FIXME leaves tenant without timelines, should we prevent deletion of root timeline?
wait_until(
number_of_iterations=3,
interval=0.2,
func=lambda: ps_http.timeline_delete(env.initial_tenant, parent_timeline_id),
)
assert exc.value.status_code == 404
# Check that we didn't pick up the timeline again after restart.
# See https://github.com/neondatabase/neon/issues/3560
@@ -144,6 +143,7 @@ def test_delete_timeline_post_rm_failure(
ps_http.configure_failpoints((failpoint_name, "return"))
ps_http.timeline_delete(env.initial_tenant, env.initial_timeline)
timeline_info = wait_until_timeline_state(
pageserver_http=ps_http,
tenant_id=env.initial_tenant,
@@ -165,7 +165,13 @@ def test_delete_timeline_post_rm_failure(
# this should succeed
# this also checks that delete can be retried even when timeline is in Broken state
timeline_delete_wait_completed(ps_http, env.initial_tenant, env.initial_timeline)
ps_http.timeline_delete(env.initial_tenant, env.initial_timeline, timeout=2)
with pytest.raises(PageserverApiException) as e:
ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
assert e.value.status_code == 404
env.pageserver.allowed_errors.append(f".*NotFound: Timeline.*{env.initial_timeline}.*")
env.pageserver.allowed_errors.append(
f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*"
)
@@ -241,7 +247,13 @@ def test_timeline_resurrection_on_attach(
pass
# delete new timeline
timeline_delete_wait_completed(ps_http, tenant_id=tenant_id, timeline_id=branch_timeline_id)
ps_http.timeline_delete(tenant_id=tenant_id, timeline_id=branch_timeline_id)
env.pageserver.allowed_errors.append(
f".*Timeline {tenant_id}/{branch_timeline_id} was not found.*"
)
wait_until(2, 0.5, lambda: assert_timeline_detail_404(ps_http, tenant_id, branch_timeline_id))
##### Stop the pageserver instance, erase all its data
env.endpoints.stop_all()
@@ -326,6 +338,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
)
ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
timeline_info = wait_until_timeline_state(
pageserver_http=ps_http,
tenant_id=env.initial_tenant,
@@ -344,15 +357,12 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
# Wait for tenant to finish loading.
wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=1)
try:
data = ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
log.debug(f"detail {data}")
except PageserverApiException as e:
log.debug(e)
if e.status_code != 404:
raise
else:
raise Exception("detail succeeded (it should return 404)")
env.pageserver.allowed_errors.append(
f".*Timeline {env.initial_tenant}/{leaf_timeline_id} was not found.*"
)
wait_until(
2, 0.5, lambda: assert_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id)
)
assert (
not leaf_timeline_path.exists()
@@ -379,8 +389,13 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
assert env.initial_timeline is not None
for timeline_id in (intermediate_timeline_id, env.initial_timeline):
timeline_delete_wait_completed(
ps_http, tenant_id=env.initial_tenant, timeline_id=timeline_id
ps_http.timeline_delete(env.initial_tenant, timeline_id)
env.pageserver.allowed_errors.append(
f".*Timeline {env.initial_tenant}/{timeline_id} was not found.*"
)
wait_until(
2, 0.5, lambda: assert_timeline_detail_404(ps_http, env.initial_tenant, timeline_id)
)
assert_prefix_empty(
@@ -404,27 +419,23 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
)
@pytest.mark.parametrize(
"stuck_failpoint",
["persist_deleted_index_part", "in_progress_delete"],
)
def test_concurrent_timeline_delete_stuck_on(
neon_env_builder: NeonEnvBuilder, stuck_failpoint: str
def test_concurrent_timeline_delete_if_first_stuck_at_index_upload(
neon_env_builder: NeonEnvBuilder,
):
"""
If delete is stuck console will eventually retry deletion.
So we need to be sure that these requests wont interleave with each other.
In this tests we check two places where we can spend a lot of time.
This is a regression test because there was a bug when DeletionGuard wasnt propagated
to the background task.
Ensure that when retry comes if we're still stuck request will get an immediate error response,
signalling to console that it should retry later.
If we're stuck uploading the index file with the is_delete flag,
eventually console will hand up and retry.
If we're still stuck at the retry time, ensure that the retry
fails with status 500, signalling to console that it should retry
later.
Ideally, timeline_delete should return 202 Accepted and require
console to poll for completion, but, that would require changing
the API contract.
"""
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
test_name=f"concurrent_timeline_delete_stuck_on_{stuck_failpoint}",
test_name="test_concurrent_timeline_delete_if_first_stuck_at_index_upload",
)
env = neon_env_builder.init_start()
@@ -434,14 +445,13 @@ def test_concurrent_timeline_delete_stuck_on(
ps_http = env.pageserver.http_client()
# make the first call sleep practically forever
ps_http.configure_failpoints((stuck_failpoint, "pause"))
failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
ps_http.configure_failpoints((failpoint_name, "pause"))
def first_call(result_queue):
try:
log.info("first call start")
timeline_delete_wait_completed(
ps_http, env.initial_tenant, child_timeline_id, timeout=10
)
ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=10)
log.info("first call success")
result_queue.put("success")
except Exception:
@@ -456,7 +466,7 @@ def test_concurrent_timeline_delete_stuck_on(
def first_call_hit_failpoint():
assert env.pageserver.log_contains(
f".*{child_timeline_id}.*at failpoint {stuck_failpoint}"
f".*{child_timeline_id}.*at failpoint {failpoint_name}"
)
wait_until(50, 0.1, first_call_hit_failpoint)
@@ -474,12 +484,8 @@ def test_concurrent_timeline_delete_stuck_on(
)
log.info("second call failed as expected")
# ensure it is not 404 and stopping
detail = ps_http.timeline_detail(env.initial_tenant, child_timeline_id)
assert detail["state"] == "Stopping"
# by now we know that the second call failed, let's ensure the first call will finish
ps_http.configure_failpoints((stuck_failpoint, "off"))
ps_http.configure_failpoints((failpoint_name, "off"))
result = first_call_result.get()
assert result == "success"
@@ -492,10 +498,8 @@ def test_concurrent_timeline_delete_stuck_on(
def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
"""
If the client hangs up before we start the index part upload but after deletion is scheduled
we mark it
If the client hangs up before we start the index part upload but after we mark it
deleted in local memory, a subsequent delete_timeline call should be able to do
another delete timeline operation.
This tests cancel safety up to the given failpoint.
@@ -511,18 +515,12 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
ps_http = env.pageserver.http_client()
failpoint_name = "persist_deleted_index_part"
failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
ps_http.configure_failpoints((failpoint_name, "pause"))
with pytest.raises(requests.exceptions.Timeout):
ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
env.pageserver.allowed_errors.append(
f".*{child_timeline_id}.*timeline deletion is already in progress.*"
)
with pytest.raises(PageserverApiException, match="timeline deletion is already in progress"):
ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
# make sure the timeout was due to the failpoint
at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*"
@@ -554,7 +552,12 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
wait_until(50, 0.1, first_request_finished)
# check that the timeline is gone
wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id)
notfound_message = f"Timeline {env.initial_tenant}/{child_timeline_id} was not found"
env.pageserver.allowed_errors.append(".*" + notfound_message)
with pytest.raises(PageserverApiException, match=notfound_message) as exc:
ps_http.timeline_detail(env.initial_tenant, child_timeline_id)
assert exc.value.status_code == 404
@pytest.mark.parametrize(
@@ -613,7 +616,12 @@ def test_timeline_delete_works_for_remote_smoke(
for timeline_id in reversed(timeline_ids):
# note that we need to finish previous deletion before scheduling next one
# otherwise we can get an "HasChildren" error if deletion is not fast enough (real_s3)
timeline_delete_wait_completed(ps_http, tenant_id=tenant_id, timeline_id=timeline_id)
ps_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id)
env.pageserver.allowed_errors.append(
f".*Timeline {env.initial_tenant}/{timeline_id} was not found.*"
)
wait_until(2, 0.5, lambda: assert_timeline_detail_404(ps_http, tenant_id, timeline_id))
assert_prefix_empty(
neon_env_builder,

View File

@@ -24,7 +24,6 @@ from fixtures.neon_fixtures import (
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
assert_tenant_state,
timeline_delete_wait_completed,
wait_for_upload_queue_empty,
wait_until_tenant_active,
)
@@ -273,7 +272,7 @@ def test_timeline_initial_logical_size_calculation_cancellation(
if deletion_method == "tenant_detach":
client.tenant_detach(tenant_id)
elif deletion_method == "timeline_delete":
timeline_delete_wait_completed(client, tenant_id, timeline_id)
client.timeline_delete(tenant_id, timeline_id)
delete_timeline_success.put(True)
except PageserverApiException:
delete_timeline_success.put(False)

View File

@@ -31,11 +31,7 @@ from fixtures.neon_fixtures import (
SafekeeperPort,
available_remote_storages,
)
from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.pg_version import PgVersion
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import get_dir_size, query_scalar, start_in_background
@@ -552,15 +548,15 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
f"sk_id={sk.id} to flush {last_lsn}",
)
ps_http = env.pageserver.http_client()
pageserver_lsn = Lsn(ps_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
ps_cli = env.pageserver.http_client()
pageserver_lsn = Lsn(ps_cli.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
lag = last_lsn - pageserver_lsn
log.info(
f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb"
)
endpoint.stop_and_destroy()
timeline_delete_wait_completed(ps_http, tenant_id, timeline_id)
ps_cli.timeline_delete(tenant_id, timeline_id)
# Also delete and manually create timeline on safekeepers -- this tests
# scenario of manual recovery on different set of safekeepers.
@@ -575,21 +571,11 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
pg_version = sk.http_client().timeline_status(tenant_id, timeline_id).pg_version
# Terminate first all safekeepers to prevent communication unexpectantly
# advancing peer_horizon_lsn.
for sk in env.safekeepers:
cli = sk.http_client()
cli.timeline_delete_force(tenant_id, timeline_id)
# restart safekeeper to clear its in-memory state
sk.stop()
# wait all potenital in flight pushes to broker arrive before starting
# safekeepers (even without sleep, it is very unlikely they are not
# delivered yet).
time.sleep(1)
for sk in env.safekeepers:
sk.start()
cli = sk.http_client()
sk.stop().start()
cli.timeline_create(tenant_id, timeline_id, pg_version, last_lsn)
f_partial_path = (
Path(sk.data_dir()) / str(tenant_id) / str(timeline_id) / f_partial_saved.name
@@ -597,7 +583,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
shutil.copy(f_partial_saved, f_partial_path)
# recreate timeline on pageserver from scratch
ps_http.timeline_create(
ps_cli.timeline_create(
pg_version=PgVersion(pg_version),
tenant_id=tenant_id,
new_timeline_id=timeline_id,
@@ -612,7 +598,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
if elapsed > wait_lsn_timeout:
raise RuntimeError("Timed out waiting for WAL redo")
tenant_status = ps_http.tenant_status(tenant_id)
tenant_status = ps_cli.tenant_status(tenant_id)
if tenant_status["state"]["slug"] == "Loading":
log.debug(f"Tenant {tenant_id} is still loading, retrying")
else:

View File

@@ -1,5 +1,3 @@
import time
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
from fixtures.types import Lsn, TenantId
@@ -42,10 +40,7 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
# Kills one of the safekeepers and ensures that only the active ones are printed in the state.
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
# Trigger WAL wait timeout faster
neon_env_builder.pageserver_config_override = """
wait_lsn_timeout = "1s"
tenant_config={walreceiver_connect_timeout = "2s", lagging_wal_timeout = "2s"}
"""
neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'"
# Have notable SK ids to ensure we check logs for their presence, not some other random numbers
neon_env_builder.safekeepers_id_start = 12345
neon_env_builder.num_safekeepers = 3
@@ -75,8 +70,6 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil
stopped_safekeeper_id = stopped_safekeeper.id
log.info(f"Stopping safekeeper {stopped_safekeeper.id}")
stopped_safekeeper.stop()
# sleep until stopped safekeeper is removed from candidates
time.sleep(2)
# Spend some more time inserting, to ensure SKs report updated statuses and walreceiver in PS have time to update its connection stats.
insert_test_elements(env, tenant_id, start=elements_to_insert + 1, count=elements_to_insert)