mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-10 22:20:38 +00:00
Compare commits
290 Commits
problame/c
...
release-pr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7cf0f6b37e | ||
|
|
03c2c569be | ||
|
|
eff6d4538a | ||
|
|
5ef7782e9c | ||
|
|
73101db8c4 | ||
|
|
bccdfc6d39 | ||
|
|
99595813bb | ||
|
|
fe07b54758 | ||
|
|
a42d173e7b | ||
|
|
e07f689238 | ||
|
|
7831eddc88 | ||
|
|
943b1bc80c | ||
|
|
95a184e9b7 | ||
|
|
3fa17e9d17 | ||
|
|
55e0fd9789 | ||
|
|
2a88889f44 | ||
|
|
5bad8126dc | ||
|
|
27bc242085 | ||
|
|
192b49cc6d | ||
|
|
e1b60f3693 | ||
|
|
2804f5323b | ||
|
|
676adc6b32 | ||
|
|
96a4e8de66 | ||
|
|
01180666b0 | ||
|
|
6c94269c32 | ||
|
|
edc691647d | ||
|
|
855d7b4781 | ||
|
|
c49c9707ce | ||
|
|
2227540a0d | ||
|
|
f1347f2417 | ||
|
|
30b295b017 | ||
|
|
1cef395266 | ||
|
|
78d160f76d | ||
|
|
b9238059d6 | ||
|
|
d0cb4b88c8 | ||
|
|
1ec3e39d4e | ||
|
|
a1a74eef2c | ||
|
|
90e689adda | ||
|
|
f0b2d4b053 | ||
|
|
299d9474c9 | ||
|
|
7234208b36 | ||
|
|
93450f11f5 | ||
|
|
2f0f9edf33 | ||
|
|
d424f2b7c8 | ||
|
|
21315e80bc | ||
|
|
483b66d383 | ||
|
|
aa72a22661 | ||
|
|
5c0264b591 | ||
|
|
9f13277729 | ||
|
|
54aa319805 | ||
|
|
4a227484bf | ||
|
|
2f83f85291 | ||
|
|
d6cfcb0d93 | ||
|
|
392843ad2a | ||
|
|
bd4dae8f4a | ||
|
|
b05fe53cfd | ||
|
|
c13a2f0df1 | ||
|
|
39be366fc5 | ||
|
|
6eda0a3158 | ||
|
|
306c7a1813 | ||
|
|
80be423a58 | ||
|
|
5dcfef82f2 | ||
|
|
e67b8f69c0 | ||
|
|
e546872ab4 | ||
|
|
322ea1cf7c | ||
|
|
3633742de9 | ||
|
|
079d3a37ba | ||
|
|
a46e77b476 | ||
|
|
a92702b01e | ||
|
|
8ff3253f20 | ||
|
|
04b82c92a7 | ||
|
|
e5bf423e68 | ||
|
|
60af392e45 | ||
|
|
661fc41e71 | ||
|
|
702c488f32 | ||
|
|
45c5122754 | ||
|
|
558394f710 | ||
|
|
73b0898608 | ||
|
|
e65be4c2dc | ||
|
|
40087b8164 | ||
|
|
c762b59483 | ||
|
|
5d71601ca9 | ||
|
|
a113c3e433 | ||
|
|
e81fc598f4 | ||
|
|
48b845fa76 | ||
|
|
27096858dc | ||
|
|
4430d0ae7d | ||
|
|
6e183aa0de | ||
|
|
fd6d0b7635 | ||
|
|
3710c32aae | ||
|
|
be83bee49d | ||
|
|
cf28e5922a | ||
|
|
7d384d6953 | ||
|
|
4b3b37b912 | ||
|
|
1d8d200f4d | ||
|
|
0d80d6ce18 | ||
|
|
f653ee039f | ||
|
|
e614a95853 | ||
|
|
850db4cc13 | ||
|
|
8a316b1277 | ||
|
|
4d13bae449 | ||
|
|
49377abd98 | ||
|
|
a6b2f4e54e | ||
|
|
face60d50b | ||
|
|
9768aa27f2 | ||
|
|
96b2e575e1 | ||
|
|
7222777784 | ||
|
|
5469fdede0 | ||
|
|
72aa6b9fdd | ||
|
|
ae0634b7be | ||
|
|
70711f32fa | ||
|
|
52a88af0aa | ||
|
|
b7a43bf817 | ||
|
|
dce91b33a4 | ||
|
|
23ee4f3050 | ||
|
|
46857e8282 | ||
|
|
368ab0ce54 | ||
|
|
a5987eebfd | ||
|
|
6686ede30f | ||
|
|
373c7057cc | ||
|
|
7d6ec16166 | ||
|
|
0e6fdc8a58 | ||
|
|
521438a5c6 | ||
|
|
07d7874bc8 | ||
|
|
1804111a02 | ||
|
|
cd0178efed | ||
|
|
333574be57 | ||
|
|
79a799a143 | ||
|
|
9da06af6c9 | ||
|
|
ce1753d036 | ||
|
|
67db8432b4 | ||
|
|
4e2e44e524 | ||
|
|
ed786104f3 | ||
|
|
84b74f2bd1 | ||
|
|
fec2ad6283 | ||
|
|
98eebd4682 | ||
|
|
2f74287c9b | ||
|
|
aee1bf95e3 | ||
|
|
b9de9d75ff | ||
|
|
7943b709e6 | ||
|
|
d7d066d493 | ||
|
|
e78ac22107 | ||
|
|
76a8f2bb44 | ||
|
|
8d59a8581f | ||
|
|
b1ddd01289 | ||
|
|
6eae4fc9aa | ||
|
|
765455bca2 | ||
|
|
4204960942 | ||
|
|
67345d66ea | ||
|
|
2266ee5971 | ||
|
|
b58445d855 | ||
|
|
36050e7f3d | ||
|
|
33360ed96d | ||
|
|
39a28d1108 | ||
|
|
efa6aa134f | ||
|
|
2c724e56e2 | ||
|
|
feff887c6f | ||
|
|
353d915fcf | ||
|
|
2e38098cbc | ||
|
|
a6fe5ea1ac | ||
|
|
05b0aed0c1 | ||
|
|
cd1705357d | ||
|
|
6bc7561290 | ||
|
|
fbd3ac14b5 | ||
|
|
e437787c8f | ||
|
|
3460dbf90b | ||
|
|
6b89d99677 | ||
|
|
6cc8ea86e4 | ||
|
|
e62a492d6f | ||
|
|
a475cdf642 | ||
|
|
7002c79a47 | ||
|
|
ee6cf357b4 | ||
|
|
e5c2086b5f | ||
|
|
5f1208296a | ||
|
|
88e8e473cd | ||
|
|
b0a77844f6 | ||
|
|
1baf464307 | ||
|
|
e9b8e81cea | ||
|
|
85d6194aa4 | ||
|
|
333a7a68ef | ||
|
|
6aa4e41bee | ||
|
|
840183e51f | ||
|
|
cbccc94b03 | ||
|
|
fce227df22 | ||
|
|
bd787e800f | ||
|
|
4a7704b4a3 | ||
|
|
ff1119da66 | ||
|
|
4c3ba1627b | ||
|
|
1407174fb2 | ||
|
|
ec9dcb1889 | ||
|
|
d11d781afc | ||
|
|
4e44565b71 | ||
|
|
4ed51ad33b | ||
|
|
1c1ebe5537 | ||
|
|
c19cb7f386 | ||
|
|
4b97d31b16 | ||
|
|
923ade3dd7 | ||
|
|
b04e711975 | ||
|
|
afd0a6b39a | ||
|
|
99752286d8 | ||
|
|
15df93363c | ||
|
|
bc0ab741af | ||
|
|
51d9dfeaa3 | ||
|
|
f63cb18155 | ||
|
|
0de603d88e | ||
|
|
240913912a | ||
|
|
91a4ea0de2 | ||
|
|
8608704f49 | ||
|
|
efef68ce99 | ||
|
|
8daefd24da | ||
|
|
46cc8b7982 | ||
|
|
38cd90dd0c | ||
|
|
a51b269f15 | ||
|
|
43bf6d0a0f | ||
|
|
15273a9b66 | ||
|
|
78aca668d0 | ||
|
|
acbf4148ea | ||
|
|
6508540561 | ||
|
|
a41b5244a8 | ||
|
|
2b3189be95 | ||
|
|
248563c595 | ||
|
|
14cd6ca933 | ||
|
|
eb36403e71 | ||
|
|
3c6f779698 | ||
|
|
f67f0c1c11 | ||
|
|
edb02d3299 | ||
|
|
664a69e65b | ||
|
|
478322ebf9 | ||
|
|
802f174072 | ||
|
|
47f9890bae | ||
|
|
262265daad | ||
|
|
300da5b872 | ||
|
|
7b22b5c433 | ||
|
|
ffca97bc1e | ||
|
|
cb356f3259 | ||
|
|
c85374295f | ||
|
|
4992160677 | ||
|
|
bd535b3371 | ||
|
|
d90c5a03af | ||
|
|
2d02cc9079 | ||
|
|
49ad94b99f | ||
|
|
948a217398 | ||
|
|
125381eae7 | ||
|
|
cd01bbc715 | ||
|
|
d8b5e3b88d | ||
|
|
06d25f2186 | ||
|
|
f759b561f3 | ||
|
|
ece0555600 | ||
|
|
73ea0a0b01 | ||
|
|
d8f6d6fd6f | ||
|
|
d24de169a7 | ||
|
|
0816168296 | ||
|
|
277b44d57a | ||
|
|
68c2c3880e | ||
|
|
49da498f65 | ||
|
|
2c76ba3dd7 | ||
|
|
dbe3dc69ad | ||
|
|
8e5bb3ed49 | ||
|
|
ab0be7b8da | ||
|
|
b4c55f5d24 | ||
|
|
ede70d833c | ||
|
|
70c3d18bb0 | ||
|
|
7a491f52c4 | ||
|
|
323c4ecb4f | ||
|
|
3d2466607e | ||
|
|
ed478b39f4 | ||
|
|
91585a558d | ||
|
|
93467eae1f | ||
|
|
f3aac81d19 | ||
|
|
979ad60c19 | ||
|
|
9316cb1b1f | ||
|
|
e7939a527a | ||
|
|
36d26665e1 | ||
|
|
873347f977 | ||
|
|
e814ac16f9 | ||
|
|
ad3055d386 | ||
|
|
94e03eb452 | ||
|
|
380f26ef79 | ||
|
|
3c5b7f59d7 | ||
|
|
fee89f80b5 | ||
|
|
41cce8eaf1 | ||
|
|
f88fe0218d | ||
|
|
cc856eca85 | ||
|
|
cf350c6002 | ||
|
|
0ce6b6a0a3 | ||
|
|
73f247d537 | ||
|
|
960be82183 | ||
|
|
806e5a6c19 | ||
|
|
8d5df07cce | ||
|
|
df7a9d1407 |
8
.github/workflows/build_and_test.yml
vendored
8
.github/workflows/build_and_test.yml
vendored
@@ -432,9 +432,8 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug ]
|
||||
pg_version: [ v15 ]
|
||||
pageserver_compaction_algorithm_kind: [ "legacy", "tiered" ]
|
||||
build_type: [ debug, release ]
|
||||
pg_version: [ v14, v15, v16 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
@@ -462,9 +461,6 @@ jobs:
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
PAGESERVER_GET_IMPL: vectored
|
||||
PAGESERVER_VALIDATE_VEC_GET: true
|
||||
PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM: 'kind="${{ matrix.pageserver_compaction_algorithm_kind }}"'
|
||||
# catch the tests that override `tenant_config` as a whole without specifying the compaction algorithm `kind`
|
||||
NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM: true
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||
|
||||
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@@ -53,7 +53,7 @@ jobs:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
cat << EOF > body.md
|
||||
## Storage & Compute release ${RELEASE_DATE}
|
||||
## Release ${RELEASE_DATE}
|
||||
|
||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||
EOF
|
||||
|
||||
@@ -243,15 +243,12 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY patches/pgvector.patch /pgvector.patch
|
||||
|
||||
# By default, pgvector Makefile uses `-march=native`. We don't want that,
|
||||
# because we build the images on different machines than where we run them.
|
||||
# Pass OPTFLAGS="" to remove it.
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
|
||||
echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /pgvector.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
|
||||
#########################################################################################
|
||||
|
||||
@@ -451,8 +451,6 @@ impl EvictionPolicy {
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
pub enum CompactionAlgorithm {
|
||||
#[strum(disabled)]
|
||||
NotSpecified,
|
||||
Legacy,
|
||||
Tiered,
|
||||
}
|
||||
|
||||
@@ -178,13 +178,6 @@ impl PgConnectionConfig {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for PgConnectionConfig {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// The password is intentionally hidden and not part of this display string.
|
||||
write!(f, "postgresql://{}:{}", self.host, self.port)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for PgConnectionConfig {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`
|
||||
|
||||
@@ -29,6 +29,7 @@ use http_types::{StatusCode, Url};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::RemoteStorageActivity;
|
||||
use crate::{
|
||||
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
|
||||
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
|
||||
@@ -525,6 +526,10 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
|
||||
Err(TimeTravelError::Unimplemented)
|
||||
}
|
||||
|
||||
fn activity(&self) -> RemoteStorageActivity {
|
||||
self.concurrency_limiter.activity()
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
|
||||
@@ -263,6 +263,17 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), TimeTravelError>;
|
||||
|
||||
/// Query how busy we currently are: may be used by callers which wish to politely
|
||||
/// back off if there are already a lot of operations underway.
|
||||
fn activity(&self) -> RemoteStorageActivity;
|
||||
}
|
||||
|
||||
pub struct RemoteStorageActivity {
|
||||
pub read_available: usize,
|
||||
pub read_total: usize,
|
||||
pub write_available: usize,
|
||||
pub write_total: usize,
|
||||
}
|
||||
|
||||
/// DownloadStream is sensitive to the timeout and cancellation used with the original
|
||||
@@ -444,6 +455,15 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn activity(&self) -> RemoteStorageActivity {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.activity(),
|
||||
Self::AwsS3(s) => s.activity(),
|
||||
Self::AzureBlob(s) => s.activity(),
|
||||
Self::Unreliable(s) => s.activity(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
@@ -774,6 +794,9 @@ struct ConcurrencyLimiter {
|
||||
// The helps to ensure we don't exceed the thresholds.
|
||||
write: Arc<Semaphore>,
|
||||
read: Arc<Semaphore>,
|
||||
|
||||
write_total: usize,
|
||||
read_total: usize,
|
||||
}
|
||||
|
||||
impl ConcurrencyLimiter {
|
||||
@@ -802,10 +825,21 @@ impl ConcurrencyLimiter {
|
||||
Arc::clone(self.for_kind(kind)).acquire_owned().await
|
||||
}
|
||||
|
||||
fn activity(&self) -> RemoteStorageActivity {
|
||||
RemoteStorageActivity {
|
||||
read_available: self.read.available_permits(),
|
||||
read_total: self.read_total,
|
||||
write_available: self.write.available_permits(),
|
||||
write_total: self.write_total,
|
||||
}
|
||||
}
|
||||
|
||||
fn new(limit: usize) -> ConcurrencyLimiter {
|
||||
Self {
|
||||
read: Arc::new(Semaphore::new(limit)),
|
||||
write: Arc::new(Semaphore::new(limit)),
|
||||
read_total: limit,
|
||||
write_total: limit,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
|
||||
use crate::{
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity,
|
||||
TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
@@ -605,6 +605,16 @@ impl RemoteStorage for LocalFs {
|
||||
) -> Result<(), TimeTravelError> {
|
||||
Err(TimeTravelError::Unimplemented)
|
||||
}
|
||||
|
||||
fn activity(&self) -> RemoteStorageActivity {
|
||||
// LocalFS has no concurrency limiting: give callers the impression that plenty of units are available
|
||||
RemoteStorageActivity {
|
||||
read_available: 16,
|
||||
read_total: 16,
|
||||
write_available: 16,
|
||||
write_total: 16,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
||||
|
||||
@@ -47,8 +47,8 @@ use utils::backoff;
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
|
||||
Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
|
||||
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config,
|
||||
TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
pub(super) mod metrics;
|
||||
@@ -975,6 +975,10 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn activity(&self) -> RemoteStorageActivity {
|
||||
self.concurrency_limiter.activity()
|
||||
}
|
||||
}
|
||||
|
||||
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
|
||||
|
||||
@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::{
|
||||
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
StorageMetadata, TimeTravelError,
|
||||
RemoteStorageActivity, StorageMetadata, TimeTravelError,
|
||||
};
|
||||
|
||||
pub struct UnreliableWrapper {
|
||||
@@ -213,4 +213,8 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
fn activity(&self) -> RemoteStorageActivity {
|
||||
self.inner.activity()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,33 +9,6 @@ use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
/// Declare a failpoint that can use the `pause` failpoint action.
|
||||
/// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||
#[macro_export]
|
||||
macro_rules! pausable_failpoint {
|
||||
($name:literal) => {
|
||||
if cfg!(feature = "testing") {
|
||||
tokio::task::spawn_blocking({
|
||||
let current = tracing::Span::current();
|
||||
move || {
|
||||
let _entered = current.entered();
|
||||
tracing::info!("at failpoint {}", $name);
|
||||
fail::fail_point!($name);
|
||||
}
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
}
|
||||
};
|
||||
($name:literal, $cond:expr) => {
|
||||
if cfg!(feature = "testing") {
|
||||
if $cond {
|
||||
pausable_failpoint!($name)
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
///
|
||||
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
|
||||
|
||||
@@ -135,8 +135,7 @@ impl Gate {
|
||||
let started_at = std::time::Instant::now();
|
||||
let mut do_close = std::pin::pin!(self.do_close());
|
||||
|
||||
// with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms
|
||||
let nag_after = Duration::from_millis(100);
|
||||
let nag_after = Duration::from_secs(1);
|
||||
|
||||
let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
|
||||
return;
|
||||
|
||||
@@ -380,8 +380,8 @@ impl interface::CompactionLayer<Key> for MockLayer {
|
||||
}
|
||||
fn file_size(&self) -> u64 {
|
||||
match self {
|
||||
MockLayer::Delta(this) => this.file_size,
|
||||
MockLayer::Image(this) => this.file_size,
|
||||
MockLayer::Delta(this) => this.file_size(),
|
||||
MockLayer::Image(this) => this.file_size(),
|
||||
}
|
||||
}
|
||||
fn short_id(&self) -> String {
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::collections::HashMap;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
||||
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output<'a> {
|
||||
layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
|
||||
layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
timeline_metadata: &'a TimelineMetadata,
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
//! See also `settings.md` for better description on every parameter.
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use pageserver_api::{models::CompactionAlgorithm, shard::TenantShardId};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use serde;
|
||||
use serde::de::IntoDeserializer;
|
||||
@@ -15,7 +15,7 @@ use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::ConnectionId;
|
||||
use utils::logging::SecretString;
|
||||
|
||||
use once_cell::sync::{Lazy, OnceCell};
|
||||
use once_cell::sync::OnceCell;
|
||||
use reqwest::Url;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::str::FromStr;
|
||||
@@ -1067,19 +1067,6 @@ impl PageServerConf {
|
||||
|
||||
conf.default_tenant_conf = t_conf.merge(TenantConf::default());
|
||||
|
||||
{
|
||||
const VAR_NAME: &str = "NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM";
|
||||
static VAR: Lazy<Option<bool>> = Lazy::new(|| utils::env::var(VAR_NAME));
|
||||
if VAR.unwrap_or(false)
|
||||
&& conf.default_tenant_conf.compaction_algorithm.kind
|
||||
== CompactionAlgorithm::NotSpecified
|
||||
{
|
||||
panic!(
|
||||
"Unspecified compaction algorithm in default tenant configuration. \
|
||||
Set the algorithm explicitly in the pageserver.toml's `tenant_config` field or unset the environment variable {VAR_NAME}");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(conf)
|
||||
}
|
||||
|
||||
|
||||
@@ -534,7 +534,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
});
|
||||
}
|
||||
EvictionLayer::Secondary(layer) => {
|
||||
let file_size = layer.metadata.file_size;
|
||||
let file_size = layer.metadata.file_size();
|
||||
|
||||
js.spawn(async move {
|
||||
layer
|
||||
@@ -641,7 +641,7 @@ impl EvictionLayer {
|
||||
pub(crate) fn get_file_size(&self) -> u64 {
|
||||
match self {
|
||||
Self::Attached(l) => l.layer_desc().file_size,
|
||||
Self::Secondary(sl) => sl.metadata.file_size,
|
||||
Self::Secondary(sl) => sl.metadata.file_size(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -395,7 +395,7 @@ async fn build_timeline_info_common(
|
||||
let guard = timeline.last_received_wal.lock().unwrap();
|
||||
if let Some(info) = guard.as_ref() {
|
||||
(
|
||||
Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
|
||||
Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
|
||||
Some(info.last_received_msg_lsn),
|
||||
Some(info.last_received_msg_ts),
|
||||
)
|
||||
|
||||
@@ -260,8 +260,6 @@ async fn page_service_conn_main(
|
||||
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
|
||||
let socket = std::pin::pin!(socket);
|
||||
|
||||
fail::fail_point!("ps::connection-start::pre-login");
|
||||
|
||||
// XXX: pgbackend.run() should take the connection_ctx,
|
||||
// and create a child per-query context when it invokes process_query.
|
||||
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
||||
@@ -605,7 +603,6 @@ impl PageServerHandler {
|
||||
};
|
||||
|
||||
trace!("query: {copy_data_bytes:?}");
|
||||
fail::fail_point!("ps::handle-pagerequest-message");
|
||||
|
||||
// Trace request if needed
|
||||
if let Some(t) = tracer.as_mut() {
|
||||
@@ -620,7 +617,6 @@ impl PageServerHandler {
|
||||
|
||||
let (response, span) = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::exists");
|
||||
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -630,7 +626,6 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
|
||||
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -640,7 +635,6 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::GetPage(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::getpage");
|
||||
// shard_id is filled in by the handler
|
||||
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
|
||||
(
|
||||
@@ -651,7 +645,6 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
|
||||
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -661,7 +654,6 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::GetSlruSegment(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
|
||||
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -1513,7 +1505,6 @@ where
|
||||
_pgb: &mut PostgresBackend<IO>,
|
||||
_sm: &FeStartupPacket,
|
||||
) -> Result<(), QueryError> {
|
||||
fail::fail_point!("ps::connection-start::startup-packet");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1528,8 +1519,6 @@ where
|
||||
Err(QueryError::SimulatedConnectionError)
|
||||
});
|
||||
|
||||
fail::fail_point!("ps::connection-start::process-query");
|
||||
|
||||
let ctx = self.connection_ctx.attached_child();
|
||||
debug!("process query {query_string:?}");
|
||||
let parts = query_string.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
@@ -42,7 +42,6 @@ use utils::completion;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::failpoint_support;
|
||||
use utils::fs_ext;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::sync::gate::Gate;
|
||||
use utils::sync::gate::GateGuard;
|
||||
use utils::timeout::timeout_cancellable;
|
||||
@@ -123,6 +122,32 @@ use utils::{
|
||||
lsn::{Lsn, RecordLsn},
|
||||
};
|
||||
|
||||
/// Declare a failpoint that can use the `pause` failpoint action.
|
||||
/// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||
macro_rules! pausable_failpoint {
|
||||
($name:literal) => {
|
||||
if cfg!(feature = "testing") {
|
||||
tokio::task::spawn_blocking({
|
||||
let current = tracing::Span::current();
|
||||
move || {
|
||||
let _entered = current.entered();
|
||||
tracing::info!("at failpoint {}", $name);
|
||||
fail::fail_point!($name);
|
||||
}
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
}
|
||||
};
|
||||
($name:literal, $cond:expr) => {
|
||||
if cfg!(feature = "testing") {
|
||||
if $cond {
|
||||
pausable_failpoint!($name)
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
pub mod blob_io;
|
||||
pub mod block_io;
|
||||
pub mod vectored_blob_io;
|
||||
|
||||
@@ -238,13 +238,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
io_buf,
|
||||
Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("blob too large ({len} bytes)"),
|
||||
format!("blob too large ({} bytes)", len),
|
||||
)),
|
||||
);
|
||||
}
|
||||
if len > 0x0fff_ffff {
|
||||
tracing::warn!("writing blob above future limit ({len} bytes)");
|
||||
}
|
||||
let mut len_buf = (len as u32).to_be_bytes();
|
||||
len_buf[0] |= 0x80;
|
||||
io_buf.extend_from_slice(&len_buf[..]);
|
||||
|
||||
@@ -40,6 +40,8 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
|
||||
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
||||
pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
|
||||
super::CompactionAlgorithm::Legacy;
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
|
||||
@@ -552,13 +554,7 @@ impl Default for TenantConf {
|
||||
.expect("cannot parse default compaction period"),
|
||||
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
|
||||
compaction_algorithm: CompactionAlgorithmSettings {
|
||||
kind: if cfg!(test) {
|
||||
// Rust tests rely on a valid implicit default (TODO: fix this)
|
||||
CompactionAlgorithm::Legacy
|
||||
} else {
|
||||
// Python tests are subject to NotSpecified handling
|
||||
CompactionAlgorithm::NotSpecified
|
||||
},
|
||||
kind: DEFAULT_COMPACTION_ALGORITHM,
|
||||
},
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
|
||||
@@ -8,7 +8,7 @@ use tokio::sync::OwnedMutexGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, instrument, Instrument};
|
||||
|
||||
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
|
||||
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
|
||||
@@ -197,7 +197,6 @@ pub(crate) use upload::upload_initdb_dir;
|
||||
use utils::backoff::{
|
||||
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
use utils::pausable_failpoint;
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
@@ -1193,7 +1192,7 @@ impl RemoteTimelineClient {
|
||||
&self.storage_impl,
|
||||
uploaded.local_path(),
|
||||
&remote_path,
|
||||
uploaded.metadata().file_size,
|
||||
uploaded.metadata().file_size(),
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
@@ -1574,7 +1573,7 @@ impl RemoteTimelineClient {
|
||||
&self.storage_impl,
|
||||
local_path,
|
||||
&remote_path,
|
||||
layer_metadata.file_size,
|
||||
layer_metadata.file_size(),
|
||||
&self.cancel,
|
||||
)
|
||||
.measure_remote_op(
|
||||
@@ -1769,7 +1768,7 @@ impl RemoteTimelineClient {
|
||||
UploadOp::UploadLayer(_, m) => (
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
|
||||
),
|
||||
UploadOp::UploadMetadata(_, _) => (
|
||||
RemoteOpFileKind::Index,
|
||||
|
||||
@@ -84,7 +84,7 @@ pub async fn download_layer_file<'a>(
|
||||
)
|
||||
.await?;
|
||||
|
||||
let expected = layer_metadata.file_size;
|
||||
let expected = layer_metadata.file_size();
|
||||
if expected != bytes_amount {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
"According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
|
||||
|
||||
@@ -17,6 +17,46 @@ use pageserver_api::shard::ShardIndex;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Metadata gathered for each of the layer files.
|
||||
///
|
||||
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
|
||||
/// might have less or more metadata depending if upgrading or rolling back an upgrade.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||
//#[cfg_attr(test, derive(Default))]
|
||||
pub struct LayerFileMetadata {
|
||||
file_size: u64,
|
||||
|
||||
pub(crate) generation: Generation,
|
||||
|
||||
pub(crate) shard: ShardIndex,
|
||||
}
|
||||
|
||||
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
|
||||
fn from(other: &IndexLayerMetadata) -> Self {
|
||||
LayerFileMetadata {
|
||||
file_size: other.file_size,
|
||||
generation: other.generation,
|
||||
shard: other.shard,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerFileMetadata {
|
||||
pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
|
||||
LayerFileMetadata {
|
||||
file_size,
|
||||
generation,
|
||||
shard,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
}
|
||||
}
|
||||
|
||||
// TODO seems like another part of the remote storage file format
|
||||
// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
|
||||
/// In-memory representation of an `index_part.json` file
|
||||
///
|
||||
/// Contains the data about all files in the timeline, present remotely and its metadata.
|
||||
@@ -37,7 +77,7 @@ pub struct IndexPart {
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
/// that latest version stores.
|
||||
pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
|
||||
pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,
|
||||
|
||||
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
|
||||
// It's duplicated for convenience when reading the serialized structure, but is
|
||||
@@ -87,7 +127,10 @@ impl IndexPart {
|
||||
lineage: Lineage,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
) -> Self {
|
||||
let layer_metadata = layers_and_metadata.clone();
|
||||
let layer_metadata = layers_and_metadata
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
|
||||
.collect();
|
||||
|
||||
Self {
|
||||
version: Self::LATEST_VERSION,
|
||||
@@ -151,12 +194,9 @@ impl From<&UploadQueueInitialized> for IndexPart {
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata gathered for each of the layer files.
|
||||
///
|
||||
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
|
||||
/// might have less or more metadata depending if upgrading or rolling back an upgrade.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
|
||||
pub struct LayerFileMetadata {
|
||||
/// Serialized form of [`LayerFileMetadata`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexLayerMetadata {
|
||||
pub file_size: u64,
|
||||
|
||||
#[serde(default = "Generation::none")]
|
||||
@@ -168,12 +208,12 @@ pub struct LayerFileMetadata {
|
||||
pub shard: ShardIndex,
|
||||
}
|
||||
|
||||
impl LayerFileMetadata {
|
||||
pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
|
||||
LayerFileMetadata {
|
||||
file_size,
|
||||
generation,
|
||||
shard,
|
||||
impl From<&LayerFileMetadata> for IndexLayerMetadata {
|
||||
fn from(other: &LayerFileMetadata) -> Self {
|
||||
IndexLayerMetadata {
|
||||
file_size: other.file_size,
|
||||
generation: other.generation,
|
||||
shard: other.shard,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -267,12 +307,12 @@ mod tests {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
@@ -309,12 +349,12 @@ mod tests {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
@@ -352,12 +392,12 @@ mod tests {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 2,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
@@ -440,12 +480,12 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
version: 4,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
@@ -482,12 +522,12 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
version: 5,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 23289856,
|
||||
generation: Generation::new(1),
|
||||
shard: ShardIndex::unsharded(),
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 1015808,
|
||||
generation: Generation::new(1),
|
||||
shard: ShardIndex::unsharded(),
|
||||
@@ -529,12 +569,12 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
version: 6,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
|
||||
@@ -9,7 +9,7 @@ use std::time::SystemTime;
|
||||
use tokio::fs::{self, File};
|
||||
use tokio::io::AsyncSeekExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{backoff, pausable_failpoint};
|
||||
use utils::backoff;
|
||||
|
||||
use super::Generation;
|
||||
use crate::tenant::remote_timeline_client::{
|
||||
|
||||
@@ -45,10 +45,10 @@ use crate::tenant::{
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use chrono::format::{DelayedFormat, StrftimeItems};
|
||||
use futures::Future;
|
||||
use futures::{Future, StreamExt};
|
||||
use pageserver_api::models::SecondaryProgress;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
|
||||
use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity};
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, warn, Instrument};
|
||||
@@ -67,6 +67,12 @@ use super::{
|
||||
/// download, if the uploader populated it.
|
||||
const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);
|
||||
|
||||
/// Range of concurrency we may use when downloading layers within a timeline. This is independent
|
||||
/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
|
||||
/// `PageServerConf::secondary_download_concurrency`
|
||||
const MAX_LAYER_CONCURRENCY: usize = 16;
|
||||
const MIN_LAYER_CONCURRENCY: usize = 1;
|
||||
|
||||
pub(super) async fn downloader_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
@@ -75,18 +81,19 @@ pub(super) async fn downloader_task(
|
||||
cancel: CancellationToken,
|
||||
root_ctx: RequestContext,
|
||||
) {
|
||||
let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
|
||||
// How many tenants' secondary download operations we will run concurrently
|
||||
let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency;
|
||||
|
||||
let generator = SecondaryDownloader {
|
||||
tenant_manager,
|
||||
remote_storage,
|
||||
root_ctx,
|
||||
};
|
||||
let mut scheduler = Scheduler::new(generator, concurrency);
|
||||
let mut scheduler = Scheduler::new(generator, tenant_concurrency);
|
||||
|
||||
scheduler
|
||||
.run(command_queue, background_jobs_can_start, cancel)
|
||||
.instrument(info_span!("secondary_download_scheduler"))
|
||||
.instrument(info_span!("secondary_downloads"))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -407,7 +414,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
tracing::warn!("Insufficient space while downloading. Will retry later.");
|
||||
}
|
||||
Err(UpdateError::Cancelled) => {
|
||||
tracing::info!("Shut down while downloading");
|
||||
tracing::debug!("Shut down while downloading");
|
||||
},
|
||||
Err(UpdateError::Deserialize(e)) => {
|
||||
tracing::error!("Corrupt content while downloading tenant: {e}");
|
||||
@@ -709,7 +716,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
let mut layer_byte_count: u64 = timeline_state
|
||||
.on_disk_layers
|
||||
.values()
|
||||
.map(|l| l.metadata.file_size)
|
||||
.map(|l| l.metadata.file_size())
|
||||
.sum();
|
||||
|
||||
// Remove on-disk layers that are no longer present in heatmap
|
||||
@@ -720,7 +727,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
.get(layer_file_name)
|
||||
.unwrap()
|
||||
.metadata
|
||||
.file_size;
|
||||
.file_size();
|
||||
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
@@ -841,6 +848,8 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
|
||||
let mut download_futs = Vec::new();
|
||||
|
||||
// Download heatmap layers that are not present on local disk, or update their
|
||||
// access time if they are already present.
|
||||
for layer in timeline.layers {
|
||||
@@ -877,7 +886,9 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
|
||||
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|
||||
|| on_disk.access_time != layer.access_time
|
||||
{
|
||||
// We already have this layer on disk. Update its access time.
|
||||
tracing::debug!(
|
||||
"Access time updated for layer {}: {} -> {}",
|
||||
@@ -913,14 +924,31 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
match self
|
||||
.download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
|
||||
.await?
|
||||
{
|
||||
Some(layer) => touched.push(layer),
|
||||
None => {
|
||||
// Not an error but we didn't download it: remote layer is missing. Don't add it to the list of
|
||||
// things to consider touched.
|
||||
download_futs.push(self.download_layer(
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
layer,
|
||||
ctx,
|
||||
));
|
||||
}
|
||||
|
||||
// Break up layer downloads into chunks, so that for each chunk we can re-check how much
|
||||
// concurrency to use based on activity level of remote storage.
|
||||
while !download_futs.is_empty() {
|
||||
let chunk =
|
||||
download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY));
|
||||
|
||||
let concurrency = Self::layer_concurrency(self.remote_storage.activity());
|
||||
|
||||
let mut result_stream = futures::stream::iter(chunk).buffered(concurrency);
|
||||
let mut result_stream = std::pin::pin!(result_stream);
|
||||
while let Some(result) = result_stream.next().await {
|
||||
match result {
|
||||
Err(e) => return Err(e),
|
||||
Ok(None) => {
|
||||
// No error, but we didn't download the layer. Don't mark it touched
|
||||
}
|
||||
Ok(Some(layer)) => touched.push(layer),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -951,7 +979,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
t.name,
|
||||
t.metadata.clone(),
|
||||
LayerFileMetadata::from(&t.metadata),
|
||||
t.access_time,
|
||||
local_path,
|
||||
));
|
||||
@@ -985,18 +1013,13 @@ impl<'a> TenantDownloader<'a> {
|
||||
);
|
||||
|
||||
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
|
||||
tracing::info!(
|
||||
"Starting download of layer {}, size {}",
|
||||
layer.name,
|
||||
layer.metadata.file_size
|
||||
);
|
||||
let downloaded_bytes = match download_layer_file(
|
||||
self.conf,
|
||||
self.remote_storage,
|
||||
*tenant_shard_id,
|
||||
*timeline_id,
|
||||
&layer.name,
|
||||
&layer.metadata,
|
||||
&LayerFileMetadata::from(&layer.metadata),
|
||||
&local_path,
|
||||
&self.secondary_state.cancel,
|
||||
ctx,
|
||||
@@ -1055,6 +1078,19 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
Ok(Some(layer))
|
||||
}
|
||||
|
||||
/// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage
|
||||
fn layer_concurrency(activity: RemoteStorageActivity) -> usize {
|
||||
// When less than 75% of units are available, use minimum concurrency. Else, do a linear mapping
|
||||
// of our concurrency range to the units available within the remaining 25%.
|
||||
let clamp_at = (activity.read_total * 3) / 4;
|
||||
if activity.read_available > clamp_at {
|
||||
(MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at))
|
||||
/ (activity.read_total - clamp_at)
|
||||
} else {
|
||||
MIN_LAYER_CONCURRENCY
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
|
||||
@@ -1144,7 +1180,7 @@ async fn init_timeline_state(
|
||||
tenant_shard_id,
|
||||
&heatmap.timeline_id,
|
||||
name,
|
||||
remote_meta.metadata.clone(),
|
||||
LayerFileMetadata::from(&remote_meta.metadata),
|
||||
remote_meta.access_time,
|
||||
file_path,
|
||||
),
|
||||
@@ -1178,3 +1214,58 @@ async fn init_timeline_state(
|
||||
|
||||
detail
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn layer_concurrency() {
|
||||
// Totally idle
|
||||
assert_eq!(
|
||||
TenantDownloader::layer_concurrency(RemoteStorageActivity {
|
||||
read_available: 16,
|
||||
read_total: 16,
|
||||
write_available: 16,
|
||||
write_total: 16
|
||||
}),
|
||||
MAX_LAYER_CONCURRENCY
|
||||
);
|
||||
|
||||
// Totally busy
|
||||
assert_eq!(
|
||||
TenantDownloader::layer_concurrency(RemoteStorageActivity {
|
||||
read_available: 0,
|
||||
read_total: 16,
|
||||
|
||||
write_available: 16,
|
||||
write_total: 16
|
||||
}),
|
||||
MIN_LAYER_CONCURRENCY
|
||||
);
|
||||
|
||||
// Edge of the range at which we interpolate
|
||||
assert_eq!(
|
||||
TenantDownloader::layer_concurrency(RemoteStorageActivity {
|
||||
read_available: 12,
|
||||
read_total: 16,
|
||||
|
||||
write_available: 16,
|
||||
write_total: 16
|
||||
}),
|
||||
MIN_LAYER_CONCURRENCY
|
||||
);
|
||||
|
||||
// Midpoint of the range in which we interpolate
|
||||
assert_eq!(
|
||||
TenantDownloader::layer_concurrency(RemoteStorageActivity {
|
||||
read_available: 14,
|
||||
read_total: 16,
|
||||
|
||||
write_available: 16,
|
||||
write_total: 16
|
||||
}),
|
||||
MAX_LAYER_CONCURRENCY / 2
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::time::SystemTime;
|
||||
|
||||
use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
|
||||
use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
|
||||
@@ -38,7 +38,7 @@ pub(crate) struct HeatMapTimeline {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(crate) struct HeatMapLayer {
|
||||
pub(super) name: LayerName,
|
||||
pub(super) metadata: LayerFileMetadata,
|
||||
pub(super) metadata: IndexLayerMetadata,
|
||||
|
||||
#[serde_as(as = "TimestampSeconds<i64>")]
|
||||
pub(super) access_time: SystemTime,
|
||||
@@ -49,7 +49,7 @@ pub(crate) struct HeatMapLayer {
|
||||
impl HeatMapLayer {
|
||||
pub(crate) fn new(
|
||||
name: LayerName,
|
||||
metadata: LayerFileMetadata,
|
||||
metadata: IndexLayerMetadata,
|
||||
access_time: SystemTime,
|
||||
) -> Self {
|
||||
Self {
|
||||
|
||||
@@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task(
|
||||
|
||||
scheduler
|
||||
.run(command_queue, background_jobs_can_start, cancel)
|
||||
.instrument(info_span!("heatmap_upload_scheduler"))
|
||||
.instrument(info_span!("heatmap_uploader"))
|
||||
.await
|
||||
}
|
||||
|
||||
|
||||
@@ -179,13 +179,6 @@ where
|
||||
// Schedule some work, if concurrency limit permits it
|
||||
self.spawn_pending();
|
||||
|
||||
// This message is printed every scheduling iteration as proof of liveness when looking at logs
|
||||
tracing::info!(
|
||||
"Status: {} tasks running, {} pending",
|
||||
self.running.len(),
|
||||
self.pending.len()
|
||||
);
|
||||
|
||||
// Between scheduling iterations, we will:
|
||||
// - Drain any complete tasks and spawn pending tasks
|
||||
// - Handle incoming administrative commands
|
||||
@@ -265,11 +258,7 @@ where
|
||||
|
||||
self.tasks.spawn(fut);
|
||||
|
||||
let replaced = self.running.insert(tenant_shard_id, in_progress);
|
||||
debug_assert!(replaced.is_none());
|
||||
if replaced.is_some() {
|
||||
tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
|
||||
}
|
||||
self.running.insert(tenant_shard_id, in_progress);
|
||||
}
|
||||
|
||||
/// For all pending tenants that are elegible for execution, spawn their task.
|
||||
@@ -279,9 +268,7 @@ where
|
||||
while !self.pending.is_empty() && self.running.len() < self.concurrency {
|
||||
// unwrap: loop condition includes !is_empty()
|
||||
let pending = self.pending.pop_front().unwrap();
|
||||
if !self.running.contains_key(pending.get_tenant_shard_id()) {
|
||||
self.do_spawn(pending);
|
||||
}
|
||||
self.do_spawn(pending);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -334,8 +321,7 @@ where
|
||||
|
||||
let tenant_shard_id = job.get_tenant_shard_id();
|
||||
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
|
||||
tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Command already running, waiting for it");
|
||||
tracing::info!("Command already running, waiting for it");
|
||||
barrier
|
||||
} else {
|
||||
let running = self.spawn_now(job);
|
||||
|
||||
@@ -47,7 +47,7 @@ use hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs::File;
|
||||
@@ -473,7 +473,7 @@ impl ImageLayerInner {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let reads = self
|
||||
.plan_reads(keyspace, None, ctx)
|
||||
.plan_reads(keyspace, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?;
|
||||
|
||||
@@ -485,15 +485,9 @@ impl ImageLayerInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Traverse the layer's index to build read operations on the overlap of the input keyspace
|
||||
/// and the keys in this layer.
|
||||
///
|
||||
/// If shard_identity is provided, it will be used to filter keys down to those stored on
|
||||
/// this shard.
|
||||
async fn plan_reads(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
shard_identity: Option<&ShardIdentity>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<VectoredRead>> {
|
||||
let mut planner = VectoredReadPlanner::new(
|
||||
@@ -513,6 +507,7 @@ impl ImageLayerInner {
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut range_end_handled = false;
|
||||
|
||||
let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
range.start.write_to_byte_slice(&mut search_key);
|
||||
|
||||
@@ -525,22 +520,12 @@ impl ImageLayerInner {
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
assert!(key >= range.start);
|
||||
|
||||
let flag = if let Some(shard_identity) = shard_identity {
|
||||
if shard_identity.is_key_disposable(&key) {
|
||||
BlobFlag::Ignore
|
||||
} else {
|
||||
BlobFlag::None
|
||||
}
|
||||
} else {
|
||||
BlobFlag::None
|
||||
};
|
||||
|
||||
if key >= range.end {
|
||||
planner.handle_range_end(offset);
|
||||
range_end_handled = true;
|
||||
break;
|
||||
} else {
|
||||
planner.handle(key, self.lsn, offset, flag);
|
||||
planner.handle(key, self.lsn, offset, BlobFlag::None);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -553,50 +538,6 @@ impl ImageLayerInner {
|
||||
Ok(planner.finish())
|
||||
}
|
||||
|
||||
/// Given a key range, select the parts of that range that should be retained by the ShardIdentity,
|
||||
/// then execute vectored GET operations, passing the results of all read keys into the writer.
|
||||
pub(super) async fn filter(
|
||||
&self,
|
||||
shard_identity: &ShardIdentity,
|
||||
writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
// Fragment the range into the regions owned by this ShardIdentity
|
||||
let plan = self
|
||||
.plan_reads(
|
||||
KeySpace {
|
||||
// If asked for the total key space, plan_reads will give us all the keys in the layer
|
||||
ranges: vec![Key::MIN..Key::MAX],
|
||||
},
|
||||
Some(shard_identity),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
let mut key_count = 0;
|
||||
for read in plan.into_iter() {
|
||||
let buf_size = read.size();
|
||||
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
|
||||
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = frozen_buf.slice(meta.start..meta.end);
|
||||
|
||||
key_count += 1;
|
||||
writer
|
||||
.put_image(meta.meta.key, img_buf, ctx)
|
||||
.await
|
||||
.context(format!("Storing key {}", meta.meta.key))?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(key_count)
|
||||
}
|
||||
|
||||
async fn do_reads_and_update_state(
|
||||
&self,
|
||||
reads: Vec<VectoredRead>,
|
||||
@@ -709,7 +650,7 @@ impl ImageLayerWriterInner {
|
||||
lsn,
|
||||
},
|
||||
);
|
||||
trace!("creating image layer {}", path);
|
||||
info!("new image layer {path}");
|
||||
let mut file = {
|
||||
VirtualFile::open_with_options(
|
||||
&path,
|
||||
@@ -829,7 +770,7 @@ impl ImageLayerWriterInner {
|
||||
// FIXME: why not carry the virtualfile here, it supports renaming?
|
||||
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
|
||||
|
||||
info!("created image layer {}", layer.local_path());
|
||||
trace!("created image layer {}", layer.local_path());
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
@@ -914,136 +855,3 @@ impl Drop for ImageLayerWriter {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
|
||||
};
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
|
||||
|
||||
use super::ImageLayerWriter;
|
||||
|
||||
#[tokio::test]
|
||||
async fn image_layer_rewrite() {
|
||||
let harness = TenantHarness::create("test_image_layer_rewrite").unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
// The LSN at which we will create an image layer to filter
|
||||
let lsn = Lsn(0xdeadbeef0000);
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
|
||||
let range = input_start..input_end;
|
||||
|
||||
// Build an image layer to filter
|
||||
let resident = {
|
||||
let mut writer = ImageLayerWriter::new(
|
||||
harness.conf,
|
||||
timeline_id,
|
||||
harness.tenant_shard_id,
|
||||
&range,
|
||||
lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let foo_img = Bytes::from_static(&[1, 2, 3, 4]);
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
writer.put_image(key, foo_img.clone(), &ctx).await.unwrap();
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
writer.finish(&timeline, &ctx).await.unwrap()
|
||||
};
|
||||
let original_size = resident.metadata().file_size;
|
||||
|
||||
// Filter for various shards: this exercises cases like values at start of key range, end of key
|
||||
// range, middle of key range.
|
||||
for shard_number in 0..4 {
|
||||
let mut filtered_writer = ImageLayerWriter::new(
|
||||
harness.conf,
|
||||
timeline_id,
|
||||
harness.tenant_shard_id,
|
||||
&range,
|
||||
lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity
|
||||
// to exercise filter()
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
ShardCount::new(4),
|
||||
ShardStripeSize(0x8000),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let wrote_keys = resident
|
||||
.filter(&shard_identity, &mut filtered_writer, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let replacement = if wrote_keys > 0 {
|
||||
Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// This exact size and those below will need updating as/when the layer encoding changes, but
|
||||
// should be deterministic for a given version of the format, as we used no randomness generating the input.
|
||||
assert_eq!(original_size, 1597440);
|
||||
|
||||
match shard_number {
|
||||
0 => {
|
||||
// We should have written out just one stripe for our shard identity
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
let replacement = replacement.unwrap();
|
||||
|
||||
// We should have dropped some of the data
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
|
||||
// Assert that we dropped ~3/4 of the data.
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
}
|
||||
1 => {
|
||||
// Shard 1 has no keys in our input range
|
||||
assert_eq!(wrote_keys, 0x0);
|
||||
assert!(replacement.is_none());
|
||||
}
|
||||
2 => {
|
||||
// Shard 2 has one stripes in the input range
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
}
|
||||
3 => {
|
||||
// Shard 3 has two stripes in the input range
|
||||
assert_eq!(wrote_keys, 0x10000);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 811008);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{
|
||||
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
};
|
||||
use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Weak};
|
||||
@@ -23,10 +23,10 @@ use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
|
||||
|
||||
use super::delta_layer::{self, DeltaEntry};
|
||||
use super::image_layer::{self};
|
||||
use super::image_layer;
|
||||
use super::{
|
||||
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
|
||||
PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
|
||||
AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
|
||||
ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
|
||||
};
|
||||
|
||||
use utils::generation::Generation;
|
||||
@@ -161,7 +161,7 @@ impl Layer {
|
||||
timeline.tenant_shard_id,
|
||||
timeline.timeline_id,
|
||||
file_name,
|
||||
metadata.file_size,
|
||||
metadata.file_size(),
|
||||
);
|
||||
|
||||
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
|
||||
@@ -194,7 +194,7 @@ impl Layer {
|
||||
timeline.tenant_shard_id,
|
||||
timeline.timeline_id,
|
||||
file_name,
|
||||
metadata.file_size,
|
||||
metadata.file_size(),
|
||||
);
|
||||
|
||||
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
|
||||
@@ -227,7 +227,7 @@ impl Layer {
|
||||
|
||||
timeline
|
||||
.metrics
|
||||
.resident_physical_size_add(metadata.file_size);
|
||||
.resident_physical_size_add(metadata.file_size());
|
||||
|
||||
ResidentLayer { downloaded, owner }
|
||||
}
|
||||
@@ -1802,15 +1802,16 @@ impl ResidentLayer {
|
||||
use LayerKind::*;
|
||||
|
||||
let owner = &self.owner.0;
|
||||
|
||||
match self.downloaded.get(owner, ctx).await? {
|
||||
Delta(ref d) => {
|
||||
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
|
||||
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
|
||||
// while it's being held.
|
||||
owner
|
||||
.access_stats
|
||||
.record_access(LayerAccessKind::KeyIter, ctx);
|
||||
|
||||
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
|
||||
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
|
||||
// while it's being held.
|
||||
delta_layer::DeltaLayerInner::load_keys(d, ctx)
|
||||
.await
|
||||
.with_context(|| format!("Layer index is corrupted for {self}"))
|
||||
@@ -1819,23 +1820,6 @@ impl ResidentLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Read all they keys in this layer which match the ShardIdentity, and write them all to
|
||||
/// the provided writer. Return the number of keys written.
|
||||
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
|
||||
pub(crate) async fn filter<'a>(
|
||||
&'a self,
|
||||
shard_identity: &ShardIdentity,
|
||||
writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
use LayerKind::*;
|
||||
|
||||
match self.downloaded.get(&self.owner.0, ctx).await? {
|
||||
Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")),
|
||||
Image(i) => i.filter(shard_identity, writer, ctx).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the amount of keys and values written to the writer.
|
||||
pub(crate) async fn copy_delta_prefix(
|
||||
&self,
|
||||
|
||||
@@ -17,7 +17,7 @@ use crate::tenant::{Tenant, TenantState};
|
||||
use rand::Rng;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{backoff, completion, pausable_failpoint};
|
||||
use utils::{backoff, completion};
|
||||
|
||||
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
|
||||
once_cell::sync::Lazy::new(|| {
|
||||
|
||||
@@ -41,7 +41,6 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
fs_ext, pausable_failpoint,
|
||||
sync::gate::{Gate, GateGuard},
|
||||
vec_map::VecMap,
|
||||
};
|
||||
@@ -61,7 +60,6 @@ use std::{
|
||||
ops::ControlFlow,
|
||||
};
|
||||
|
||||
use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
|
||||
use crate::{
|
||||
aux_file::AuxFileSizeEstimator,
|
||||
tenant::{
|
||||
@@ -90,6 +88,9 @@ use crate::{
|
||||
metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
|
||||
};
|
||||
use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
|
||||
use crate::{
|
||||
pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata,
|
||||
};
|
||||
use crate::{
|
||||
pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
|
||||
virtual_file::{MaybeFatalIo, VirtualFile},
|
||||
@@ -1423,7 +1424,7 @@ impl Timeline {
|
||||
let layer_map = guard.layer_map();
|
||||
let mut size = 0;
|
||||
for l in layer_map.iter_historic_layers() {
|
||||
size += l.file_size;
|
||||
size += l.file_size();
|
||||
}
|
||||
size
|
||||
}
|
||||
@@ -1700,9 +1701,6 @@ impl Timeline {
|
||||
}
|
||||
|
||||
match self.get_compaction_algorithm_settings().kind {
|
||||
CompactionAlgorithm::NotSpecified => {
|
||||
unreachable!("should panic earlier when we construct the default tenant conf")
|
||||
}
|
||||
CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
|
||||
CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
|
||||
}
|
||||
@@ -2456,6 +2454,8 @@ impl Timeline {
|
||||
let span = tracing::Span::current();
|
||||
|
||||
// Copy to move into the task we're about to spawn
|
||||
let generation = self.generation;
|
||||
let shard = self.get_shard_index();
|
||||
let this = self.myself.upgrade().expect("&self method holds the arc");
|
||||
|
||||
let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
|
||||
@@ -2469,14 +2469,11 @@ impl Timeline {
|
||||
|
||||
for discovered in discovered {
|
||||
let (name, kind) = match discovered {
|
||||
Discovered::Layer(layer_file_name, local_metadata) => {
|
||||
discovered_layers.push((layer_file_name, local_metadata));
|
||||
Discovered::Layer(layer_file_name, local_path, file_size) => {
|
||||
discovered_layers.push((layer_file_name, local_path, file_size));
|
||||
continue;
|
||||
}
|
||||
Discovered::IgnoredBackup(path) => {
|
||||
std::fs::remove_file(path)
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.fatal_err("Removing .old file");
|
||||
Discovered::IgnoredBackup => {
|
||||
continue;
|
||||
}
|
||||
Discovered::Unknown(file_name) => {
|
||||
@@ -2502,8 +2499,13 @@ impl Timeline {
|
||||
);
|
||||
}
|
||||
|
||||
let decided =
|
||||
init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn);
|
||||
let decided = init::reconcile(
|
||||
discovered_layers,
|
||||
index_part.as_ref(),
|
||||
disk_consistent_lsn,
|
||||
generation,
|
||||
shard,
|
||||
);
|
||||
|
||||
let mut loaded_layers = Vec::new();
|
||||
let mut needs_cleanup = Vec::new();
|
||||
@@ -2511,6 +2513,21 @@ impl Timeline {
|
||||
|
||||
for (name, decision) in decided {
|
||||
let decision = match decision {
|
||||
Ok(UseRemote { local, remote }) => {
|
||||
// Remote is authoritative, but we may still choose to retain
|
||||
// the local file if the contents appear to match
|
||||
if local.metadata.file_size() == remote.file_size() {
|
||||
// Use the local file, but take the remote metadata so that we pick up
|
||||
// the correct generation.
|
||||
UseLocal(LocalLayerFileMetadata {
|
||||
metadata: remote,
|
||||
local_path: local.local_path,
|
||||
})
|
||||
} else {
|
||||
init::cleanup_local_file_for_remote(&local, &remote)?;
|
||||
UseRemote { local, remote }
|
||||
}
|
||||
}
|
||||
Ok(decision) => decision,
|
||||
Err(DismissedLayer::Future { local }) => {
|
||||
if let Some(local) = local {
|
||||
@@ -2528,11 +2545,6 @@ impl Timeline {
|
||||
// this file never existed remotely, we will have to do rework
|
||||
continue;
|
||||
}
|
||||
Err(DismissedLayer::BadMetadata(local)) => {
|
||||
init::cleanup_local_file_for_remote(&local)?;
|
||||
// this file never existed remotely, we will have to do rework
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match &name {
|
||||
@@ -2543,12 +2555,14 @@ impl Timeline {
|
||||
tracing::debug!(layer=%name, ?decision, "applied");
|
||||
|
||||
let layer = match decision {
|
||||
Resident { local, remote } => {
|
||||
total_physical_size += local.file_size;
|
||||
Layer::for_resident(conf, &this, local.local_path, name, remote)
|
||||
UseLocal(local) => {
|
||||
total_physical_size += local.metadata.file_size();
|
||||
Layer::for_resident(conf, &this, local.local_path, name, local.metadata)
|
||||
.drop_eviction_guard()
|
||||
}
|
||||
Evicted(remote) => Layer::for_evicted(conf, &this, name, remote),
|
||||
Evicted(remote) | UseRemote { remote, .. } => {
|
||||
Layer::for_evicted(conf, &this, name, remote)
|
||||
}
|
||||
};
|
||||
|
||||
loaded_layers.push(layer);
|
||||
@@ -3057,7 +3071,7 @@ impl Timeline {
|
||||
|
||||
HeatMapLayer::new(
|
||||
layer.layer_desc().layer_name(),
|
||||
layer.metadata(),
|
||||
(&layer.metadata()).into(),
|
||||
last_activity_ts,
|
||||
)
|
||||
});
|
||||
@@ -4333,7 +4347,7 @@ impl Timeline {
|
||||
let delta_file_accessed = reconstruct_state.get_delta_layers_visited();
|
||||
|
||||
let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
|
||||
debug!(
|
||||
info!(
|
||||
"generate image layers for metadata keys: trigger_generation={trigger_generation}, \
|
||||
delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
|
||||
total_key_retrieved={total_key_retrieved}"
|
||||
@@ -4711,16 +4725,11 @@ impl Timeline {
|
||||
|
||||
async fn rewrite_layers(
|
||||
self: &Arc<Self>,
|
||||
mut replace_layers: Vec<(Layer, ResidentLayer)>,
|
||||
mut drop_layers: Vec<Layer>,
|
||||
replace_layers: Vec<(Layer, ResidentLayer)>,
|
||||
drop_layers: Vec<Layer>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.layers.write().await;
|
||||
|
||||
// Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
|
||||
// to avoid double-removing, and avoid rewriting something that was removed.
|
||||
replace_layers.retain(|(l, _)| guard.contains(l));
|
||||
drop_layers.retain(|l| guard.contains(l));
|
||||
|
||||
guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics);
|
||||
|
||||
let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
|
||||
@@ -5595,6 +5604,26 @@ fn is_send() {
|
||||
_assert_send::<TimelineWriter<'_>>();
|
||||
}
|
||||
|
||||
/// Add a suffix to a layer file's name: .{num}.old
|
||||
/// Uses the first available num (starts at 0)
|
||||
fn rename_to_backup(path: &Utf8Path) -> anyhow::Result<()> {
|
||||
let filename = path
|
||||
.file_name()
|
||||
.ok_or_else(|| anyhow!("Path {path} don't have a file name"))?;
|
||||
let mut new_path = path.to_owned();
|
||||
|
||||
for i in 0u32.. {
|
||||
new_path.set_file_name(format!("{filename}.{i}.old"));
|
||||
if !new_path.exists() {
|
||||
std::fs::rename(path, &new_path)
|
||||
.with_context(|| format!("rename {path:?} to {new_path:?}"))?;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
bail!("couldn't find an unused backup number for {:?}", path)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
@@ -9,10 +9,7 @@ use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
|
||||
RecordedDuration, Timeline,
|
||||
};
|
||||
use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use enumset::EnumSet;
|
||||
@@ -25,13 +22,14 @@ use tracing::{debug, info, info_span, trace, warn, Instrument};
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
|
||||
use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
|
||||
use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole};
|
||||
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::timeline::{Layer, ResidentLayer};
|
||||
use crate::tenant::DeltaLayer;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
use crate::{page_cache, ZERO_PAGE};
|
||||
|
||||
use crate::keyspace::KeySpace;
|
||||
use crate::repository::Key;
|
||||
@@ -176,24 +174,13 @@ impl Timeline {
|
||||
async fn compact_shard_ancestors(
|
||||
self: &Arc<Self>,
|
||||
rewrite_max: usize,
|
||||
ctx: &RequestContext,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut drop_layers = Vec::new();
|
||||
let mut layers_to_rewrite: Vec<Layer> = Vec::new();
|
||||
let layers_to_rewrite: Vec<Layer> = Vec::new();
|
||||
|
||||
// We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
|
||||
// layer is behind this Lsn, it indicates that the layer is being retained beyond the
|
||||
// pitr_interval, for example because a branchpoint references it.
|
||||
//
|
||||
// Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
|
||||
// are rewriting layers.
|
||||
let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
|
||||
|
||||
tracing::info!(
|
||||
"latest_gc_cutoff: {}, pitr cutoff {}",
|
||||
*latest_gc_cutoff,
|
||||
self.gc_info.read().unwrap().cutoffs.pitr
|
||||
);
|
||||
// We will use the PITR cutoff as a condition for rewriting layers.
|
||||
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr;
|
||||
|
||||
let layers = self.layers.read().await;
|
||||
for layer_desc in layers.layer_map().iter_historic_layers() {
|
||||
@@ -252,9 +239,9 @@ impl Timeline {
|
||||
|
||||
// Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
|
||||
// without incurring the I/O cost of a rewrite.
|
||||
if layer_desc.get_lsn_range().end >= *latest_gc_cutoff {
|
||||
debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})",
|
||||
layer_desc.get_lsn_range().end, *latest_gc_cutoff);
|
||||
if layer_desc.get_lsn_range().end >= pitr_cutoff {
|
||||
debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})",
|
||||
layer_desc.get_lsn_range().end, pitr_cutoff);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -264,10 +251,13 @@ impl Timeline {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Only rewrite layers if their generations differ. This guarantees:
|
||||
// - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one
|
||||
// - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage
|
||||
if layer.metadata().generation == self.generation {
|
||||
// Only rewrite layers if they would have different remote paths: either they belong to this
|
||||
// shard but an old generation, or they belonged to another shard. This also implicitly
|
||||
// guarantees that the layer is persistent in remote storage (as only remote persistent
|
||||
// layers are carried across shard splits, any local-only layer would be in the current generation)
|
||||
if layer.metadata().generation == self.generation
|
||||
&& layer.metadata().shard.shard_count == self.shard_identity.count
|
||||
{
|
||||
debug!(%layer, "Skipping rewrite, is not from old generation");
|
||||
continue;
|
||||
}
|
||||
@@ -280,69 +270,18 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// Fall through: all our conditions for doing a rewrite passed.
|
||||
layers_to_rewrite.push(layer);
|
||||
// TODO: implement rewriting
|
||||
tracing::debug!(%layer, "Would rewrite layer");
|
||||
}
|
||||
|
||||
// Drop read lock on layer map before we start doing time-consuming I/O
|
||||
// Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`]
|
||||
drop(layers);
|
||||
|
||||
let mut replace_image_layers = Vec::new();
|
||||
|
||||
for layer in layers_to_rewrite {
|
||||
tracing::info!(layer=%layer, "Rewriting layer after shard split...");
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&layer.layer_desc().key_range,
|
||||
layer.layer_desc().image_layer_lsn(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Safety of layer rewrites:
|
||||
// - We are writing to a different local file path than we are reading from, so the old Layer
|
||||
// cannot interfere with the new one.
|
||||
// - In the page cache, contents for a particular VirtualFile are stored with a file_id that
|
||||
// is different for two layers with the same name (in `ImageLayerInner::new` we always
|
||||
// acquire a fresh id from [`crate::page_cache::next_file_id`]. So readers do not risk
|
||||
// reading the index from one layer file, and then data blocks from the rewritten layer file.
|
||||
// - Any readers that have a reference to the old layer will keep it alive until they are done
|
||||
// with it. If they are trying to promote from remote storage, that will fail, but this is the same
|
||||
// as for compaction generally: compaction is allowed to delete layers that readers might be trying to use.
|
||||
// - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
|
||||
// - GC, which at worst witnesses us "undelete" a layer that they just deleted.
|
||||
// - ingestion, which only inserts layers, therefore cannot collide with us.
|
||||
let resident = layer.download_and_keep_resident().await?;
|
||||
|
||||
let keys_written = resident
|
||||
.filter(&self.shard_identity, &mut image_layer_writer, ctx)
|
||||
.await?;
|
||||
|
||||
if keys_written > 0 {
|
||||
let new_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
|
||||
layer.metadata().file_size,
|
||||
new_layer.metadata().file_size);
|
||||
|
||||
replace_image_layers.push((layer, new_layer));
|
||||
} else {
|
||||
// Drop the old layer. Usually for this case we would already have noticed that
|
||||
// the layer has no data for us with the ShardedRange check above, but
|
||||
drop_layers.push(layer);
|
||||
}
|
||||
}
|
||||
|
||||
// At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
|
||||
// metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
|
||||
// to remote index) and be removed. This is inefficient but safe.
|
||||
fail::fail_point!("compact-shard-ancestors-localonly");
|
||||
// TODO: collect layers to rewrite
|
||||
let replace_layers = Vec::new();
|
||||
|
||||
// Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
|
||||
self.rewrite_layers(replace_image_layers, drop_layers)
|
||||
.await?;
|
||||
|
||||
fail::fail_point!("compact-shard-ancestors-enqueued");
|
||||
self.rewrite_layers(replace_layers, drop_layers).await?;
|
||||
|
||||
// We wait for all uploads to complete before finishing this compaction stage. This is not
|
||||
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
|
||||
@@ -350,8 +289,6 @@ impl Timeline {
|
||||
// load.
|
||||
self.remote_client.wait_completion().await?;
|
||||
|
||||
fail::fail_point!("compact-shard-ancestors-persistent");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1213,10 +1150,10 @@ impl TimelineAdaptor {
|
||||
lsn: Lsn,
|
||||
key_range: &Range<Key>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CreateImageLayersError> {
|
||||
) -> Result<(), PageReconstructError> {
|
||||
let timer = self.timeline.metrics.create_images_time_histo.start_timer();
|
||||
|
||||
let image_layer_writer = ImageLayerWriter::new(
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.timeline.conf,
|
||||
self.timeline.timeline_id,
|
||||
self.timeline.tenant_shard_id,
|
||||
@@ -1227,34 +1164,47 @@ impl TimelineAdaptor {
|
||||
.await?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
Err(CreateImageLayersError::Other(anyhow::anyhow!(
|
||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"failpoint image-layer-writer-fail-before-finish"
|
||||
)))
|
||||
});
|
||||
|
||||
let keyspace = KeySpace {
|
||||
ranges: self.get_keyspace(key_range, lsn, ctx).await?,
|
||||
};
|
||||
// TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
|
||||
let start = Key::MIN;
|
||||
let ImageLayerCreationOutcome {
|
||||
image,
|
||||
next_start_key: _,
|
||||
} = self
|
||||
.timeline
|
||||
.create_image_layer_for_rel_blocks(
|
||||
&keyspace,
|
||||
image_layer_writer,
|
||||
lsn,
|
||||
ctx,
|
||||
key_range.clone(),
|
||||
start,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(image_layer) = image {
|
||||
self.new_images.push(image_layer);
|
||||
let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
|
||||
for range in &keyspace_ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
let img = match self.timeline.get(key, lsn, ctx).await {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
// If we fail to reconstruct a VM or FSM page, we can zero the
|
||||
// page without losing any actual user data. That seems better
|
||||
// than failing repeatedly and getting stuck.
|
||||
//
|
||||
// We had a bug at one point, where we truncated the FSM and VM
|
||||
// in the pageserver, but the Postgres didn't know about that
|
||||
// and continued to generate incremental WAL records for pages
|
||||
// that didn't exist in the pageserver. Trying to replay those
|
||||
// WAL records failed to find the previous image of the page.
|
||||
// This special case allows us to recover from that situation.
|
||||
// See https://github.com/neondatabase/neon/issues/2601.
|
||||
//
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
|
||||
warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
};
|
||||
image_layer_writer.put_image(key, img, ctx).await?;
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?;
|
||||
|
||||
self.new_images.push(image_layer);
|
||||
|
||||
timer.stop_and_record();
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ use anyhow::Context;
|
||||
use pageserver_api::{models::TimelineState, shard::TenantShardId};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tracing::{error, info, instrument, Instrument};
|
||||
use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
|
||||
use utils::{crashsafe, fs_ext, id::TimelineId};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
|
||||
@@ -7,20 +7,19 @@ use crate::{
|
||||
index::{IndexPart, LayerFileMetadata},
|
||||
},
|
||||
storage_layer::LayerName,
|
||||
Generation,
|
||||
},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use std::{
|
||||
collections::{hash_map, HashMap},
|
||||
str::FromStr,
|
||||
};
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use std::{collections::HashMap, str::FromStr};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Identified files in the timeline directory.
|
||||
pub(super) enum Discovered {
|
||||
/// The only one we care about
|
||||
Layer(LayerName, LocalLayerFileMetadata),
|
||||
Layer(LayerName, Utf8PathBuf, u64),
|
||||
/// Old ephmeral files from previous launches, should be removed
|
||||
Ephemeral(String),
|
||||
/// Old temporary timeline files, unsure what these really are, should be removed
|
||||
@@ -28,7 +27,7 @@ pub(super) enum Discovered {
|
||||
/// Temporary on-demand download files, should be removed
|
||||
TemporaryDownload(String),
|
||||
/// Backup file from previously future layers
|
||||
IgnoredBackup(Utf8PathBuf),
|
||||
IgnoredBackup,
|
||||
/// Unrecognized, warn about these
|
||||
Unknown(String),
|
||||
}
|
||||
@@ -44,15 +43,12 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
|
||||
let discovered = match LayerName::from_str(&file_name) {
|
||||
Ok(file_name) => {
|
||||
let file_size = direntry.metadata()?.len();
|
||||
Discovered::Layer(
|
||||
file_name,
|
||||
LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size),
|
||||
)
|
||||
Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
|
||||
}
|
||||
Err(_) => {
|
||||
if file_name.ends_with(".old") {
|
||||
// ignore these
|
||||
Discovered::IgnoredBackup(direntry.path().to_owned())
|
||||
Discovered::IgnoredBackup
|
||||
} else if remote_timeline_client::is_temp_download_file(direntry.path()) {
|
||||
Discovered::TemporaryDownload(file_name)
|
||||
} else if is_ephemeral_file(&file_name) {
|
||||
@@ -75,32 +71,37 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
|
||||
/// this structure extends it with metadata describing the layer's presence in local storage.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(super) struct LocalLayerFileMetadata {
|
||||
pub(super) file_size: u64,
|
||||
pub(super) metadata: LayerFileMetadata,
|
||||
pub(super) local_path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
impl LocalLayerFileMetadata {
|
||||
pub fn new(local_path: Utf8PathBuf, file_size: u64) -> Self {
|
||||
pub fn new(
|
||||
local_path: Utf8PathBuf,
|
||||
file_size: u64,
|
||||
generation: Generation,
|
||||
shard: ShardIndex,
|
||||
) -> Self {
|
||||
Self {
|
||||
local_path,
|
||||
file_size,
|
||||
metadata: LayerFileMetadata::new(file_size, generation, shard),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// For a layer that is present in remote metadata, this type describes how to handle
|
||||
/// it during startup: it is either Resident (and we have some metadata about a local file),
|
||||
/// or it is Evicted (and we only have remote metadata).
|
||||
/// Decision on what to do with a layer file after considering its local and remote metadata.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(super) enum Decision {
|
||||
/// The layer is not present locally.
|
||||
Evicted(LayerFileMetadata),
|
||||
/// The layer is present locally, and metadata matches: we may hook up this layer to the
|
||||
/// existing file in local storage.
|
||||
Resident {
|
||||
/// The layer is present locally, but local metadata does not match remote; we must
|
||||
/// delete it and treat it as evicted.
|
||||
UseRemote {
|
||||
local: LocalLayerFileMetadata,
|
||||
remote: LayerFileMetadata,
|
||||
},
|
||||
/// The layer is present locally, and metadata matches.
|
||||
UseLocal(LocalLayerFileMetadata),
|
||||
}
|
||||
|
||||
/// A layer needs to be left out of the layer map.
|
||||
@@ -116,81 +117,77 @@ pub(super) enum DismissedLayer {
|
||||
/// In order to make crash safe updates to layer map, we must dismiss layers which are only
|
||||
/// found locally or not yet included in the remote `index_part.json`.
|
||||
LocalOnly(LocalLayerFileMetadata),
|
||||
|
||||
/// The layer exists in remote storage but the local layer's metadata (e.g. file size)
|
||||
/// does not match it
|
||||
BadMetadata(LocalLayerFileMetadata),
|
||||
}
|
||||
|
||||
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
|
||||
pub(super) fn reconcile(
|
||||
local_layers: Vec<(LayerName, LocalLayerFileMetadata)>,
|
||||
discovered: Vec<(LayerName, Utf8PathBuf, u64)>,
|
||||
index_part: Option<&IndexPart>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
generation: Generation,
|
||||
shard: ShardIndex,
|
||||
) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
|
||||
let Some(index_part) = index_part else {
|
||||
// If we have no remote metadata, no local layer files are considered valid to load
|
||||
return local_layers
|
||||
.into_iter()
|
||||
.map(|(layer_name, local_metadata)| {
|
||||
(layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))
|
||||
})
|
||||
.collect();
|
||||
};
|
||||
use Decision::*;
|
||||
|
||||
let mut result = Vec::new();
|
||||
// name => (local_metadata, remote_metadata)
|
||||
type Collected =
|
||||
HashMap<LayerName, (Option<LocalLayerFileMetadata>, Option<LayerFileMetadata>)>;
|
||||
|
||||
let mut remote_layers = HashMap::new();
|
||||
let mut discovered = discovered
|
||||
.into_iter()
|
||||
.map(|(layer_name, local_path, file_size)| {
|
||||
(
|
||||
layer_name,
|
||||
// The generation and shard here will be corrected to match IndexPart in the merge below, unless
|
||||
// it is not in IndexPart, in which case using our current generation makes sense
|
||||
// because it will be uploaded in this generation.
|
||||
(
|
||||
Some(LocalLayerFileMetadata::new(
|
||||
local_path, file_size, generation, shard,
|
||||
)),
|
||||
None,
|
||||
),
|
||||
)
|
||||
})
|
||||
.collect::<Collected>();
|
||||
|
||||
// Construct Decisions for layers that are found locally, if they're in remote metadata. Otherwise
|
||||
// construct DismissedLayers to get rid of them.
|
||||
for (layer_name, local_metadata) in local_layers {
|
||||
let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else {
|
||||
result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata))));
|
||||
continue;
|
||||
};
|
||||
|
||||
if remote_metadata.file_size != local_metadata.file_size {
|
||||
result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata))));
|
||||
continue;
|
||||
}
|
||||
|
||||
remote_layers.insert(
|
||||
layer_name,
|
||||
Decision::Resident {
|
||||
local: local_metadata,
|
||||
remote: remote_metadata.clone(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Construct Decision for layers that were not found locally
|
||||
// merge any index_part information, when available
|
||||
index_part
|
||||
.layer_metadata
|
||||
.iter()
|
||||
.as_ref()
|
||||
.map(|ip| ip.layer_metadata.iter())
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
|
||||
.for_each(|(name, metadata)| {
|
||||
if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) {
|
||||
entry.insert(Decision::Evicted(metadata.clone()));
|
||||
if let Some(existing) = discovered.get_mut(name) {
|
||||
existing.1 = Some(metadata);
|
||||
} else {
|
||||
discovered.insert(name.to_owned(), (None, Some(metadata)));
|
||||
}
|
||||
});
|
||||
|
||||
// For layers that were found in authoritative remote metadata, apply a final check that they are within
|
||||
// the disk_consistent_lsn.
|
||||
result.extend(remote_layers.into_iter().map(|(name, decision)| {
|
||||
if name.is_in_future(disk_consistent_lsn) {
|
||||
match decision {
|
||||
Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })),
|
||||
Decision::Resident {
|
||||
local,
|
||||
remote: _remote,
|
||||
} => (name, Err(DismissedLayer::Future { local: Some(local) })),
|
||||
}
|
||||
} else {
|
||||
(name, Ok(decision))
|
||||
}
|
||||
}));
|
||||
discovered
|
||||
.into_iter()
|
||||
.map(|(name, (local, remote))| {
|
||||
let decision = if name.is_in_future(disk_consistent_lsn) {
|
||||
Err(DismissedLayer::Future { local })
|
||||
} else {
|
||||
match (local, remote) {
|
||||
(Some(local), Some(remote)) if local.metadata != remote => {
|
||||
Ok(UseRemote { local, remote })
|
||||
}
|
||||
(Some(x), Some(_)) => Ok(UseLocal(x)),
|
||||
(None, Some(x)) => Ok(Evicted(x)),
|
||||
(Some(x), None) => Err(DismissedLayer::LocalOnly(x)),
|
||||
(None, None) => {
|
||||
unreachable!("there must not be any non-local non-remote files")
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
result
|
||||
(name, decision)
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
|
||||
@@ -199,15 +196,25 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
|
||||
std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}"))
|
||||
}
|
||||
|
||||
pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> {
|
||||
let local_size = local.file_size;
|
||||
pub(super) fn cleanup_local_file_for_remote(
|
||||
local: &LocalLayerFileMetadata,
|
||||
remote: &LayerFileMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
let local_size = local.metadata.file_size();
|
||||
let remote_size = remote.file_size();
|
||||
let path = &local.local_path;
|
||||
let file_name = path.file_name().expect("must be file path");
|
||||
tracing::warn!(
|
||||
"removing local file {file_name:?} because it has unexpected length {local_size};"
|
||||
);
|
||||
|
||||
std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}"))
|
||||
let file_name = path.file_name().expect("must be file path");
|
||||
tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
|
||||
if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
|
||||
assert!(
|
||||
path.exists(),
|
||||
"we would leave the local_layer without a file if this does not hold: {path}",
|
||||
);
|
||||
Err(err)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn cleanup_future_layer(
|
||||
@@ -229,8 +236,8 @@ pub(super) fn cleanup_local_only_file(
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = name.kind();
|
||||
tracing::info!(
|
||||
"found local-only {kind} layer {name} size {}",
|
||||
local.file_size
|
||||
"found local-only {kind} layer {name}, metadata {:?}",
|
||||
local.metadata
|
||||
);
|
||||
std::fs::remove_file(&local.local_path)?;
|
||||
Ok(())
|
||||
|
||||
@@ -212,34 +212,13 @@ impl LayerManager {
|
||||
&mut self,
|
||||
rewrite_layers: &[(Layer, ResidentLayer)],
|
||||
drop_layers: &[Layer],
|
||||
metrics: &TimelineMetrics,
|
||||
_metrics: &TimelineMetrics,
|
||||
) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for (old_layer, new_layer) in rewrite_layers {
|
||||
debug_assert_eq!(
|
||||
old_layer.layer_desc().key_range,
|
||||
new_layer.layer_desc().key_range
|
||||
);
|
||||
debug_assert_eq!(
|
||||
old_layer.layer_desc().lsn_range,
|
||||
new_layer.layer_desc().lsn_range
|
||||
);
|
||||
|
||||
// Safety: we may never rewrite the same file in-place. Callers are responsible
|
||||
// for ensuring that they only rewrite layers after something changes the path,
|
||||
// such as an increment in the generation number.
|
||||
assert_ne!(old_layer.local_path(), new_layer.local_path());
|
||||
// TODO: implement rewrites (currently this code path only used for drops)
|
||||
assert!(rewrite_layers.is_empty());
|
||||
|
||||
Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr);
|
||||
|
||||
Self::insert_historic_layer(
|
||||
new_layer.as_ref().clone(),
|
||||
&mut updates,
|
||||
&mut self.layer_fmgr,
|
||||
);
|
||||
|
||||
metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
|
||||
}
|
||||
for l in drop_layers {
|
||||
Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
|
||||
@@ -213,7 +213,10 @@ impl UploadQueue {
|
||||
|
||||
let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
|
||||
for (layer_name, layer_metadata) in &index_part.layer_metadata {
|
||||
files.insert(layer_name.to_owned(), layer_metadata.clone());
|
||||
files.insert(
|
||||
layer_name.to_owned(),
|
||||
LayerFileMetadata::from(layer_metadata),
|
||||
);
|
||||
}
|
||||
|
||||
info!(
|
||||
@@ -319,7 +322,9 @@ impl std::fmt::Display for UploadOp {
|
||||
write!(
|
||||
f,
|
||||
"UploadLayer({}, size={:?}, gen={:?})",
|
||||
layer, metadata.file_size, metadata.generation
|
||||
layer,
|
||||
metadata.file_size(),
|
||||
metadata.generation
|
||||
)
|
||||
}
|
||||
UploadOp::UploadMetadata(_, lsn) => {
|
||||
|
||||
@@ -51,6 +51,7 @@ int flush_every_n_requests = 8;
|
||||
|
||||
int neon_protocol_version = 2;
|
||||
|
||||
static int n_reconnect_attempts = 0;
|
||||
static int max_reconnect_attempts = 60;
|
||||
static int stripe_size;
|
||||
|
||||
@@ -94,37 +95,18 @@ static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
static PagestoreShmemState *pagestore_shared;
|
||||
static uint64 pagestore_local_counter = 0;
|
||||
|
||||
typedef enum PSConnectionState {
|
||||
PS_Disconnected, /* no connection yet */
|
||||
PS_Connecting_Startup, /* connection starting up */
|
||||
PS_Connecting_PageStream, /* negotiating pagestream */
|
||||
PS_Connected, /* connected, pagestream established */
|
||||
} PSConnectionState;
|
||||
|
||||
/* This backend's per-shard connections */
|
||||
typedef struct
|
||||
{
|
||||
TimestampTz last_connect_time; /* read-only debug value */
|
||||
TimestampTz last_reconnect_time;
|
||||
uint32 delay_us;
|
||||
int n_reconnect_attempts;
|
||||
PGconn *conn;
|
||||
|
||||
/*---
|
||||
* Pageserver connection state, i.e.
|
||||
* disconnected: conn == NULL, wes == NULL;
|
||||
* conn_startup: connection initiated, waiting for connection establishing
|
||||
* conn_ps: PageStream query sent, waiting for confirmation
|
||||
* connected: PageStream established
|
||||
*/
|
||||
PSConnectionState state;
|
||||
PGconn *conn;
|
||||
/*---
|
||||
* WaitEventSet containing:
|
||||
* - WL_SOCKET_READABLE on 'conn'
|
||||
* - WL_LATCH_SET on MyLatch, and
|
||||
* - WL_EXIT_ON_PM_DEATH.
|
||||
* - WL_SOCKET_READABLE on 'conn'
|
||||
* - WL_LATCH_SET on MyLatch, and
|
||||
* - WL_EXIT_ON_PM_DEATH.
|
||||
*/
|
||||
WaitEventSet *wes_read;
|
||||
WaitEventSet *wes;
|
||||
} PageServer;
|
||||
|
||||
static PageServer page_servers[MAX_SHARDS];
|
||||
@@ -321,277 +303,119 @@ get_shard_number(BufferTag *tag)
|
||||
return hash % n_shards;
|
||||
}
|
||||
|
||||
static inline void
|
||||
CLEANUP_AND_DISCONNECT(PageServer *shard)
|
||||
{
|
||||
if (shard->wes_read)
|
||||
{
|
||||
FreeWaitEventSet(shard->wes_read);
|
||||
shard->wes_read = NULL;
|
||||
}
|
||||
if (shard->conn)
|
||||
{
|
||||
PQfinish(shard->conn);
|
||||
shard->conn = NULL;
|
||||
}
|
||||
|
||||
shard->state = PS_Disconnected;
|
||||
}
|
||||
|
||||
/*
|
||||
* Connect to a pageserver, or continue to try to connect if we're yet to
|
||||
* complete the connection (e.g. due to receiving an earlier cancellation
|
||||
* during connection start).
|
||||
* Returns true if successfully connected; false if the connection failed.
|
||||
*
|
||||
* Throws errors in unrecoverable situations, or when this backend's query
|
||||
* is canceled.
|
||||
*/
|
||||
static bool
|
||||
pageserver_connect(shardno_t shard_no, int elevel)
|
||||
{
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
char *query;
|
||||
int ret;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
PGconn *conn;
|
||||
WaitEventSet *wes;
|
||||
char connstr[MAX_PAGESERVER_CONNSTRING_SIZE];
|
||||
|
||||
static TimestampTz last_connect_time = 0;
|
||||
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
TimestampTz now;
|
||||
uint64_t us_since_last_connect;
|
||||
bool broke_from_loop = false;
|
||||
|
||||
Assert(page_servers[shard_no].conn == NULL);
|
||||
|
||||
/*
|
||||
* Get the connection string for this shard. If the shard map has been
|
||||
* updated since we last looked, this will also disconnect any existing
|
||||
* pageserver connections as a side effect.
|
||||
* Note that connstr is used both during connection start, and when we
|
||||
* log the successful connection.
|
||||
*/
|
||||
load_shard_map(shard_no, connstr, NULL);
|
||||
|
||||
switch (shard->state)
|
||||
now = GetCurrentTimestamp();
|
||||
us_since_last_connect = now - last_connect_time;
|
||||
if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC)
|
||||
{
|
||||
case PS_Disconnected:
|
||||
{
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n_pgsql_params;
|
||||
TimestampTz now;
|
||||
int64 us_since_last_attempt;
|
||||
|
||||
/* Make sure we start with a clean slate */
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected");
|
||||
|
||||
now = GetCurrentTimestamp();
|
||||
us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
|
||||
shard->last_reconnect_time = now;
|
||||
|
||||
/*
|
||||
* If we did other tasks between reconnect attempts, then we won't
|
||||
* need to wait as long as a full delay.
|
||||
*/
|
||||
if (us_since_last_attempt < shard->delay_us)
|
||||
{
|
||||
pg_usleep(shard->delay_us - us_since_last_attempt);
|
||||
}
|
||||
|
||||
/* update the delay metric */
|
||||
shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC);
|
||||
|
||||
/*
|
||||
* Connect using the connection string we got from the
|
||||
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
|
||||
* variable was set, use that as the password.
|
||||
*
|
||||
* The connection options are parsed in the order they're given, so when
|
||||
* we set the password before the connection string, the connection string
|
||||
* can override the password from the env variable. Seems useful, although
|
||||
* we don't currently use that capability anywhere.
|
||||
*/
|
||||
keywords[0] = "dbname";
|
||||
values[0] = connstr;
|
||||
n_pgsql_params = 1;
|
||||
|
||||
if (neon_auth_token)
|
||||
{
|
||||
keywords[1] = "password";
|
||||
values[1] = neon_auth_token;
|
||||
n_pgsql_params++;
|
||||
}
|
||||
|
||||
keywords[n_pgsql_params] = NULL;
|
||||
values[n_pgsql_params] = NULL;
|
||||
|
||||
shard->conn = PQconnectStartParams(keywords, values, 1);
|
||||
if (!shard->conn)
|
||||
{
|
||||
neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory");
|
||||
return false;
|
||||
}
|
||||
|
||||
shard->state = PS_Connecting_Startup;
|
||||
/* fallthrough */
|
||||
pg_usleep(delay_us);
|
||||
delay_us *= 2;
|
||||
}
|
||||
case PS_Connecting_Startup:
|
||||
else
|
||||
{
|
||||
char *pagestream_query;
|
||||
int ps_send_query_ret;
|
||||
bool connected = false;
|
||||
int poll_result = PGRES_POLLING_WRITING;
|
||||
neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup");
|
||||
delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
WaitEvent event;
|
||||
/*
|
||||
* Connect using the connection string we got from the
|
||||
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
|
||||
* variable was set, use that as the password.
|
||||
*
|
||||
* The connection options are parsed in the order they're given, so when
|
||||
* we set the password before the connection string, the connection string
|
||||
* can override the password from the env variable. Seems useful, although
|
||||
* we don't currently use that capability anywhere.
|
||||
*/
|
||||
n = 0;
|
||||
if (neon_auth_token)
|
||||
{
|
||||
keywords[n] = "password";
|
||||
values[n] = neon_auth_token;
|
||||
n++;
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
values[n] = connstr;
|
||||
n++;
|
||||
keywords[n] = NULL;
|
||||
values[n] = NULL;
|
||||
n++;
|
||||
conn = PQconnectdbParams(keywords, values, 1);
|
||||
last_connect_time = GetCurrentTimestamp();
|
||||
|
||||
switch (poll_result)
|
||||
{
|
||||
default: /* unknown/unused states are handled as a failed connection */
|
||||
case PGRES_POLLING_FAILED:
|
||||
{
|
||||
char *pqerr = PQerrorMessage(shard->conn);
|
||||
char *msg = NULL;
|
||||
neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED");
|
||||
if (PQstatus(conn) == CONNECTION_BAD)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(conn));
|
||||
|
||||
if (pqerr)
|
||||
msg = pchomp(pqerr);
|
||||
PQfinish(conn);
|
||||
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
if (msg)
|
||||
{
|
||||
neon_shard_log(shard_no, elevel,
|
||||
"could not connect to pageserver: %s",
|
||||
msg);
|
||||
pfree(msg);
|
||||
}
|
||||
else
|
||||
neon_shard_log(shard_no, elevel,
|
||||
"could not connect to pageserver");
|
||||
|
||||
return false;
|
||||
}
|
||||
case PGRES_POLLING_READING:
|
||||
/* Sleep until there's something to do */
|
||||
while (true)
|
||||
{
|
||||
int rc = WaitLatchOrSocket(MyLatch,
|
||||
WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE,
|
||||
PQsocket(shard->conn),
|
||||
0,
|
||||
PG_WAIT_EXTENSION);
|
||||
elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc);
|
||||
if (rc & WL_LATCH_SET)
|
||||
{
|
||||
ResetLatch(MyLatch);
|
||||
/* query cancellation, backend shutdown */
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
}
|
||||
if (rc & WL_SOCKET_READABLE)
|
||||
break;
|
||||
}
|
||||
/* PQconnectPoll() handles the socket polling state updates */
|
||||
|
||||
break;
|
||||
case PGRES_POLLING_WRITING:
|
||||
/* Sleep until there's something to do */
|
||||
while (true)
|
||||
{
|
||||
int rc = WaitLatchOrSocket(MyLatch,
|
||||
WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE,
|
||||
PQsocket(shard->conn),
|
||||
0,
|
||||
PG_WAIT_EXTENSION);
|
||||
elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc);
|
||||
if (rc & WL_LATCH_SET)
|
||||
{
|
||||
ResetLatch(MyLatch);
|
||||
/* query cancellation, backend shutdown */
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
}
|
||||
if (rc & WL_SOCKET_WRITEABLE)
|
||||
break;
|
||||
}
|
||||
/* PQconnectPoll() handles the socket polling state updates */
|
||||
|
||||
break;
|
||||
case PGRES_POLLING_OK:
|
||||
neon_shard_log(shard_no, DEBUG5, "POLLING_OK");
|
||||
connected = true;
|
||||
break;
|
||||
}
|
||||
poll_result = PQconnectPoll(shard->conn);
|
||||
elog(DEBUG5, "PQconnectPoll=>%d", poll_result);
|
||||
}
|
||||
while (!connected);
|
||||
|
||||
/* No more polling needed; connection succeeded */
|
||||
shard->last_connect_time = GetCurrentTimestamp();
|
||||
|
||||
shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL);
|
||||
|
||||
|
||||
switch (neon_protocol_version)
|
||||
{
|
||||
ereport(elevel,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
|
||||
errdetail_internal("%s", msg)));
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
switch (neon_protocol_version)
|
||||
{
|
||||
case 2:
|
||||
pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
|
||||
query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
|
||||
break;
|
||||
case 1:
|
||||
pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
|
||||
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
|
||||
}
|
||||
|
||||
if (PQstatus(shard->conn) == CONNECTION_BAD)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(shard->conn));
|
||||
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
ereport(elevel,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
|
||||
errdetail_internal("%s", msg)));
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
|
||||
ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query);
|
||||
pfree(pagestream_query);
|
||||
if (ps_send_query_ret != 1)
|
||||
{
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
|
||||
return false;
|
||||
}
|
||||
|
||||
shard->state = PS_Connecting_PageStream;
|
||||
/* fallthrough */
|
||||
}
|
||||
case PS_Connecting_PageStream:
|
||||
ret = PQsendQuery(conn, query);
|
||||
pfree(query);
|
||||
if (ret != 1)
|
||||
{
|
||||
neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream");
|
||||
PQfinish(conn);
|
||||
neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (PQstatus(shard->conn) == CONNECTION_BAD)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(shard->conn));
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
ereport(elevel,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
|
||||
errdetail_internal("%s", msg)));
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
wes = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);
|
||||
|
||||
while (PQisBusy(shard->conn))
|
||||
PG_TRY();
|
||||
{
|
||||
while (PQisBusy(conn))
|
||||
{
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
@@ -599,37 +423,40 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
/* Data available in socket? */
|
||||
if (event.events & WL_SOCKET_READABLE)
|
||||
{
|
||||
if (!PQconsumeInput(shard->conn))
|
||||
if (!PQconsumeInput(conn))
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(shard->conn));
|
||||
char *msg = pchomp(PQerrorMessage(conn));
|
||||
|
||||
PQfinish(conn);
|
||||
FreeWaitEventSet(wes);
|
||||
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
pfree(msg);
|
||||
return false;
|
||||
/* Returning from inside PG_TRY is bad, so we break/return later */
|
||||
broke_from_loop = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
shard->state = PS_Connected;
|
||||
/* fallthrough */
|
||||
}
|
||||
case PS_Connected:
|
||||
/*
|
||||
* We successfully connected. Future connections to this PageServer
|
||||
* will do fast retries again, with exponential backoff.
|
||||
*/
|
||||
shard->delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
|
||||
neon_shard_log(shard_no, DEBUG5, "Connection state: Connected");
|
||||
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
|
||||
return true;
|
||||
default:
|
||||
neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state);
|
||||
PG_CATCH();
|
||||
{
|
||||
PQfinish(conn);
|
||||
FreeWaitEventSet(wes);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
/* This shouldn't be hit */
|
||||
Assert(false);
|
||||
PG_END_TRY();
|
||||
|
||||
if (broke_from_loop)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
|
||||
page_servers[shard_no].conn = conn;
|
||||
page_servers[shard_no].wes = wes;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -649,7 +476,7 @@ retry:
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
@@ -675,8 +502,7 @@ retry:
|
||||
|
||||
/*
|
||||
* Reset prefetch and drop connection to the shard.
|
||||
* It also drops connection to all other shards involved in prefetch, through
|
||||
* prefetch_on_ps_disconnect().
|
||||
* It also drops connection to all other shards involved in prefetch.
|
||||
*/
|
||||
static void
|
||||
pageserver_disconnect(shardno_t shard_no)
|
||||
@@ -686,6 +512,9 @@ pageserver_disconnect(shardno_t shard_no)
|
||||
* whole prefetch queue, even for other pageservers. It should not
|
||||
* cause big problems, because connection loss is supposed to be a
|
||||
* rare event.
|
||||
*
|
||||
* Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
|
||||
* because prefetch request may be registered before connection is established.
|
||||
*/
|
||||
prefetch_on_ps_disconnect();
|
||||
|
||||
@@ -698,36 +527,37 @@ pageserver_disconnect(shardno_t shard_no)
|
||||
static void
|
||||
pageserver_disconnect_shard(shardno_t shard_no)
|
||||
{
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
/*
|
||||
* If anything goes wrong while we were sending a request, it's not clear
|
||||
* what state the connection is in. For example, if we sent the request
|
||||
* but didn't receive a response yet, we might receive the response some
|
||||
* time later after we have already sent a new unrelated request. Close
|
||||
* the connection to avoid getting confused.
|
||||
* Similarly, even when we're in PS_DISCONNECTED, we may have junk to
|
||||
* clean up: It is possible that we encountered an error allocating any
|
||||
* of the wait event sets or the psql connection, or failed when we tried
|
||||
* to attach wait events to the WaitEventSets.
|
||||
*/
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
shard->state = PS_Disconnected;
|
||||
if (page_servers[shard_no].conn)
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
|
||||
PQfinish(page_servers[shard_no].conn);
|
||||
page_servers[shard_no].conn = NULL;
|
||||
}
|
||||
if (page_servers[shard_no].wes != NULL)
|
||||
{
|
||||
FreeWaitEventSet(page_servers[shard_no].wes);
|
||||
page_servers[shard_no].wes = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
{
|
||||
StringInfoData req_buff;
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn;
|
||||
PGconn *pageserver_conn = page_servers[shard_no].conn;
|
||||
|
||||
/* If the connection was lost for some reason, reconnect */
|
||||
if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
|
||||
if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
|
||||
pageserver_disconnect(shard_no);
|
||||
pageserver_conn = NULL;
|
||||
}
|
||||
|
||||
req_buff = nm_pack_request(request);
|
||||
@@ -741,19 +571,17 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
* https://github.com/neondatabase/neon/issues/1138 So try to reestablish
|
||||
* connection in case of failure.
|
||||
*/
|
||||
if (shard->state != PS_Connected)
|
||||
if (!page_servers[shard_no].conn)
|
||||
{
|
||||
while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
|
||||
while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
|
||||
{
|
||||
HandleMainLoopInterrupts();
|
||||
shard->n_reconnect_attempts += 1;
|
||||
n_reconnect_attempts += 1;
|
||||
}
|
||||
shard->n_reconnect_attempts = 0;
|
||||
} else {
|
||||
Assert(shard->conn != NULL);
|
||||
n_reconnect_attempts = 0;
|
||||
}
|
||||
|
||||
pageserver_conn = shard->conn;
|
||||
pageserver_conn = page_servers[shard_no].conn;
|
||||
|
||||
/*
|
||||
* Send request.
|
||||
@@ -762,17 +590,13 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
* should use async mode and check for interrupts while waiting. In
|
||||
* practice, our requests are small enough to always fit in the output and
|
||||
* TCP buffer.
|
||||
*
|
||||
* Note that this also will fail when the connection is in the
|
||||
* PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
|
||||
* point, but on the grand scheme of things it's only a small issue.
|
||||
*/
|
||||
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg);
|
||||
neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
|
||||
pfree(msg);
|
||||
pfree(req_buff.data);
|
||||
return false;
|
||||
@@ -787,7 +611,6 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -796,68 +619,58 @@ pageserver_receive(shardno_t shard_no)
|
||||
{
|
||||
StringInfoData resp_buff;
|
||||
NeonResponse *resp;
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn = shard->conn;
|
||||
/* read response */
|
||||
int rc;
|
||||
PGconn *pageserver_conn = page_servers[shard_no].conn;
|
||||
|
||||
if (shard->state != PS_Connected)
|
||||
{
|
||||
neon_shard_log(shard_no, LOG,
|
||||
"pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x",
|
||||
shard->state);
|
||||
if (!pageserver_conn)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Assert(pageserver_conn);
|
||||
|
||||
rc = call_PQgetCopyData(shard_no, &resp_buff.data);
|
||||
if (rc >= 0)
|
||||
PG_TRY();
|
||||
{
|
||||
/* call_PQgetCopyData handles rc == 0 */
|
||||
Assert(rc > 0);
|
||||
/* read response */
|
||||
int rc;
|
||||
|
||||
PG_TRY();
|
||||
rc = call_PQgetCopyData(shard_no, &resp_buff.data);
|
||||
if (rc >= 0)
|
||||
{
|
||||
resp_buff.len = rc;
|
||||
resp_buff.cursor = 0;
|
||||
resp = nm_unpack_response(&resp_buff);
|
||||
PQfreemem(resp_buff.data);
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
|
||||
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
}
|
||||
PG_CATCH();
|
||||
else if (rc == -1)
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response");
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
|
||||
pageserver_disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
resp = NULL;
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
else if (rc == -2)
|
||||
{
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
|
||||
}
|
||||
else
|
||||
{
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
|
||||
}
|
||||
}
|
||||
else if (rc == -1)
|
||||
PG_CATCH();
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
|
||||
pageserver_disconnect(shard_no);
|
||||
resp = NULL;
|
||||
}
|
||||
else if (rc == -2)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
|
||||
}
|
||||
else
|
||||
{
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
return (NeonResponse *) resp;
|
||||
}
|
||||
@@ -868,7 +681,7 @@ pageserver_flush(shardno_t shard_no)
|
||||
{
|
||||
PGconn *pageserver_conn = page_servers[shard_no].conn;
|
||||
|
||||
if (page_servers[shard_no].state != PS_Connected)
|
||||
if (!pageserver_conn)
|
||||
{
|
||||
neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
|
||||
}
|
||||
@@ -884,7 +697,6 @@ pageserver_flush(shardno_t shard_no)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1079,7 +891,5 @@ pg_init_libpagestore(void)
|
||||
dbsize_hook = neon_dbsize;
|
||||
}
|
||||
|
||||
memset(page_servers, 0, sizeof(page_servers));
|
||||
|
||||
lfc_init();
|
||||
}
|
||||
|
||||
@@ -94,10 +94,6 @@ static char *hexdump_page(char *page);
|
||||
|
||||
const int SmgrTrace = DEBUG5;
|
||||
|
||||
#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \
|
||||
neon_shard_log(shard_no, elvl, "Broken connection state: " message, \
|
||||
##__VA_ARGS__)
|
||||
|
||||
page_server_api *page_server;
|
||||
|
||||
/* unlogged relation build states */
|
||||
@@ -530,8 +526,6 @@ prefetch_flush_requests(void)
|
||||
*
|
||||
* NOTE: this function may indirectly update MyPState->pfs_hash; which
|
||||
* invalidates any active pointers into the hash table.
|
||||
* NOTE: callers should make sure they can handle query cancellations in this
|
||||
* function's call path.
|
||||
*/
|
||||
static bool
|
||||
prefetch_wait_for(uint64 ring_index)
|
||||
@@ -567,8 +561,6 @@ prefetch_wait_for(uint64 ring_index)
|
||||
*
|
||||
* NOTE: this function may indirectly update MyPState->pfs_hash; which
|
||||
* invalidates any active pointers into the hash table.
|
||||
*
|
||||
* NOTE: this does IO, and can get canceled out-of-line.
|
||||
*/
|
||||
static bool
|
||||
prefetch_read(PrefetchRequest *slot)
|
||||
@@ -580,14 +572,6 @@ prefetch_read(PrefetchRequest *slot)
|
||||
Assert(slot->response == NULL);
|
||||
Assert(slot->my_ring_index == MyPState->ring_receive);
|
||||
|
||||
if (slot->status != PRFS_REQUESTED ||
|
||||
slot->response != NULL ||
|
||||
slot->my_ring_index != MyPState->ring_receive)
|
||||
neon_shard_log(slot->shard_no, ERROR,
|
||||
"Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
|
||||
slot->status, slot->response,
|
||||
(long)slot->my_ring_index, (long)MyPState->ring_receive);
|
||||
|
||||
old = MemoryContextSwitchTo(MyPState->errctx);
|
||||
response = (NeonResponse *) page_server->receive(slot->shard_no);
|
||||
MemoryContextSwitchTo(old);
|
||||
@@ -605,11 +589,6 @@ prefetch_read(PrefetchRequest *slot)
|
||||
}
|
||||
else
|
||||
{
|
||||
neon_shard_log(slot->shard_no, WARNING,
|
||||
"No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
|
||||
(long)slot->my_ring_index,
|
||||
RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
|
||||
slot->buftag.forkNum, slot->buftag.blockNum);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -624,7 +603,6 @@ void
|
||||
prefetch_on_ps_disconnect(void)
|
||||
{
|
||||
MyPState->ring_flush = MyPState->ring_unused;
|
||||
|
||||
while (MyPState->ring_receive < MyPState->ring_unused)
|
||||
{
|
||||
PrefetchRequest *slot;
|
||||
@@ -647,7 +625,6 @@ prefetch_on_ps_disconnect(void)
|
||||
slot->status = PRFS_TAG_REMAINS;
|
||||
MyPState->n_requests_inflight -= 1;
|
||||
MyPState->ring_receive += 1;
|
||||
|
||||
prefetch_set_unused(ring_index);
|
||||
}
|
||||
}
|
||||
@@ -714,8 +691,6 @@ static void
|
||||
prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
|
||||
{
|
||||
bool found;
|
||||
uint64 mySlotNo = slot->my_ring_index;
|
||||
|
||||
NeonGetPageRequest request = {
|
||||
.req.tag = T_NeonGetPageRequest,
|
||||
/* lsn and not_modified_since are filled in below */
|
||||
@@ -724,8 +699,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
|
||||
.blkno = slot->buftag.blockNum,
|
||||
};
|
||||
|
||||
Assert(mySlotNo == MyPState->ring_unused);
|
||||
|
||||
if (force_request_lsns)
|
||||
slot->request_lsns = *force_request_lsns;
|
||||
else
|
||||
@@ -738,11 +711,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
|
||||
Assert(slot->response == NULL);
|
||||
Assert(slot->my_ring_index == MyPState->ring_unused);
|
||||
|
||||
while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
|
||||
{
|
||||
Assert(mySlotNo == MyPState->ring_unused);
|
||||
/* loop */
|
||||
}
|
||||
while (!page_server->send(slot->shard_no, (NeonRequest *) &request));
|
||||
|
||||
/* update prefetch state */
|
||||
MyPState->n_requests_inflight += 1;
|
||||
@@ -753,6 +722,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
|
||||
|
||||
/* update slot state */
|
||||
slot->status = PRFS_REQUESTED;
|
||||
|
||||
prfh_insert(MyPState->prf_hash, slot, &found);
|
||||
Assert(!found);
|
||||
}
|
||||
@@ -924,10 +894,6 @@ Retry:
|
||||
return ring_index;
|
||||
}
|
||||
|
||||
/*
|
||||
* Note: this function can get canceled and use a long jump to the next catch
|
||||
* context. Take care.
|
||||
*/
|
||||
static NeonResponse *
|
||||
page_server_request(void const *req)
|
||||
{
|
||||
@@ -959,38 +925,19 @@ page_server_request(void const *req)
|
||||
* Current sharding model assumes that all metadata is present only at shard 0.
|
||||
* We still need to call get_shard_no() to check if shard map is up-to-date.
|
||||
*/
|
||||
if (((NeonRequest *) req)->tag != T_NeonGetPageRequest ||
|
||||
((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
|
||||
if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
|
||||
{
|
||||
shard_no = 0;
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
PG_TRY();
|
||||
{
|
||||
while (!page_server->send(shard_no, (NeonRequest *) req)
|
||||
|| !page_server->flush(shard_no))
|
||||
{
|
||||
/* do nothing */
|
||||
}
|
||||
consume_prefetch_responses();
|
||||
resp = page_server->receive(shard_no);
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
/*
|
||||
* Cancellation in this code needs to be handled better at some
|
||||
* point, but this currently seems fine for now.
|
||||
*/
|
||||
page_server->disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
|
||||
consume_prefetch_responses();
|
||||
resp = page_server->receive(shard_no);
|
||||
} while (resp == NULL);
|
||||
|
||||
return resp;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -1958,9 +1905,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
|
||||
T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
|
||||
}
|
||||
pfree(resp);
|
||||
return exists;
|
||||
@@ -2412,7 +2357,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
/*
|
||||
* Try to find prefetched page in the list of received pages.
|
||||
*/
|
||||
Retry:
|
||||
Retry:
|
||||
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
|
||||
|
||||
if (entry != NULL)
|
||||
@@ -2498,9 +2443,7 @@ Retry:
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
break;
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
|
||||
"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
|
||||
T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
|
||||
}
|
||||
|
||||
/* buffer was used, clean up for later reuse */
|
||||
@@ -2771,9 +2714,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
|
||||
T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
|
||||
}
|
||||
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
|
||||
|
||||
@@ -2826,9 +2767,7 @@ neon_dbsize(Oid dbNode)
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
|
||||
T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
|
||||
}
|
||||
|
||||
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||
@@ -3167,9 +3106,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
|
||||
T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
|
||||
}
|
||||
pfree(resp);
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@ use futures::future::Either;
|
||||
use itertools::Itertools;
|
||||
use proxy::config::TlsServerEndPoint;
|
||||
use proxy::context::RequestMonitoring;
|
||||
use proxy::metrics::{Metrics, ThreadPoolMetrics};
|
||||
use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled};
|
||||
use rustls::pki_types::PrivateKeyDer;
|
||||
use tokio::net::TcpListener;
|
||||
@@ -66,8 +65,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
|
||||
|
||||
let args = cli().get_matches();
|
||||
let destination: String = args.get_one::<String>("dest").unwrap().parse()?;
|
||||
|
||||
|
||||
@@ -356,7 +356,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let cancel_map = CancelMap::default();
|
||||
|
||||
let redis_publisher = match ®ional_redis_client {
|
||||
let redis_publisher = match &redis_notifications_client {
|
||||
Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
|
||||
redis_publisher.clone(),
|
||||
args.region.clone(),
|
||||
|
||||
@@ -51,10 +51,9 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for WebSocketRw<S> {
|
||||
) -> Poll<io::Result<usize>> {
|
||||
let this = self.project();
|
||||
let mut stream = this.stream;
|
||||
this.send.put(buf);
|
||||
|
||||
ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
|
||||
|
||||
this.send.put(buf);
|
||||
match stream.as_mut().start_send(Frame::binary(this.send.split())) {
|
||||
Ok(()) => Poll::Ready(Ok(buf.len())),
|
||||
Err(e) => Poll::Ready(Err(io_error(e))),
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::collections::{HashMap, HashSet};
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::{types::ObjectIdentifier, Client};
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use tracing::{error, info, warn};
|
||||
use utils::generation::Generation;
|
||||
@@ -208,7 +208,7 @@ impl TenantObjectListing {
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
layer_file: &LayerName,
|
||||
metadata: &LayerFileMetadata,
|
||||
metadata: &IndexLayerMetadata,
|
||||
) -> bool {
|
||||
let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else {
|
||||
return false;
|
||||
|
||||
@@ -11,7 +11,7 @@ use async_stream::stream;
|
||||
use aws_sdk_s3::Client;
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::{StreamExt, TryStreamExt};
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -49,8 +49,8 @@ impl SnapshotDownloader {
|
||||
&self,
|
||||
ttid: TenantShardTimelineId,
|
||||
layer_name: LayerName,
|
||||
layer_metadata: LayerFileMetadata,
|
||||
) -> anyhow::Result<(LayerName, LayerFileMetadata)> {
|
||||
layer_metadata: IndexLayerMetadata,
|
||||
) -> anyhow::Result<(LayerName, IndexLayerMetadata)> {
|
||||
// Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. They use
|
||||
// different layer names (remote-style has the generation suffix)
|
||||
let local_path = self.output_path.join(format!(
|
||||
@@ -110,7 +110,7 @@ impl SnapshotDownloader {
|
||||
async fn download_layers(
|
||||
&self,
|
||||
ttid: TenantShardTimelineId,
|
||||
layers: Vec<(LayerName, LayerFileMetadata)>,
|
||||
layers: Vec<(LayerName, IndexLayerMetadata)>,
|
||||
) -> anyhow::Result<()> {
|
||||
let layer_count = layers.len();
|
||||
tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count);
|
||||
@@ -161,7 +161,10 @@ impl SnapshotDownloader {
|
||||
ttid: TenantShardTimelineId,
|
||||
index_part: Box<IndexPart>,
|
||||
index_part_generation: Generation,
|
||||
ancestor_layers: &mut HashMap<TenantShardTimelineId, HashMap<LayerName, LayerFileMetadata>>,
|
||||
ancestor_layers: &mut HashMap<
|
||||
TenantShardTimelineId,
|
||||
HashMap<LayerName, IndexLayerMetadata>,
|
||||
>,
|
||||
) -> anyhow::Result<()> {
|
||||
let index_bytes = serde_json::to_string(&index_part).unwrap();
|
||||
|
||||
@@ -231,7 +234,7 @@ impl SnapshotDownloader {
|
||||
// happen if this tenant has been split at some point)
|
||||
let mut ancestor_layers: HashMap<
|
||||
TenantShardTimelineId,
|
||||
HashMap<LayerName, LayerFileMetadata>,
|
||||
HashMap<LayerName, IndexLayerMetadata>,
|
||||
> = Default::default();
|
||||
|
||||
for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
|
||||
|
||||
@@ -287,26 +287,6 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))
|
||||
}
|
||||
|
||||
/// Force persist control file and remove old WAL.
|
||||
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
|
||||
let tli = GlobalTimelines::get(ttid)?;
|
||||
tli.maybe_persist_control_file(true)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
tli.remove_old_wal()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Deactivates the timeline and removes its data directory.
|
||||
async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
@@ -573,10 +553,6 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/control_file",
|
||||
|r| request_span(r, patch_control_file_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
|
||||
|r| request_span(r, timeline_checkpoint_handler),
|
||||
)
|
||||
// for tests
|
||||
.post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
|
||||
request_span(r, record_safekeeper_info)
|
||||
|
||||
@@ -11,7 +11,6 @@ use tracing::info;
|
||||
use utils::{
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
pausable_failpoint,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
@@ -163,8 +162,6 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
|
||||
filenames.remove(control_file_index);
|
||||
filenames.insert(0, "safekeeper.control".to_string());
|
||||
|
||||
pausable_failpoint!("sk-pull-timeline-after-list-pausable");
|
||||
|
||||
info!(
|
||||
"downloading {} files from safekeeper {}",
|
||||
filenames.len(),
|
||||
@@ -186,13 +183,6 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
|
||||
|
||||
let mut file = tokio::fs::File::create(&file_path).await?;
|
||||
let mut response = client.get(&http_url).send().await?;
|
||||
if response.status() != reqwest::StatusCode::OK {
|
||||
bail!(
|
||||
"pulling file {} failed: status is {}",
|
||||
filename,
|
||||
response.status()
|
||||
);
|
||||
}
|
||||
while let Some(chunk) = response.chunk().await? {
|
||||
file.write_all(&chunk).await?;
|
||||
file.flush().await?;
|
||||
|
||||
@@ -15,7 +15,7 @@ pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
for tli in &tlis {
|
||||
let ttid = tli.ttid;
|
||||
async {
|
||||
if let Err(e) = tli.maybe_persist_control_file(false).await {
|
||||
if let Err(e) = tli.maybe_persist_control_file().await {
|
||||
warn!("failed to persist control file: {e}");
|
||||
}
|
||||
if let Err(e) = tli.remove_old_wal().await {
|
||||
|
||||
@@ -827,10 +827,10 @@ where
|
||||
|
||||
/// Persist control file if there is something to save and enough time
|
||||
/// passed after the last save.
|
||||
pub async fn maybe_persist_inmem_control_file(&mut self, force: bool) -> Result<bool> {
|
||||
pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<()> {
|
||||
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
|
||||
if !force && self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
|
||||
return Ok(false);
|
||||
if self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
|
||||
return Ok(());
|
||||
}
|
||||
let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn
|
||||
|| self.state.inmem.backup_lsn > self.state.backup_lsn
|
||||
@@ -840,7 +840,7 @@ where
|
||||
self.state.flush().await?;
|
||||
trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
|
||||
}
|
||||
Ok(need_persist)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle request to append WAL.
|
||||
|
||||
@@ -104,16 +104,11 @@ pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>;
|
||||
pub struct WriteGuardSharedState<'a> {
|
||||
tli: Arc<Timeline>,
|
||||
guard: RwLockWriteGuard<'a, SharedState>,
|
||||
skip_update: bool,
|
||||
}
|
||||
|
||||
impl<'a> WriteGuardSharedState<'a> {
|
||||
fn new(tli: Arc<Timeline>, guard: RwLockWriteGuard<'a, SharedState>) -> Self {
|
||||
WriteGuardSharedState {
|
||||
tli,
|
||||
guard,
|
||||
skip_update: false,
|
||||
}
|
||||
WriteGuardSharedState { tli, guard }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -154,12 +149,10 @@ impl<'a> Drop for WriteGuardSharedState<'a> {
|
||||
}
|
||||
});
|
||||
|
||||
if !self.skip_update {
|
||||
// send notification about shared state update
|
||||
self.tli.shared_state_version_tx.send_modify(|old| {
|
||||
*old += 1;
|
||||
});
|
||||
}
|
||||
// send notification about shared state update
|
||||
self.tli.shared_state_version_tx.send_modify(|old| {
|
||||
*old += 1;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -809,11 +802,7 @@ impl Timeline {
|
||||
|
||||
// update last_removed_segno
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
if shared_state.last_removed_segno != horizon_segno {
|
||||
shared_state.last_removed_segno = horizon_segno;
|
||||
} else {
|
||||
shared_state.skip_update = true;
|
||||
}
|
||||
shared_state.last_removed_segno = horizon_segno;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -821,11 +810,12 @@ impl Timeline {
|
||||
/// passed after the last save. This helps to keep remote_consistent_lsn up
|
||||
/// to date so that storage nodes restart doesn't cause many pageserver ->
|
||||
/// safekeeper reconnections.
|
||||
pub async fn maybe_persist_control_file(self: &Arc<Self>, force: bool) -> Result<()> {
|
||||
let mut guard = self.write_shared_state().await;
|
||||
let changed = guard.sk.maybe_persist_inmem_control_file(force).await?;
|
||||
guard.skip_update = !changed;
|
||||
Ok(())
|
||||
pub async fn maybe_persist_control_file(self: &Arc<Self>) -> Result<()> {
|
||||
self.write_shared_state()
|
||||
.await
|
||||
.sk
|
||||
.maybe_persist_inmem_control_file()
|
||||
.await
|
||||
}
|
||||
|
||||
/// Gather timeline data for metrics.
|
||||
|
||||
@@ -106,7 +106,7 @@ pub async fn main_task(
|
||||
|
||||
if !is_active {
|
||||
// TODO: maybe use tokio::spawn?
|
||||
if let Err(e) = tli.maybe_persist_control_file(false).await {
|
||||
if let Err(e) = tli.maybe_persist_control_file().await {
|
||||
warn!("control file save in update_status failed: {:?}", e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,8 +5,6 @@ from typing import Any, Type, TypeVar, Union
|
||||
|
||||
T = TypeVar("T", bound="Id")
|
||||
|
||||
DEFAULT_WAL_SEG_SIZE = 16 * 1024 * 1024
|
||||
|
||||
|
||||
@total_ordering
|
||||
class Lsn:
|
||||
@@ -69,9 +67,6 @@ class Lsn:
|
||||
def as_int(self) -> int:
|
||||
return self.lsn_int
|
||||
|
||||
def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn":
|
||||
return Lsn(self.lsn_int - (self.lsn_int % seg_sz))
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Key:
|
||||
|
||||
@@ -2667,9 +2667,7 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
|
||||
)
|
||||
|
||||
def list_layers(
|
||||
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
|
||||
) -> list[Path]:
|
||||
def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
|
||||
"""
|
||||
Inspect local storage on a pageserver to discover which layer files are present.
|
||||
|
||||
@@ -3771,7 +3769,7 @@ class SafekeeperPort:
|
||||
|
||||
|
||||
@dataclass
|
||||
class Safekeeper(LogUtils):
|
||||
class Safekeeper:
|
||||
"""An object representing a running safekeeper daemon."""
|
||||
|
||||
env: NeonEnv
|
||||
@@ -3779,13 +3777,6 @@ class Safekeeper(LogUtils):
|
||||
id: int
|
||||
running: bool = False
|
||||
|
||||
def __init__(self, env: NeonEnv, port: SafekeeperPort, id: int, running: bool = False):
|
||||
self.env = env
|
||||
self.port = port
|
||||
self.id = id
|
||||
self.running = running
|
||||
self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log"
|
||||
|
||||
def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper":
|
||||
assert self.running is False
|
||||
self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts)
|
||||
@@ -3846,38 +3837,11 @@ class Safekeeper(LogUtils):
|
||||
port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled
|
||||
)
|
||||
|
||||
def get_timeline_start_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
|
||||
timeline_status = self.http_client().timeline_status(tenant_id, timeline_id)
|
||||
timeline_start_lsn = timeline_status.timeline_start_lsn
|
||||
log.info(f"sk {self.id} timeline start LSN: {timeline_start_lsn}")
|
||||
return timeline_start_lsn
|
||||
def data_dir(self) -> str:
|
||||
return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")
|
||||
|
||||
def get_flush_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
|
||||
timeline_status = self.http_client().timeline_status(tenant_id, timeline_id)
|
||||
flush_lsn = timeline_status.flush_lsn
|
||||
log.info(f"sk {self.id} flush LSN: {flush_lsn}")
|
||||
return flush_lsn
|
||||
|
||||
def pull_timeline(
|
||||
self, srcs: list[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
pull_timeline from srcs to self.
|
||||
"""
|
||||
src_https = [f"http://localhost:{sk.port.http}" for sk in srcs]
|
||||
res = self.http_client().pull_timeline(
|
||||
{"tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "http_hosts": src_https}
|
||||
)
|
||||
src_ids = [sk.id for sk in srcs]
|
||||
log.info(f"finished pulling timeline from {src_ids} to {self.id}")
|
||||
return res
|
||||
|
||||
@property
|
||||
def data_dir(self) -> Path:
|
||||
return self.env.repo_dir / "safekeepers" / f"sk{self.id}"
|
||||
|
||||
def timeline_dir(self, tenant_id, timeline_id) -> Path:
|
||||
return self.data_dir / str(tenant_id) / str(timeline_id)
|
||||
def timeline_dir(self, tenant_id, timeline_id) -> str:
|
||||
return os.path.join(self.data_dir(), str(tenant_id), str(timeline_id))
|
||||
|
||||
def list_segments(self, tenant_id, timeline_id) -> List[str]:
|
||||
"""
|
||||
@@ -3890,35 +3854,6 @@ class Safekeeper(LogUtils):
|
||||
segments.sort()
|
||||
return segments
|
||||
|
||||
def checkpoint_up_to(self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn):
|
||||
"""
|
||||
Assuming pageserver(s) uploaded to s3 up to `lsn`,
|
||||
1) wait for remote_consistent_lsn and wal_backup_lsn on safekeeper to reach it.
|
||||
2) checkpoint timeline on safekeeper, which should remove WAL before this LSN.
|
||||
"""
|
||||
cli = self.http_client()
|
||||
|
||||
def are_lsns_advanced():
|
||||
stat = cli.timeline_status(tenant_id, timeline_id)
|
||||
log.info(
|
||||
f"waiting for remote_consistent_lsn and backup_lsn on sk {self.id} to reach {lsn}, currently remote_consistent_lsn={stat.remote_consistent_lsn}, backup_lsn={stat.backup_lsn}"
|
||||
)
|
||||
assert stat.remote_consistent_lsn >= lsn and stat.backup_lsn >= lsn.segment_lsn()
|
||||
|
||||
# xxx: max wait is long because we might be waiting for reconnection from
|
||||
# pageserver to this safekeeper
|
||||
wait_until(30, 1, are_lsns_advanced)
|
||||
cli.checkpoint(tenant_id, timeline_id)
|
||||
|
||||
def wait_until_paused(self, failpoint: str):
|
||||
msg = f"at failpoint {failpoint}"
|
||||
|
||||
def paused():
|
||||
log.info(f"waiting for hitting failpoint {failpoint}")
|
||||
self.assert_log_contains(msg)
|
||||
|
||||
wait_until(20, 0.5, paused)
|
||||
|
||||
|
||||
class S3Scrubber:
|
||||
def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
|
||||
|
||||
@@ -70,7 +70,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
|
||||
# this is expected given our collaborative shutdown approach for the UploadQueue
|
||||
".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
|
||||
".*Compaction failed.*, retrying in .*: ShuttingDown",
|
||||
".*Compaction failed.*, retrying in .*: Other\\(timeline shutting down.*",
|
||||
".*Compaction failed.*, retrying in .*: timeline shutting down.*",
|
||||
# Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
|
||||
".*Error processing HTTP request: NotFound: Timeline .* was not found",
|
||||
".*took more than expected to complete.*",
|
||||
|
||||
@@ -177,13 +177,6 @@ class SafekeeperHttpClient(requests.Session):
|
||||
)
|
||||
res.raise_for_status()
|
||||
|
||||
def checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
|
||||
json={},
|
||||
)
|
||||
res.raise_for_status()
|
||||
|
||||
# only_local doesn't remove segments in the remote storage.
|
||||
def timeline_delete(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
|
||||
|
||||
@@ -196,7 +196,7 @@ def query_scalar(cur: cursor, query: str) -> Any:
|
||||
|
||||
|
||||
# Traverse directory to get total size.
|
||||
def get_dir_size(path: Path) -> int:
|
||||
def get_dir_size(path: str) -> int:
|
||||
"""Return size in bytes."""
|
||||
totalbytes = 0
|
||||
for root, _dirs, files in os.walk(path):
|
||||
@@ -541,44 +541,11 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str
|
||||
|
||||
left_list, right_list = map(build_hash_list, [left, right])
|
||||
|
||||
assert len(left_list) == len(
|
||||
right_list
|
||||
), f"unexpected number of files on tar files, {len(left_list)} != {len(right_list)}"
|
||||
try:
|
||||
assert len(left_list) == len(right_list)
|
||||
|
||||
mismatching = set()
|
||||
|
||||
for left_tuple, right_tuple in zip(left_list, right_list):
|
||||
left_path, left_hash = left_tuple
|
||||
right_path, right_hash = right_tuple
|
||||
assert (
|
||||
left_path == right_path
|
||||
), f"file count matched, expected these to be same paths: {left_path}, {right_path}"
|
||||
if left_hash != right_hash:
|
||||
mismatching.add(left_path)
|
||||
|
||||
assert len(mismatching) == 0, f"files with hash mismatch: {mismatching}"
|
||||
|
||||
elapsed = time.time() - started_at
|
||||
log.info(f"assert_pageserver_backups_equal completed in {elapsed}s")
|
||||
|
||||
|
||||
class PropagatingThread(threading.Thread):
|
||||
_target: Any
|
||||
_args: Any
|
||||
_kwargs: Any
|
||||
"""
|
||||
Simple Thread wrapper with join() propagating the possible exception in the thread.
|
||||
"""
|
||||
|
||||
def run(self):
|
||||
self.exc = None
|
||||
try:
|
||||
self.ret = self._target(*self._args, **self._kwargs)
|
||||
except BaseException as e:
|
||||
self.exc = e
|
||||
|
||||
def join(self, timeout=None):
|
||||
super(PropagatingThread, self).join(timeout)
|
||||
if self.exc:
|
||||
raise self.exc
|
||||
return self.ret
|
||||
for left_tuple, right_tuple in zip(left_list, right_list):
|
||||
assert left_tuple == right_tuple
|
||||
finally:
|
||||
elapsed = time.time() - started_at
|
||||
log.info(f"assert_pageserver_backups_equal completed in {elapsed}s")
|
||||
|
||||
@@ -17,13 +17,9 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# eviction might be the first one after an attach to access the layers
|
||||
".*unexpectedly on-demand downloading remote layer .* for task kind Eviction",
|
||||
# detach can happen before we get to validate the generation number
|
||||
".*deletion backend: Dropped remote consistent LSN updates for tenant.*",
|
||||
]
|
||||
# eviction might be the first one after an attach to access the layers
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*unexpectedly on-demand downloading remote layer .* for task kind Eviction"
|
||||
)
|
||||
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
|
||||
return env
|
||||
|
||||
@@ -163,7 +163,7 @@ def test_import_from_pageserver_small(
|
||||
|
||||
num_rows = 3000
|
||||
lsn = _generate_data(num_rows, endpoint)
|
||||
_import(num_rows, lsn, env, pg_bin, timeline, test_output_dir)
|
||||
_import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir)
|
||||
|
||||
|
||||
@pytest.mark.timeout(1800)
|
||||
@@ -193,7 +193,9 @@ def test_import_from_pageserver_multisegment(
|
||||
log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB")
|
||||
assert logical_size > 1024**3 # = 1GB
|
||||
|
||||
tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir)
|
||||
tar_output_file = _import(
|
||||
num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir
|
||||
)
|
||||
|
||||
# Check if the backup data contains multiple segment files
|
||||
cnt_seg_files = 0
|
||||
@@ -233,6 +235,7 @@ def _import(
|
||||
env: NeonEnv,
|
||||
pg_bin: PgBin,
|
||||
timeline: TimelineId,
|
||||
pg_distrib_dir: Path,
|
||||
test_output_dir: Path,
|
||||
) -> Path:
|
||||
"""Test importing backup data to the pageserver.
|
||||
@@ -292,7 +295,7 @@ def _import(
|
||||
wait_for_upload(client, tenant, timeline, lsn)
|
||||
|
||||
# Check it worked
|
||||
endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant, lsn=lsn)
|
||||
endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant)
|
||||
assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)]
|
||||
|
||||
# Take another fullbackup
|
||||
|
||||
@@ -1,282 +0,0 @@
|
||||
from contextlib import closing
|
||||
from typing import Set
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonPageserver
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from psycopg2.errors import QueryCanceled
|
||||
|
||||
CRITICAL_PG_PS_WAIT_FAILPOINTS: Set[str] = {
|
||||
"ps::connection-start::pre-login",
|
||||
"ps::connection-start::startup-packet",
|
||||
"ps::connection-start::process-query",
|
||||
"ps::handle-pagerequest-message::exists",
|
||||
"ps::handle-pagerequest-message::nblocks",
|
||||
"ps::handle-pagerequest-message::getpage",
|
||||
"ps::handle-pagerequest-message::dbsize",
|
||||
# We don't yet have a good way to on-demand guarantee the download of an
|
||||
# SLRU segment, so that's disabled for now.
|
||||
# "ps::handle-pagerequest-message::slrusegment",
|
||||
}
|
||||
|
||||
PG_PS_START_FAILPOINTS = {
|
||||
"ps::connection-start::pre-login",
|
||||
"ps::connection-start::startup-packet",
|
||||
"ps::connection-start::process-query",
|
||||
}
|
||||
SMGR_EXISTS = "ps::handle-pagerequest-message::exists"
|
||||
SMGR_NBLOCKS = "ps::handle-pagerequest-message::nblocks"
|
||||
SMGR_GETPAGE = "ps::handle-pagerequest-message::getpage"
|
||||
SMGR_DBSIZE = "ps::handle-pagerequest-message::dbsize"
|
||||
|
||||
"""
|
||||
Test that we can handle connection delays and cancellations at various
|
||||
unfortunate connection startup and request states.
|
||||
"""
|
||||
|
||||
|
||||
def test_cancellations(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
ps = env.pageserver
|
||||
ps_http = ps.http_client()
|
||||
ps_http.is_testing_enabled_or_skip()
|
||||
|
||||
env.neon_cli.create_branch("test_config", "empty")
|
||||
|
||||
# We don't want to have any racy behaviour with autovacuum IOs
|
||||
ep = env.endpoints.create_start(
|
||||
"test_config",
|
||||
config_lines=[
|
||||
"autovacuum = off",
|
||||
"shared_buffers = 128MB",
|
||||
],
|
||||
)
|
||||
|
||||
with closing(ep.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE TABLE test1 AS
|
||||
SELECT id, sha256(id::text::bytea) payload
|
||||
FROM generate_series(1, 1024::bigint) p(id);
|
||||
"""
|
||||
)
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE TABLE test2 AS
|
||||
SELECT id, sha256(id::text::bytea) payload
|
||||
FROM generate_series(1025, 2048::bigint) p(id);
|
||||
"""
|
||||
)
|
||||
cur.execute(
|
||||
"""
|
||||
VACUUM (ANALYZE, FREEZE) test1, test2;
|
||||
"""
|
||||
)
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE EXTENSION pg_buffercache;
|
||||
"""
|
||||
)
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE EXTENSION pg_prewarm;
|
||||
"""
|
||||
)
|
||||
|
||||
# data preparation is now complete, with 2 disjoint tables that aren't
|
||||
# preloaded into any caches.
|
||||
|
||||
ep.stop()
|
||||
|
||||
for failpoint in CRITICAL_PG_PS_WAIT_FAILPOINTS:
|
||||
connect_works_correctly(failpoint, ep, ps, ps_http)
|
||||
|
||||
|
||||
ENABLED_FAILPOINTS: Set[str] = set()
|
||||
|
||||
|
||||
def connect_works_correctly(
|
||||
failpoint: str, ep: Endpoint, ps: NeonPageserver, ps_http: PageserverHttpClient
|
||||
):
|
||||
log.debug("Starting work on %s", failpoint)
|
||||
# All queries we use should finish (incl. IO) within 500ms,
|
||||
# including all their IO.
|
||||
# This allows us to use `SET statement_timeout` to let the query
|
||||
# timeout system cancel queries, rather than us having to go
|
||||
# through the most annoying effort of manual query cancellation
|
||||
# in psycopg2.
|
||||
options = "-cstatement_timeout=500ms -ceffective_io_concurrency=1"
|
||||
|
||||
ep.start()
|
||||
|
||||
def fp_enable():
|
||||
global ENABLED_FAILPOINTS
|
||||
ps_http.configure_failpoints(
|
||||
[
|
||||
(failpoint, "pause"),
|
||||
]
|
||||
)
|
||||
ENABLED_FAILPOINTS = ENABLED_FAILPOINTS | {failpoint}
|
||||
log.info(
|
||||
'Enabled failpoint "%s", current_active=%s', failpoint, ENABLED_FAILPOINTS, stacklevel=2
|
||||
)
|
||||
|
||||
def fp_disable():
|
||||
global ENABLED_FAILPOINTS
|
||||
ps_http.configure_failpoints(
|
||||
[
|
||||
(failpoint, "off"),
|
||||
]
|
||||
)
|
||||
ENABLED_FAILPOINTS = ENABLED_FAILPOINTS - {failpoint}
|
||||
log.info(
|
||||
'Disabled failpoint "%s", current_active=%s',
|
||||
failpoint,
|
||||
ENABLED_FAILPOINTS,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
def check_buffers(cur):
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT n.nspname AS nspname
|
||||
, c.relname AS relname
|
||||
, count(*) AS count
|
||||
FROM pg_buffercache b
|
||||
JOIN pg_class c
|
||||
ON b.relfilenode = pg_relation_filenode(c.oid) AND
|
||||
b.reldatabase = (SELECT oid FROM pg_database WHERE datname = current_database())
|
||||
JOIN pg_namespace n ON n.oid = c.relnamespace
|
||||
WHERE c.oid IN ('test1'::regclass::oid, 'test2'::regclass::oid)
|
||||
GROUP BY n.nspname, c.relname
|
||||
ORDER BY 3 DESC
|
||||
LIMIT 10
|
||||
"""
|
||||
)
|
||||
return cur.fetchone()
|
||||
|
||||
def exec_may_cancel(query, cursor, result, cancels):
|
||||
if cancels:
|
||||
with pytest.raises(QueryCanceled):
|
||||
cursor.execute(query)
|
||||
assert cursor.fetchone() == result
|
||||
else:
|
||||
cursor.execute(query)
|
||||
assert cursor.fetchone() == result
|
||||
|
||||
fp_disable()
|
||||
|
||||
# Warm caches required for new connections, so that they can run without
|
||||
# requiring catalog reads.
|
||||
with closing(ep.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT 1;
|
||||
"""
|
||||
)
|
||||
assert cur.fetchone() == (1,)
|
||||
|
||||
assert check_buffers(cur) is None
|
||||
# Ensure all caches required for connection start are correctly
|
||||
# filled, so that we don't have any "accidents" in this test run
|
||||
# caused by changes in connection startup plans that require
|
||||
# requests to the PageServer.
|
||||
cur.execute(
|
||||
"""
|
||||
select array_agg(distinct (pg_prewarm(c.oid::regclass, 'buffer') >= 0))
|
||||
from pg_class c
|
||||
where c.oid < 16384 AND c.relkind IN ('i', 'r');
|
||||
"""
|
||||
)
|
||||
assert cur.fetchone() == ([True],)
|
||||
|
||||
# Enable failpoint
|
||||
fp_enable()
|
||||
|
||||
with closing(ep.connect(options=options, autocommit=True)) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SHOW statement_timeout;")
|
||||
assert cur.fetchone() == ("500ms",)
|
||||
assert check_buffers(cur) is None
|
||||
exec_may_cancel(
|
||||
"""
|
||||
SELECT min(id) FROM test1;
|
||||
""",
|
||||
cur,
|
||||
(1,),
|
||||
failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}),
|
||||
)
|
||||
|
||||
fp_disable()
|
||||
|
||||
with closing(ep.connect(options=options, autocommit=True)) as conn:
|
||||
with conn.cursor() as cur:
|
||||
# Do a select on the data, putting some buffers into the prefetch
|
||||
# queue.
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT count(id) FROM (select * from test1 LIMIT 256) a;
|
||||
"""
|
||||
)
|
||||
assert cur.fetchone() == (256,)
|
||||
|
||||
ps.stop()
|
||||
ps.start()
|
||||
fp_enable()
|
||||
|
||||
exec_may_cancel(
|
||||
"""
|
||||
SELECT COUNT(id) FROM test1;
|
||||
""",
|
||||
cur,
|
||||
(1024,),
|
||||
failpoint
|
||||
in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_NBLOCKS, SMGR_DBSIZE}),
|
||||
)
|
||||
|
||||
with closing(ep.connect(options=options, autocommit=True)) as conn:
|
||||
with conn.cursor() as cur:
|
||||
exec_may_cancel(
|
||||
"""
|
||||
SELECT COUNT(id) FROM test2;
|
||||
""",
|
||||
cur,
|
||||
(1024,),
|
||||
failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}),
|
||||
)
|
||||
|
||||
fp_disable()
|
||||
fp_enable()
|
||||
|
||||
exec_may_cancel(
|
||||
"""
|
||||
SELECT 0 < pg_database_size(CURRENT_DATABASE());
|
||||
""",
|
||||
cur,
|
||||
(True,),
|
||||
failpoint
|
||||
in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_GETPAGE, SMGR_NBLOCKS}),
|
||||
)
|
||||
|
||||
fp_disable()
|
||||
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test2;
|
||||
"""
|
||||
)
|
||||
|
||||
assert cur.fetchone() == (1024, 1024, 1025, 2048, 1573376)
|
||||
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test1;
|
||||
"""
|
||||
)
|
||||
|
||||
assert cur.fetchone() == (1024, 1024, 1, 1024, 524800)
|
||||
|
||||
ep.stop()
|
||||
@@ -177,16 +177,7 @@ def test_sharding_split_unsharded(
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"failpoint",
|
||||
[
|
||||
None,
|
||||
"compact-shard-ancestors-localonly",
|
||||
"compact-shard-ancestors-enqueued",
|
||||
"compact-shard-ancestors-persistent",
|
||||
],
|
||||
)
|
||||
def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]):
|
||||
def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test that after a split, we clean up parent layer data in the child shards via compaction.
|
||||
"""
|
||||
@@ -205,11 +196,6 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint:
|
||||
"image_layer_creation_check_threshold": "0",
|
||||
}
|
||||
|
||||
neon_env_builder.storage_controller_config = {
|
||||
# Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
|
||||
"max_unavailable": "300s"
|
||||
}
|
||||
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
@@ -227,10 +213,6 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint:
|
||||
# Split one shard into two
|
||||
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2)
|
||||
|
||||
# Let all shards move into their stable locations, so that during subsequent steps we
|
||||
# don't have reconciles in progress (simpler to reason about what messages we expect in logs)
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
# Check we got the shard IDs we expected
|
||||
assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None
|
||||
assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None
|
||||
@@ -255,90 +237,6 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint:
|
||||
# Compaction shouldn't make anything unreadable
|
||||
workload.validate()
|
||||
|
||||
# Force a generation increase: layer rewrites are a long-term thing and only happen after
|
||||
# the generation has increased.
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
# Cleanup part 2: once layers are outside the PITR window, they will be rewritten if they are partially redundant
|
||||
env.storage_controller.pageserver_api().set_tenant_config(tenant_id, {"pitr_interval": "0s"})
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
for shard in shards:
|
||||
ps = env.get_tenant_pageserver(shard)
|
||||
|
||||
# Apply failpoints for the layer-rewriting phase: this is the area of code that has sensitive behavior
|
||||
# across restarts, as we will have local layer files that temporarily disagree with the remote metadata
|
||||
# for the same local layer file name.
|
||||
if failpoint is not None:
|
||||
ps.http_client().configure_failpoints((failpoint, "exit"))
|
||||
|
||||
# Do a GC to update gc_info (compaction uses this to decide whether a layer is to be rewritten)
|
||||
# Set gc_horizon=0 to let PITR horizon control GC cutoff exclusively.
|
||||
ps.http_client().timeline_gc(shard, timeline_id, gc_horizon=0)
|
||||
|
||||
# We will compare stats before + after compaction
|
||||
detail_before = ps.http_client().timeline_detail(shard, timeline_id)
|
||||
|
||||
# Invoke compaction: this should rewrite layers that are behind the pitr horizon
|
||||
try:
|
||||
ps.http_client().timeline_compact(shard, timeline_id)
|
||||
except requests.ConnectionError as e:
|
||||
if failpoint is None:
|
||||
raise e
|
||||
else:
|
||||
log.info(f"Compaction failed (failpoint={failpoint}): {e}")
|
||||
|
||||
if failpoint in (
|
||||
"compact-shard-ancestors-localonly",
|
||||
"compact-shard-ancestors-enqueued",
|
||||
):
|
||||
# If we left local files that don't match remote metadata, we expect warnings on next startup
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*removing local file .+ because it has unexpected length.*"
|
||||
)
|
||||
|
||||
# Post-failpoint: we check that the pageserver comes back online happily.
|
||||
env.pageserver.running = False
|
||||
env.pageserver.start()
|
||||
else:
|
||||
assert failpoint is None # We shouldn't reach success path if a failpoint was set
|
||||
|
||||
detail_after = ps.http_client().timeline_detail(shard, timeline_id)
|
||||
|
||||
# Physical size should shrink because layers are smaller
|
||||
assert detail_after["current_physical_size"] < detail_before["current_physical_size"]
|
||||
|
||||
# Validate size statistics
|
||||
for shard in shards:
|
||||
ps = env.get_tenant_pageserver(shard)
|
||||
timeline_info = ps.http_client().timeline_detail(shard, timeline_id)
|
||||
reported_size = timeline_info["current_physical_size"]
|
||||
layer_paths = ps.list_layers(shard, timeline_id)
|
||||
measured_size = 0
|
||||
for p in layer_paths:
|
||||
abs_path = ps.timeline_dir(shard, timeline_id) / p
|
||||
measured_size += os.stat(abs_path).st_size
|
||||
|
||||
log.info(
|
||||
f"shard {shard} reported size {reported_size}, measured size {measured_size} ({len(layer_paths)} layers)"
|
||||
)
|
||||
|
||||
if failpoint in (
|
||||
"compact-shard-ancestors-localonly",
|
||||
"compact-shard-ancestors-enqueued",
|
||||
):
|
||||
# If we injected a failure between local rewrite and remote upload, then after
|
||||
# restart we may end up with neither version of the file on local disk (the new file
|
||||
# is cleaned up because it doesn't matchc remote metadata). So local size isn't
|
||||
# necessarily going to match remote physical size.
|
||||
continue
|
||||
|
||||
assert measured_size == reported_size
|
||||
|
||||
# Compaction shouldn't make anything unreadable
|
||||
workload.validate()
|
||||
|
||||
|
||||
def test_sharding_split_smoke(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
|
||||
@@ -23,6 +23,7 @@ from fixtures.log_helper import log
|
||||
from fixtures.metrics import parse_metrics
|
||||
from fixtures.neon_fixtures import (
|
||||
Endpoint,
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
NeonPageserver,
|
||||
PgBin,
|
||||
@@ -47,7 +48,7 @@ from fixtures.remote_storage import (
|
||||
)
|
||||
from fixtures.safekeeper.http import SafekeeperHttpClient
|
||||
from fixtures.safekeeper.utils import are_walreceivers_absent
|
||||
from fixtures.utils import PropagatingThread, get_dir_size, query_scalar, start_in_background
|
||||
from fixtures.utils import get_dir_size, query_scalar, start_in_background
|
||||
|
||||
|
||||
def wait_lsn_force_checkpoint(
|
||||
@@ -359,7 +360,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
|
||||
# We will wait for first segment removal. Make sure they exist for starter.
|
||||
first_segments = [
|
||||
sk.timeline_dir(tenant_id, timeline_id) / "000000010000000000000001"
|
||||
os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id), "000000010000000000000001")
|
||||
for sk in env.safekeepers
|
||||
]
|
||||
assert all(os.path.exists(p) for p in first_segments)
|
||||
@@ -444,7 +445,7 @@ def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: Tim
|
||||
def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb):
|
||||
http_cli = sk.http_client()
|
||||
tli_status = http_cli.timeline_status(tenant_id, timeline_id)
|
||||
sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id))
|
||||
sk_wal_size = get_dir_size(os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id)))
|
||||
sk_wal_size_mb = sk_wal_size / 1024 / 1024
|
||||
log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}")
|
||||
return sk_wal_size_mb <= target_size_mb
|
||||
@@ -590,10 +591,10 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# save the last (partial) file to put it back after recreation; others will be fetched from s3
|
||||
sk = env.safekeepers[0]
|
||||
tli_dir = Path(sk.data_dir) / str(tenant_id) / str(timeline_id)
|
||||
tli_dir = Path(sk.data_dir()) / str(tenant_id) / str(timeline_id)
|
||||
f_partial = Path([f for f in os.listdir(tli_dir) if f.endswith(".partial")][0])
|
||||
f_partial_path = tli_dir / f_partial
|
||||
f_partial_saved = Path(sk.data_dir) / f_partial.name
|
||||
f_partial_saved = Path(sk.data_dir()) / f_partial.name
|
||||
f_partial_path.rename(f_partial_saved)
|
||||
|
||||
pg_version = sk.http_client().timeline_status(tenant_id, timeline_id).pg_version
|
||||
@@ -615,7 +616,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
|
||||
cli = sk.http_client()
|
||||
cli.timeline_create(tenant_id, timeline_id, pg_version, last_lsn)
|
||||
f_partial_path = (
|
||||
Path(sk.data_dir) / str(tenant_id) / str(timeline_id) / f_partial_saved.name
|
||||
Path(sk.data_dir()) / str(tenant_id) / str(timeline_id) / f_partial_saved.name
|
||||
)
|
||||
shutil.copy(f_partial_saved, f_partial_path)
|
||||
|
||||
@@ -1131,8 +1132,8 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline
|
||||
)
|
||||
|
||||
for f in mismatch:
|
||||
f1 = sk0.timeline_dir(tenant_id, timeline_id) / f
|
||||
f2 = sk.timeline_dir(tenant_id, timeline_id) / f
|
||||
f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f)
|
||||
f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f)
|
||||
stdout_filename = f"{f2}.filediff"
|
||||
|
||||
with open(stdout_filename, "w") as stdout_f:
|
||||
@@ -1630,7 +1631,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("CREATE TABLE t(key int primary key)")
|
||||
sk = env.safekeepers[0]
|
||||
sk_data_dir = sk.data_dir
|
||||
sk_data_dir = Path(sk.data_dir())
|
||||
if not auth_enabled:
|
||||
sk_http = sk.http_client()
|
||||
sk_http_other = sk_http
|
||||
@@ -1723,6 +1724,9 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
|
||||
|
||||
def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
|
||||
def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str:
|
||||
return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names])
|
||||
|
||||
def execute_payload(endpoint: Endpoint):
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -1808,65 +1812,6 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
|
||||
show_statuses(env.safekeepers, tenant_id, timeline_id)
|
||||
|
||||
|
||||
# Test pull_timeline while concurrently gc'ing WAL on safekeeper:
|
||||
# 1) Start pull_timeline, listing files to fetch.
|
||||
# 2) Write segment, do gc.
|
||||
# 3) Finish pull_timeline.
|
||||
# 4) Do some write, verify integrity with timeline_digest.
|
||||
# Expected to fail while holding off WAL gc plus fetching commit_lsn WAL
|
||||
# segment is not implemented.
|
||||
@pytest.mark.xfail
|
||||
def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
|
||||
env = neon_env_builder.init_start()
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
(src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2])
|
||||
|
||||
log.info("use only first 2 safekeepers, 3rd will be seeded")
|
||||
endpoint = env.endpoints.create("main")
|
||||
endpoint.active_safekeepers = [1, 2]
|
||||
endpoint.start()
|
||||
endpoint.safe_psql("create table t(key int, value text)")
|
||||
endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'")
|
||||
|
||||
src_flush_lsn = src_sk.get_flush_lsn(tenant_id, timeline_id)
|
||||
log.info(f"flush_lsn on src before pull_timeline: {src_flush_lsn}")
|
||||
|
||||
dst_http = dst_sk.http_client()
|
||||
# run pull_timeline which will halt before downloading files
|
||||
dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "pause"))
|
||||
pt_handle = PropagatingThread(
|
||||
target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id)
|
||||
)
|
||||
pt_handle.start()
|
||||
dst_sk.wait_until_paused("sk-pull-timeline-after-list-pausable")
|
||||
|
||||
# ensure segment exists
|
||||
endpoint.safe_psql("insert into t select generate_series(1, 180000), 'papaya'")
|
||||
lsn = last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
|
||||
assert lsn > Lsn("0/2000000")
|
||||
# Checkpoint timeline beyond lsn.
|
||||
src_sk.checkpoint_up_to(tenant_id, timeline_id, lsn)
|
||||
first_segment_p = src_sk.timeline_dir(tenant_id, timeline_id) / "000000010000000000000001"
|
||||
log.info(f"first segment exist={os.path.exists(first_segment_p)}")
|
||||
|
||||
dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "off"))
|
||||
pt_handle.join()
|
||||
|
||||
timeline_start_lsn = src_sk.get_timeline_start_lsn(tenant_id, timeline_id)
|
||||
dst_flush_lsn = dst_sk.get_flush_lsn(tenant_id, timeline_id)
|
||||
log.info(f"flush_lsn on dst after pull_timeline: {dst_flush_lsn}")
|
||||
assert dst_flush_lsn >= src_flush_lsn
|
||||
digests = [
|
||||
sk.http_client().timeline_digest(tenant_id, timeline_id, timeline_start_lsn, dst_flush_lsn)
|
||||
for sk in [src_sk, dst_sk]
|
||||
]
|
||||
assert digests[0] == digests[1], f"digest on src is {digests[0]} but on dst is {digests[1]}"
|
||||
|
||||
|
||||
# In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
|
||||
# when compute is active, but there are no writes to the timeline. In that case
|
||||
# pageserver should maintain a single connection to safekeeper and don't attempt
|
||||
|
||||
@@ -531,64 +531,6 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder):
|
||||
asyncio.run(run_recovery_uncommitted(env))
|
||||
|
||||
|
||||
async def run_wal_truncation(env: NeonEnv):
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
(sk1, sk2, sk3) = env.safekeepers
|
||||
|
||||
ep = env.endpoints.create_start("main")
|
||||
ep.safe_psql("create table t (key int, value text)")
|
||||
ep.safe_psql("insert into t select generate_series(1, 100), 'payload'")
|
||||
|
||||
# insert with only one sk3 up to create tail of flushed but not committed WAL on it
|
||||
sk1.stop()
|
||||
sk2.stop()
|
||||
conn = await ep.connect_async()
|
||||
# query should hang, so execute in separate task
|
||||
bg_query = asyncio.create_task(
|
||||
conn.execute("insert into t select generate_series(1, 180000), 'Papaya'")
|
||||
)
|
||||
sleep_sec = 2
|
||||
await asyncio.sleep(sleep_sec)
|
||||
# it must still be not finished
|
||||
assert not bg_query.done()
|
||||
# note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
|
||||
ep.stop_and_destroy()
|
||||
|
||||
# stop sk3 as well
|
||||
sk3.stop()
|
||||
|
||||
# now start sk1 and sk2 and make them commit something
|
||||
sk1.start()
|
||||
sk2.start()
|
||||
ep = env.endpoints.create_start(
|
||||
"main",
|
||||
)
|
||||
ep.safe_psql("insert into t select generate_series(1, 200), 'payload'")
|
||||
|
||||
# start sk3 and wait for it to catch up
|
||||
sk3.start()
|
||||
flush_lsn = Lsn(ep.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()"))
|
||||
await wait_for_lsn(sk3, tenant_id, timeline_id, flush_lsn)
|
||||
|
||||
timeline_start_lsn = sk1.get_timeline_start_lsn(tenant_id, timeline_id)
|
||||
digests = [
|
||||
sk.http_client().timeline_digest(tenant_id, timeline_id, timeline_start_lsn, flush_lsn)
|
||||
for sk in [sk1, sk2]
|
||||
]
|
||||
assert digests[0] == digests[1], f"digest on sk1 is {digests[0]} but on sk3 is {digests[1]}"
|
||||
|
||||
|
||||
# Simple deterministic test creating tail of WAL on safekeeper which is
|
||||
# truncated when majority without this sk elects walproposer starting earlier.
|
||||
def test_wal_truncation(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
asyncio.run(run_wal_truncation(env))
|
||||
|
||||
|
||||
async def run_segment_init_failure(env: NeonEnv):
|
||||
env.neon_cli.create_branch("test_segment_init_failure")
|
||||
ep = env.endpoints.create_start("test_segment_init_failure")
|
||||
|
||||
@@ -194,7 +194,7 @@ files:
|
||||
|
||||
- metric_name: pg_stats_userdb
|
||||
type: gauge
|
||||
help: 'Stats for several oldest non-system dbs'
|
||||
help: 'Stats for the oldest non-system db'
|
||||
key_labels:
|
||||
- datname
|
||||
value_label: kind
|
||||
@@ -205,8 +205,9 @@ files:
|
||||
- inserted
|
||||
- updated
|
||||
- deleted
|
||||
# We export stats for 10 non-system database. Without this limit
|
||||
# We export stats for only one non-system database. Without this limit
|
||||
# it is too easy to abuse the system by creating lots of databases.
|
||||
# We can try lifting this limit in the future after we understand the needs better.
|
||||
query: |
|
||||
select pg_database_size(datname) as db_size, deadlocks,
|
||||
tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
|
||||
@@ -217,7 +218,7 @@ files:
|
||||
from pg_database
|
||||
where datname <> 'postgres' and not datistemplate
|
||||
order by oid
|
||||
limit 10
|
||||
limit 1
|
||||
);
|
||||
|
||||
- metric_name: max_cluster_size
|
||||
@@ -319,7 +320,7 @@ files:
|
||||
|
||||
- metric_name: wal_is_lost
|
||||
type: gauge
|
||||
help: 'Whether or not the replication slot wal_status is lost'
|
||||
help: 'Whether or not the replication slot\'s wal_status is lost'
|
||||
key_labels:
|
||||
- slot_name
|
||||
values: [wal_status_is_lost]
|
||||
|
||||
Reference in New Issue
Block a user