mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-13 23:50:36 +00:00
Compare commits
35 Commits
ci-run/pr-
...
undo_unlog
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
92e72cc3f3 | ||
|
|
f9416ebf2b | ||
|
|
0c9dee9d06 | ||
|
|
5a5775806f | ||
|
|
947f8c59dd | ||
|
|
520101170f | ||
|
|
1bd86c5c6a | ||
|
|
e4fc6c3162 | ||
|
|
fcd7d7008f | ||
|
|
acf0a11fea | ||
|
|
c1f55c1525 | ||
|
|
34f450c05a | ||
|
|
db477c0b8c | ||
|
|
a345cf3fc6 | ||
|
|
e98bc4fd2b | ||
|
|
7e60563910 | ||
|
|
ef83f31e77 | ||
|
|
9fda85b486 | ||
|
|
87afbf6b24 | ||
|
|
16b2e74037 | ||
|
|
5a394fde56 | ||
|
|
7ec70b5eff | ||
|
|
1fcc2b37eb | ||
|
|
af40bf3c2e | ||
|
|
e6db8069b0 | ||
|
|
98dadf8543 | ||
|
|
c18b1c0646 | ||
|
|
f20a9e760f | ||
|
|
33395dcf4e | ||
|
|
1eca8b8a6b | ||
|
|
167394a073 | ||
|
|
9a081c230f | ||
|
|
fddd11dd1a | ||
|
|
238fa47bee | ||
|
|
b0a954bde2 |
14
.github/workflows/actionlint.yml
vendored
14
.github/workflows/actionlint.yml
vendored
@@ -24,7 +24,7 @@ jobs:
|
||||
|
||||
actionlint:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: reviewdog/action-actionlint@v1
|
||||
@@ -36,3 +36,15 @@ jobs:
|
||||
fail_on_error: true
|
||||
filter_mode: nofilter
|
||||
level: error
|
||||
- run: |
|
||||
PAT='^\s*runs-on:.*-latest'
|
||||
if grep -ERq $PAT .github/workflows
|
||||
then
|
||||
grep -ERl $PAT .github/workflows |\
|
||||
while read -r f
|
||||
do
|
||||
l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
|
||||
echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
|
||||
done
|
||||
exit 1
|
||||
fi
|
||||
|
||||
6
.github/workflows/approved-for-ci-run.yml
vendored
6
.github/workflows/approved-for-ci-run.yml
vendored
@@ -44,7 +44,7 @@ jobs:
|
||||
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
@@ -60,7 +60,7 @@ jobs:
|
||||
github.event.action == 'labeled' &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
@@ -109,7 +109,7 @@ jobs:
|
||||
github.event.action == 'closed' &&
|
||||
github.event.pull_request.head.repo.full_name != github.repository
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch
|
||||
|
||||
2
.github/workflows/benchmarking.yml
vendored
2
.github/workflows/benchmarking.yml
vendored
@@ -137,7 +137,7 @@ jobs:
|
||||
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
||||
env:
|
||||
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
|
||||
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
|
||||
|
||||
@@ -88,7 +88,7 @@ jobs:
|
||||
|
||||
merge-images:
|
||||
needs: [ build-image ]
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
env:
|
||||
IMAGE_TAG: ${{ inputs.image-tag }}
|
||||
|
||||
12
.github/workflows/build_and_test.yml
vendored
12
.github/workflows/build_and_test.yml
vendored
@@ -35,7 +35,7 @@ jobs:
|
||||
cancel-previous-e2e-tests:
|
||||
needs: [ check-permissions ]
|
||||
if: github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Cancel previous e2e-tests runs for this PR
|
||||
@@ -549,7 +549,7 @@ jobs:
|
||||
report-benchmarks-failures:
|
||||
needs: [ benchmarks, create-test-report ]
|
||||
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: slackapi/slack-github-action@v1
|
||||
@@ -774,7 +774,7 @@ jobs:
|
||||
|
||||
neon-image:
|
||||
needs: [ neon-image-arch, tag ]
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
@@ -884,7 +884,7 @@ jobs:
|
||||
|
||||
compute-node-image:
|
||||
needs: [ compute-node-image-arch, tag ]
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -1032,7 +1032,7 @@ jobs:
|
||||
|
||||
promote-images:
|
||||
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
env:
|
||||
VERSIONS: v14 v15 v16
|
||||
@@ -1077,7 +1077,7 @@ jobs:
|
||||
|
||||
trigger-custom-extensions-build-and-wait:
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Set PR's status to pending and request a remote CI test
|
||||
run: |
|
||||
|
||||
@@ -19,7 +19,7 @@ permissions: {}
|
||||
|
||||
jobs:
|
||||
check-image:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
|
||||
found: ${{ steps.check-image.outputs.found }}
|
||||
|
||||
2
.github/workflows/check-permissions.yml
vendored
2
.github/workflows/check-permissions.yml
vendored
@@ -16,7 +16,7 @@ permissions: {}
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Disallow CI runs on PRs from forks
|
||||
if: |
|
||||
|
||||
@@ -9,7 +9,7 @@ on:
|
||||
|
||||
jobs:
|
||||
cleanup:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Cleanup
|
||||
run: |
|
||||
|
||||
2
.github/workflows/pg_clients.yml
vendored
2
.github/workflows/pg_clients.yml
vendored
@@ -20,7 +20,7 @@ concurrency:
|
||||
jobs:
|
||||
test-postgres-client-libs:
|
||||
# TODO: switch to gen2 runner, requires docker
|
||||
runs-on: [ ubuntu-latest ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
env:
|
||||
DEFAULT_PG_VERSION: 14
|
||||
|
||||
2
.github/workflows/pin-build-tools-image.yml
vendored
2
.github/workflows/pin-build-tools-image.yml
vendored
@@ -26,7 +26,7 @@ permissions: {}
|
||||
|
||||
jobs:
|
||||
tag-image:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
env:
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
|
||||
2
.github/workflows/release-notify.yml
vendored
2
.github/workflows/release-notify.yml
vendored
@@ -19,7 +19,7 @@ on:
|
||||
|
||||
jobs:
|
||||
notify:
|
||||
runs-on: [ ubuntu-latest ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: neondatabase/dev-actions/release-pr-notify@main
|
||||
|
||||
4
.github/workflows/release.yml
vendored
4
.github/workflows/release.yml
vendored
@@ -26,7 +26,7 @@ defaults:
|
||||
jobs:
|
||||
create-storage-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
permissions:
|
||||
contents: write # for `git push`
|
||||
@@ -65,7 +65,7 @@ jobs:
|
||||
|
||||
create-proxy-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
permissions:
|
||||
contents: write # for `git push`
|
||||
|
||||
6
.github/workflows/trigger-e2e-tests.yml
vendored
6
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -19,7 +19,7 @@ env:
|
||||
jobs:
|
||||
cancel-previous-e2e-tests:
|
||||
if: github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Cancel previous e2e-tests runs for this PR
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
|
||||
|
||||
tag:
|
||||
runs-on: [ ubuntu-latest ]
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
build-tag: ${{ steps.build-tag.outputs.tag }}
|
||||
|
||||
@@ -62,7 +62,7 @@ jobs:
|
||||
|
||||
trigger-e2e-tests:
|
||||
needs: [ tag ]
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
env:
|
||||
TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
steps:
|
||||
|
||||
13
Cargo.lock
generated
13
Cargo.lock
generated
@@ -2915,6 +2915,12 @@ version = "0.4.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.10"
|
||||
@@ -3564,6 +3570,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"svg_fmt",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
@@ -4113,6 +4120,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"itertools",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
"rand 0.8.5",
|
||||
@@ -6156,7 +6164,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-epoll-uring"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"nix 0.26.4",
|
||||
@@ -6668,11 +6676,12 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "uring-common"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"io-uring",
|
||||
"libc",
|
||||
"linux-raw-sys 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use anyhow::{bail, Result};
|
||||
use byteorder::{ByteOrder, BE};
|
||||
use bytes::BufMut;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -53,14 +52,8 @@ impl Key {
|
||||
/// Encode a metadata key to a storage key.
|
||||
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
|
||||
assert!(is_metadata_key_slice(key), "key not in metadata key range");
|
||||
Key {
|
||||
field1: key[0],
|
||||
field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
|
||||
field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
|
||||
field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
|
||||
field5: key[11],
|
||||
field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
|
||||
}
|
||||
// Metadata key space ends at 0x7F so it's fine to directly convert it to i128.
|
||||
Self::from_i128(i128::from_be_bytes(*key))
|
||||
}
|
||||
|
||||
/// Encode a metadata key to a storage key.
|
||||
@@ -68,17 +61,6 @@ impl Key {
|
||||
Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
|
||||
}
|
||||
|
||||
/// Extract a metadata key to a writer. The result should always be 16 bytes.
|
||||
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
|
||||
writer.put_u8(self.field1);
|
||||
assert!(self.field2 <= 0xFFFF);
|
||||
writer.put_u16(self.field2 as u16);
|
||||
writer.put_u32(self.field3);
|
||||
writer.put_u32(self.field4);
|
||||
writer.put_u8(self.field5);
|
||||
writer.put_u32(self.field6);
|
||||
}
|
||||
|
||||
/// Get the range of metadata keys.
|
||||
pub const fn metadata_key_range() -> Range<Self> {
|
||||
Key {
|
||||
@@ -121,7 +103,7 @@ impl Key {
|
||||
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
||||
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
||||
pub fn to_i128(&self) -> i128 {
|
||||
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
|
||||
assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
|
||||
(((self.field1 & 0x7F) as i128) << 120)
|
||||
| (((self.field2 & 0xFFFF) as i128) << 104)
|
||||
| ((self.field3 as i128) << 72)
|
||||
@@ -175,7 +157,7 @@ impl Key {
|
||||
}
|
||||
|
||||
/// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Use [`Key::from_metadata_key`] instead.
|
||||
/// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys).
|
||||
pub fn from_slice(b: &[u8]) -> Self {
|
||||
Key {
|
||||
field1: b[0],
|
||||
@@ -188,7 +170,7 @@ impl Key {
|
||||
}
|
||||
|
||||
/// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Use [`Key::extract_metadata_key_to_writer`] instead.
|
||||
/// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys).
|
||||
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
|
||||
buf[0] = self.field1;
|
||||
BE::write_u32(&mut buf[1..5], self.field2);
|
||||
@@ -399,7 +381,14 @@ pub fn rel_size_to_key(rel: RelTag) -> Key {
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: 0xffffffff,
|
||||
field6: 0xffff_ffff,
|
||||
}
|
||||
}
|
||||
|
||||
impl Key {
|
||||
#[inline(always)]
|
||||
pub fn is_rel_size_key(&self) -> bool {
|
||||
self.field1 == 0 && self.field6 == u32::MAX
|
||||
}
|
||||
}
|
||||
|
||||
@@ -440,6 +429,25 @@ pub fn slru_dir_to_key(kind: SlruKind) -> Key {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slru_dir_kind(key: &Key) -> Option<Result<SlruKind, u32>> {
|
||||
if key.field1 == 0x01
|
||||
&& key.field3 == 0
|
||||
&& key.field4 == 0
|
||||
&& key.field5 == 0
|
||||
&& key.field6 == 0
|
||||
{
|
||||
match key.field2 {
|
||||
0 => Some(Ok(SlruKind::Clog)),
|
||||
1 => Some(Ok(SlruKind::MultiXactMembers)),
|
||||
2 => Some(Ok(SlruKind::MultiXactOffsets)),
|
||||
x => Some(Err(x)),
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
|
||||
Key {
|
||||
@@ -468,7 +476,17 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 0,
|
||||
field6: 0xffffffff,
|
||||
field6: 0xffff_ffff,
|
||||
}
|
||||
}
|
||||
|
||||
impl Key {
|
||||
pub fn is_slru_segment_size_key(&self) -> bool {
|
||||
self.field1 == 0x01
|
||||
&& self.field2 < 0x03
|
||||
&& self.field3 == 0x01
|
||||
&& self.field5 == 0
|
||||
&& self.field6 == u32::MAX
|
||||
}
|
||||
}
|
||||
|
||||
@@ -577,73 +595,78 @@ pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
|
||||
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
|
||||
pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
|
||||
|
||||
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
||||
// we don't preserve these on a branch because safekeepers can't follow timeline
|
||||
// switch (and generally it likely should be optional), so ignore these.
|
||||
#[inline(always)]
|
||||
pub fn is_inherited_key(key: Key) -> bool {
|
||||
!NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key)
|
||||
}
|
||||
impl Key {
|
||||
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
||||
// we don't preserve these on a branch because safekeepers can't follow timeline
|
||||
// switch (and generally it likely should be optional), so ignore these.
|
||||
#[inline(always)]
|
||||
pub fn is_inherited_key(self) -> bool {
|
||||
!NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_fsm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn is_rel_fsm_block_key(self) -> bool {
|
||||
self.field1 == 0x00
|
||||
&& self.field4 != 0
|
||||
&& self.field5 == FSM_FORKNUM
|
||||
&& self.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_vm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00
|
||||
&& key.field4 != 0
|
||||
&& key.field5 == VISIBILITYMAP_FORKNUM
|
||||
&& key.field6 != 0xffffffff
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn is_rel_vm_block_key(self) -> bool {
|
||||
self.field1 == 0x00
|
||||
&& self.field4 != 0
|
||||
&& self.field5 == VISIBILITYMAP_FORKNUM
|
||||
&& self.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x01 => {
|
||||
let kind = match key.field2 {
|
||||
0x00 => SlruKind::Clog,
|
||||
0x01 => SlruKind::MultiXactMembers,
|
||||
0x02 => SlruKind::MultiXactOffsets,
|
||||
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
|
||||
};
|
||||
let segno = key.field4;
|
||||
let blknum = key.field6;
|
||||
#[inline(always)]
|
||||
pub fn to_slru_block(self) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
|
||||
Ok(match self.field1 {
|
||||
0x01 => {
|
||||
let kind = match self.field2 {
|
||||
0x00 => SlruKind::Clog,
|
||||
0x01 => SlruKind::MultiXactMembers,
|
||||
0x02 => SlruKind::MultiXactOffsets,
|
||||
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", self.field2),
|
||||
};
|
||||
let segno = self.field4;
|
||||
let blknum = self.field6;
|
||||
|
||||
(kind, segno, blknum)
|
||||
}
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
})
|
||||
}
|
||||
(kind, segno, blknum)
|
||||
}
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
|
||||
})
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_slru_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x01 // SLRU-related
|
||||
&& key.field3 == 0x00000001 // but not SlruDir
|
||||
&& key.field6 != 0xffffffff // and not SlruSegSize
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn is_slru_block_key(self) -> bool {
|
||||
self.field1 == 0x01 // SLRU-related
|
||||
&& self.field3 == 0x00000001 // but not SlruDir
|
||||
&& self.field6 != 0xffffffff // and not SlruSegSize
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_block_key(key: &Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn is_rel_block_key(&self) -> bool {
|
||||
self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
|
||||
#[inline(always)]
|
||||
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x00 => (
|
||||
RelTag {
|
||||
spcnode: key.field2,
|
||||
dbnode: key.field3,
|
||||
relnode: key.field4,
|
||||
forknum: key.field5,
|
||||
},
|
||||
key.field6,
|
||||
),
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
})
|
||||
/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
|
||||
#[inline(always)]
|
||||
pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
Ok(match self.field1 {
|
||||
0x00 => (
|
||||
RelTag {
|
||||
spcnode: self.field2,
|
||||
dbnode: self.field3,
|
||||
relnode: self.field4,
|
||||
forknum: self.field5,
|
||||
},
|
||||
self.field6,
|
||||
),
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for Key {
|
||||
@@ -687,10 +710,15 @@ mod tests {
|
||||
let mut metadata_key = vec![AUX_KEY_PREFIX];
|
||||
metadata_key.extend_from_slice(&[0xFF; 15]);
|
||||
let encoded_key = Key::from_metadata_key(&metadata_key);
|
||||
let mut output_key = Vec::new();
|
||||
encoded_key.extract_metadata_key_to_writer(&mut output_key);
|
||||
let output_key = encoded_key.to_i128().to_be_bytes();
|
||||
assert_eq!(metadata_key, output_key);
|
||||
assert!(encoded_key.is_metadata_key());
|
||||
assert!(is_metadata_key_slice(&metadata_key));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_possible_largest_key() {
|
||||
Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF);
|
||||
// TODO: put this key into the system and see if anything breaks.
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
|
||||
use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
|
||||
use postgres_ffi::relfile_utils::forknumber_to_name;
|
||||
use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM};
|
||||
use postgres_ffi::Oid;
|
||||
|
||||
///
|
||||
@@ -68,6 +68,57 @@ impl fmt::Display for RelTag {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ParseRelTagError {
|
||||
#[error("invalid forknum")]
|
||||
InvalidForknum(#[source] std::num::ParseIntError),
|
||||
#[error("missing triplet member {}", .0)]
|
||||
MissingTripletMember(usize),
|
||||
#[error("invalid triplet member {}", .0)]
|
||||
InvalidTripletMember(usize, #[source] std::num::ParseIntError),
|
||||
}
|
||||
|
||||
impl std::str::FromStr for RelTag {
|
||||
type Err = ParseRelTagError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
use ParseRelTagError::*;
|
||||
|
||||
// FIXME: in postgres logs this separator is dot
|
||||
// Example:
|
||||
// could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0
|
||||
// with a regex we could get this more painlessly
|
||||
let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) {
|
||||
Some((t, f)) => {
|
||||
let forknum = forkname_to_number(Some(f));
|
||||
let forknum = if let Ok(f) = forknum {
|
||||
f
|
||||
} else {
|
||||
f.parse::<u8>().map_err(InvalidForknum)?
|
||||
};
|
||||
|
||||
(t, Some(forknum))
|
||||
}
|
||||
None => (s, None),
|
||||
};
|
||||
|
||||
let mut split = triplet
|
||||
.splitn(3, '/')
|
||||
.enumerate()
|
||||
.map(|(i, s)| s.parse::<u32>().map_err(|e| InvalidTripletMember(i, e)));
|
||||
let spcnode = split.next().ok_or(MissingTripletMember(0))??;
|
||||
let dbnode = split.next().ok_or(MissingTripletMember(1))??;
|
||||
let relnode = split.next().ok_or(MissingTripletMember(2))??;
|
||||
|
||||
Ok(RelTag {
|
||||
spcnode,
|
||||
forknum: forknum.unwrap_or(MAIN_FORKNUM),
|
||||
dbnode,
|
||||
relnode,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl RelTag {
|
||||
pub fn to_segfile_name(&self, segno: u32) -> String {
|
||||
let mut name = if self.spcnode == GLOBALTABLESPACE_OID {
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
use std::{ops::RangeInclusive, str::FromStr};
|
||||
|
||||
use crate::{
|
||||
key::{is_rel_block_key, Key},
|
||||
models::ShardParameters,
|
||||
};
|
||||
use crate::{key::Key, models::ShardParameters};
|
||||
use hex::FromHex;
|
||||
use postgres_ffi::relfile_utils::INIT_FORKNUM;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -428,6 +425,12 @@ impl<'de> Deserialize<'de> for TenantShardId {
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardStripeSize(pub u32);
|
||||
|
||||
impl Default for ShardStripeSize {
|
||||
fn default() -> Self {
|
||||
DEFAULT_STRIPE_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
/// Layout version: for future upgrades where we might change how the key->shard mapping works
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardLayout(u8);
|
||||
@@ -666,7 +669,7 @@ fn key_is_shard0(key: &Key) -> bool {
|
||||
// because they must be included in basebackups.
|
||||
let is_initfork = key.field5 == INIT_FORKNUM;
|
||||
|
||||
!is_rel_block_key(key) || is_initfork
|
||||
!key.is_rel_block_key() || is_initfork
|
||||
}
|
||||
|
||||
/// Provide the same result as the function in postgres `hashfn.h` with the same name
|
||||
@@ -713,6 +716,25 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
|
||||
ShardNumber((hash % count.0 as u32) as u8)
|
||||
}
|
||||
|
||||
/// For debugging, while not exposing the internals.
|
||||
#[derive(Debug)]
|
||||
#[allow(unused)] // used by debug formatting by pagectl
|
||||
struct KeyShardingInfo {
|
||||
shard0: bool,
|
||||
shard_number: ShardNumber,
|
||||
}
|
||||
|
||||
pub fn describe(
|
||||
key: &Key,
|
||||
shard_count: ShardCount,
|
||||
stripe_size: ShardStripeSize,
|
||||
) -> impl std::fmt::Debug {
|
||||
KeyShardingInfo {
|
||||
shard0: key_is_shard0(key),
|
||||
shard_number: key_to_shard_number(shard_count, stripe_size, key),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use utils::Hex;
|
||||
|
||||
@@ -7,6 +7,7 @@ license.workspace = true
|
||||
[dependencies]
|
||||
bytes.workspace = true
|
||||
byteorder.workspace = true
|
||||
itertools.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
rand.workspace = true
|
||||
|
||||
@@ -7,8 +7,9 @@ pub mod framed;
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{borrow::Cow, collections::HashMap, fmt, io, str};
|
||||
use std::{borrow::Cow, fmt, io, str};
|
||||
|
||||
// re-export for use in utils pageserver_feedback.rs
|
||||
pub use postgres_protocol::PG_EPOCH;
|
||||
@@ -50,15 +51,37 @@ pub enum FeStartupPacket {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct StartupMessageParamsBuilder {
|
||||
params: BytesMut,
|
||||
}
|
||||
|
||||
impl StartupMessageParamsBuilder {
|
||||
/// Set parameter's value by its name.
|
||||
/// name and value must not contain a \0 byte
|
||||
pub fn insert(&mut self, name: &str, value: &str) {
|
||||
self.params.put(name.as_bytes());
|
||||
self.params.put(&b"\0"[..]);
|
||||
self.params.put(value.as_bytes());
|
||||
self.params.put(&b"\0"[..]);
|
||||
}
|
||||
|
||||
pub fn freeze(self) -> StartupMessageParams {
|
||||
StartupMessageParams {
|
||||
params: self.params.freeze(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct StartupMessageParams {
|
||||
params: HashMap<String, String>,
|
||||
params: Bytes,
|
||||
}
|
||||
|
||||
impl StartupMessageParams {
|
||||
/// Get parameter's value by its name.
|
||||
pub fn get(&self, name: &str) -> Option<&str> {
|
||||
self.params.get(name).map(|s| s.as_str())
|
||||
self.iter().find_map(|(k, v)| (k == name).then_some(v))
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
@@ -112,15 +135,19 @@ impl StartupMessageParams {
|
||||
|
||||
/// Iterate through key-value pairs in an arbitrary order.
|
||||
pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
|
||||
self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
|
||||
let params =
|
||||
std::str::from_utf8(&self.params).expect("should be validated as utf8 already");
|
||||
params.split_terminator('\0').tuples()
|
||||
}
|
||||
|
||||
// This function is mostly useful in tests.
|
||||
#[doc(hidden)]
|
||||
pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
|
||||
Self {
|
||||
params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
|
||||
let mut b = StartupMessageParamsBuilder::default();
|
||||
for (k, v) in pairs {
|
||||
b.insert(k, v)
|
||||
}
|
||||
b.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -345,35 +372,21 @@ impl FeStartupPacket {
|
||||
(major_version, minor_version) => {
|
||||
// StartupMessage
|
||||
|
||||
// Parse pairs of null-terminated strings (key, value).
|
||||
// See `postgres: ProcessStartupPacket, build_startup_packet`.
|
||||
let mut tokens = str::from_utf8(&msg)
|
||||
.map_err(|_e| {
|
||||
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
|
||||
})?
|
||||
.strip_suffix('\0') // drop packet's own null
|
||||
.ok_or_else(|| {
|
||||
ProtocolError::Protocol(
|
||||
"StartupMessage params: missing null terminator".to_string(),
|
||||
)
|
||||
})?
|
||||
.split_terminator('\0');
|
||||
|
||||
let mut params = HashMap::new();
|
||||
while let Some(name) = tokens.next() {
|
||||
let value = tokens.next().ok_or_else(|| {
|
||||
ProtocolError::Protocol(
|
||||
"StartupMessage params: key without value".to_string(),
|
||||
)
|
||||
})?;
|
||||
|
||||
params.insert(name.to_owned(), value.to_owned());
|
||||
}
|
||||
let s = str::from_utf8(&msg).map_err(|_e| {
|
||||
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
|
||||
})?;
|
||||
let s = s.strip_suffix('\0').ok_or_else(|| {
|
||||
ProtocolError::Protocol(
|
||||
"StartupMessage params: missing null terminator".to_string(),
|
||||
)
|
||||
})?;
|
||||
|
||||
FeStartupPacket::StartupMessage {
|
||||
major_version,
|
||||
minor_version,
|
||||
params: StartupMessageParams { params },
|
||||
params: StartupMessageParams {
|
||||
params: msg.slice_ref(s.as_bytes()),
|
||||
},
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -26,13 +26,14 @@ use futures::stream::Stream;
|
||||
use futures_util::StreamExt;
|
||||
use futures_util::TryStreamExt;
|
||||
use http_types::{StatusCode, Url};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
|
||||
use crate::{
|
||||
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
|
||||
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
|
||||
TimeTravelError, TimeoutOrCancel,
|
||||
error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing,
|
||||
ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
|
||||
};
|
||||
|
||||
pub struct AzureBlobStorage {
|
||||
@@ -137,6 +138,8 @@ impl AzureBlobStorage {
|
||||
let mut last_modified = None;
|
||||
let mut metadata = HashMap::new();
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let download = async {
|
||||
let response = builder
|
||||
// convert to concrete Pageable
|
||||
@@ -200,13 +203,22 @@ impl AzureBlobStorage {
|
||||
})
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
let download = tokio::select! {
|
||||
bufs = download => bufs,
|
||||
cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
|
||||
TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
|
||||
TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
|
||||
TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout),
|
||||
TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
|
||||
},
|
||||
}
|
||||
};
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
let outcome = match &download {
|
||||
Ok(_) => AttemptOutcome::Ok,
|
||||
Err(_) => AttemptOutcome::Err,
|
||||
};
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, outcome, started_at);
|
||||
download
|
||||
}
|
||||
|
||||
async fn permit(
|
||||
@@ -340,7 +352,10 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Put, cancel).await?;
|
||||
let kind = RequestKind::Put;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let op = async {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
@@ -364,14 +379,25 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
match fut.await {
|
||||
Ok(Ok(_response)) => Ok(()),
|
||||
Ok(Err(azure)) => Err(azure.into()),
|
||||
Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()),
|
||||
Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
|
||||
}
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
let res = tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
|
||||
}
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
let outcome = match res {
|
||||
Ok(_) => AttemptOutcome::Ok,
|
||||
Err(_) => AttemptOutcome::Err,
|
||||
};
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, outcome, started_at);
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
async fn download(
|
||||
@@ -417,12 +443,13 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
paths: &'a [RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Delete, cancel).await?;
|
||||
let kind = RequestKind::Delete;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let op = async {
|
||||
// TODO batch requests are also not supported by the SDK
|
||||
// TODO batch requests are not supported by the SDK
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1249
|
||||
for path in paths {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
|
||||
|
||||
@@ -447,10 +474,16 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
Ok(())
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
let res = tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
|
||||
}
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
res
|
||||
}
|
||||
|
||||
async fn copy(
|
||||
@@ -459,7 +492,9 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Copy, cancel).await?;
|
||||
let kind = RequestKind::Copy;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let timeout = tokio::time::sleep(self.timeout);
|
||||
|
||||
@@ -503,15 +538,21 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
}
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
let res = tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
|
||||
_ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
|
||||
_ = timeout => {
|
||||
let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
|
||||
let e = e.context(format!("Timeout, last status: {copy_status:?}"));
|
||||
Err(e)
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
res
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
mod azure_blob;
|
||||
mod error;
|
||||
mod local_fs;
|
||||
mod metrics;
|
||||
mod s3_bucket;
|
||||
mod simulate_failures;
|
||||
mod support;
|
||||
@@ -121,8 +122,8 @@ impl RemotePath {
|
||||
self.0.file_name()
|
||||
}
|
||||
|
||||
pub fn join(&self, segment: &Utf8Path) -> Self {
|
||||
Self(self.0.join(segment))
|
||||
pub fn join(&self, path: impl AsRef<Utf8Path>) -> Self {
|
||||
Self(self.0.join(path))
|
||||
}
|
||||
|
||||
pub fn get_path(&self) -> &Utf8PathBuf {
|
||||
|
||||
@@ -15,6 +15,7 @@ pub(crate) enum RequestKind {
|
||||
TimeTravel = 5,
|
||||
}
|
||||
|
||||
use scopeguard::ScopeGuard;
|
||||
use RequestKind::*;
|
||||
|
||||
impl RequestKind {
|
||||
@@ -33,10 +34,10 @@ impl RequestKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct RequestTyped<C>([C; 6]);
|
||||
pub(crate) struct RequestTyped<C>([C; 6]);
|
||||
|
||||
impl<C> RequestTyped<C> {
|
||||
pub(super) fn get(&self, kind: RequestKind) -> &C {
|
||||
pub(crate) fn get(&self, kind: RequestKind) -> &C {
|
||||
&self.0[kind.as_index()]
|
||||
}
|
||||
|
||||
@@ -58,19 +59,19 @@ impl<C> RequestTyped<C> {
|
||||
}
|
||||
|
||||
impl RequestTyped<Histogram> {
|
||||
pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
|
||||
pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
|
||||
self.get(kind).observe(started_at.elapsed().as_secs_f64())
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct PassFailCancelledRequestTyped<C> {
|
||||
pub(crate) struct PassFailCancelledRequestTyped<C> {
|
||||
success: RequestTyped<C>,
|
||||
fail: RequestTyped<C>,
|
||||
cancelled: RequestTyped<C>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(super) enum AttemptOutcome {
|
||||
pub(crate) enum AttemptOutcome {
|
||||
Ok,
|
||||
Err,
|
||||
Cancelled,
|
||||
@@ -86,7 +87,7 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
|
||||
}
|
||||
|
||||
impl AttemptOutcome {
|
||||
pub(super) fn as_str(&self) -> &'static str {
|
||||
pub(crate) fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
AttemptOutcome::Ok => "ok",
|
||||
AttemptOutcome::Err => "err",
|
||||
@@ -96,7 +97,7 @@ impl AttemptOutcome {
|
||||
}
|
||||
|
||||
impl<C> PassFailCancelledRequestTyped<C> {
|
||||
pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
|
||||
pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
|
||||
let target = match outcome {
|
||||
AttemptOutcome::Ok => &self.success,
|
||||
AttemptOutcome::Err => &self.fail,
|
||||
@@ -119,7 +120,7 @@ impl<C> PassFailCancelledRequestTyped<C> {
|
||||
}
|
||||
|
||||
impl PassFailCancelledRequestTyped<Histogram> {
|
||||
pub(super) fn observe_elapsed(
|
||||
pub(crate) fn observe_elapsed(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
outcome: impl Into<AttemptOutcome>,
|
||||
@@ -130,19 +131,44 @@ impl PassFailCancelledRequestTyped<Histogram> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct BucketMetrics {
|
||||
/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`].
|
||||
pub(crate) fn start_counting_cancelled_wait(
|
||||
kind: RequestKind,
|
||||
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
|
||||
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.cancelled_waits
|
||||
.get(kind)
|
||||
.inc()
|
||||
})
|
||||
}
|
||||
|
||||
/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
|
||||
pub(crate) fn start_measuring_requests(
|
||||
kind: RequestKind,
|
||||
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
|
||||
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Cancelled,
|
||||
started_at,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) struct BucketMetrics {
|
||||
/// Full request duration until successful completion, error or cancellation.
|
||||
pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
|
||||
pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
|
||||
/// Total amount of seconds waited on queue.
|
||||
pub(super) wait_seconds: RequestTyped<Histogram>,
|
||||
pub(crate) wait_seconds: RequestTyped<Histogram>,
|
||||
|
||||
/// Track how many semaphore awaits were cancelled per request type.
|
||||
///
|
||||
/// This is in case cancellations are happening more than expected.
|
||||
pub(super) cancelled_waits: RequestTyped<IntCounter>,
|
||||
pub(crate) cancelled_waits: RequestTyped<IntCounter>,
|
||||
|
||||
/// Total amount of deleted objects in batches or single requests.
|
||||
pub(super) deleted_objects_total: IntCounter,
|
||||
pub(crate) deleted_objects_total: IntCounter,
|
||||
}
|
||||
|
||||
impl Default for BucketMetrics {
|
||||
@@ -46,15 +46,16 @@ use utils::backoff;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
|
||||
Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
|
||||
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
error::Cancelled,
|
||||
metrics::{start_counting_cancelled_wait, start_measuring_requests},
|
||||
support::PermitCarrying,
|
||||
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
pub(super) mod metrics;
|
||||
|
||||
use self::metrics::AttemptOutcome;
|
||||
pub(super) use self::metrics::RequestKind;
|
||||
use crate::metrics::AttemptOutcome;
|
||||
pub(super) use crate::metrics::RequestKind;
|
||||
|
||||
/// AWS S3 storage.
|
||||
pub struct S3Bucket {
|
||||
@@ -227,7 +228,7 @@ impl S3Bucket {
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.wait_seconds
|
||||
.observe_elapsed(kind, started_at);
|
||||
|
||||
@@ -248,7 +249,7 @@ impl S3Bucket {
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.wait_seconds
|
||||
.observe_elapsed(kind, started_at);
|
||||
Ok(permit)
|
||||
@@ -287,7 +288,7 @@ impl S3Bucket {
|
||||
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
||||
// an error: we expect to sometimes fetch an object and find it missing,
|
||||
// e.g. when probing for timeline indices.
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Ok,
|
||||
started_at,
|
||||
@@ -295,7 +296,7 @@ impl S3Bucket {
|
||||
return Err(DownloadError::NotFound);
|
||||
}
|
||||
Err(e) => {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
@@ -371,12 +372,12 @@ impl S3Bucket {
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
let resp = resp.context("request deletion")?;
|
||||
metrics::BUCKET_METRICS
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
|
||||
@@ -435,14 +436,14 @@ pin_project_lite::pin_project! {
|
||||
/// Times and tracks the outcome of the request.
|
||||
struct TimedDownload<S> {
|
||||
started_at: std::time::Instant,
|
||||
outcome: metrics::AttemptOutcome,
|
||||
outcome: AttemptOutcome,
|
||||
#[pin]
|
||||
inner: S
|
||||
}
|
||||
|
||||
impl<S> PinnedDrop for TimedDownload<S> {
|
||||
fn drop(mut this: Pin<&mut Self>) {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -451,7 +452,7 @@ impl<S> TimedDownload<S> {
|
||||
fn new(started_at: std::time::Instant, inner: S) -> Self {
|
||||
TimedDownload {
|
||||
started_at,
|
||||
outcome: metrics::AttemptOutcome::Cancelled,
|
||||
outcome: AttemptOutcome::Cancelled,
|
||||
inner,
|
||||
}
|
||||
}
|
||||
@@ -468,8 +469,8 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
|
||||
let res = ready!(this.inner.poll_next(cx));
|
||||
match &res {
|
||||
Some(Ok(_)) => {}
|
||||
Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
|
||||
None => *this.outcome = metrics::AttemptOutcome::Ok,
|
||||
Some(Err(_)) => *this.outcome = AttemptOutcome::Err,
|
||||
None => *this.outcome = AttemptOutcome::Ok,
|
||||
}
|
||||
|
||||
Poll::Ready(res)
|
||||
@@ -543,7 +544,7 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
metrics::BUCKET_METRICS
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &response, started_at);
|
||||
|
||||
@@ -625,7 +626,7 @@ impl RemoteStorage for S3Bucket {
|
||||
if let Ok(inner) = &res {
|
||||
// do not incl. timeouts as errors in metrics but cancellations
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, inner, started_at);
|
||||
}
|
||||
@@ -673,7 +674,7 @@ impl RemoteStorage for S3Bucket {
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
|
||||
@@ -977,28 +978,6 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
|
||||
fn start_counting_cancelled_wait(
|
||||
kind: RequestKind,
|
||||
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
|
||||
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
|
||||
metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
|
||||
})
|
||||
}
|
||||
|
||||
/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
|
||||
fn start_measuring_requests(
|
||||
kind: RequestKind,
|
||||
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
|
||||
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Cancelled,
|
||||
started_at,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
|
||||
struct VerOrDelete {
|
||||
kind: VerOrDeleteKind,
|
||||
|
||||
@@ -19,13 +19,13 @@
|
||||
/// // right: [0x68; 1]
|
||||
/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
|
||||
/// ```
|
||||
#[derive(PartialEq)]
|
||||
pub struct Hex<'a>(pub &'a [u8]);
|
||||
pub struct Hex<S>(pub S);
|
||||
|
||||
impl std::fmt::Debug for Hex<'_> {
|
||||
impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[")?;
|
||||
for (i, c) in self.0.chunks(16).enumerate() {
|
||||
let chunks = self.0.as_ref().chunks(16);
|
||||
for (i, c) in chunks.enumerate() {
|
||||
if i > 0 && !c.is_empty() {
|
||||
writeln!(f, ", ")?;
|
||||
}
|
||||
@@ -36,6 +36,15 @@ impl std::fmt::Debug for Hex<'_> {
|
||||
write!(f, "0x{b:02x}")?;
|
||||
}
|
||||
}
|
||||
write!(f, "; {}]", self.0.len())
|
||||
write!(f, "; {}]", self.0.as_ref().len())
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: AsRef<[u8]>, L: AsRef<[u8]>> PartialEq<Hex<R>> for Hex<L> {
|
||||
fn eq(&self, other: &Hex<R>) -> bool {
|
||||
let left = self.0.as_ref();
|
||||
let right = other.0.as_ref();
|
||||
|
||||
left == right
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ pageserver = { path = ".." }
|
||||
pageserver_api.workspace = true
|
||||
remote_storage = { path = "../../libs/remote_storage" }
|
||||
postgres_ffi.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
|
||||
475
pageserver/ctl/src/key.rs
Normal file
475
pageserver/ctl/src/key.rs
Normal file
@@ -0,0 +1,475 @@
|
||||
use anyhow::Context;
|
||||
use clap::Parser;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
reltag::{BlockNumber, RelTag, SlruKind},
|
||||
shard::{ShardCount, ShardStripeSize},
|
||||
};
|
||||
use std::str::FromStr;
|
||||
|
||||
#[derive(Parser)]
|
||||
pub(super) struct DescribeKeyCommand {
|
||||
/// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum
|
||||
input: Vec<String>,
|
||||
|
||||
/// The number of shards to calculate what Keys placement would be.
|
||||
#[arg(long)]
|
||||
shard_count: Option<CustomShardCount>,
|
||||
|
||||
/// The sharding stripe size.
|
||||
///
|
||||
/// The default is hardcoded. It makes no sense to provide this without providing
|
||||
/// `--shard-count`.
|
||||
#[arg(long, requires = "shard_count")]
|
||||
stripe_size: Option<u32>,
|
||||
}
|
||||
|
||||
/// Sharded shard count without unsharded count, which the actual ShardCount supports.
|
||||
#[derive(Clone, Copy)]
|
||||
pub(super) struct CustomShardCount(std::num::NonZeroU8);
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(super) enum InvalidShardCount {
|
||||
#[error(transparent)]
|
||||
ParsingFailed(#[from] std::num::ParseIntError),
|
||||
#[error("too few shards")]
|
||||
TooFewShards,
|
||||
}
|
||||
|
||||
impl FromStr for CustomShardCount {
|
||||
type Err = InvalidShardCount;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let inner: std::num::NonZeroU8 = s.parse()?;
|
||||
if inner.get() < 2 {
|
||||
Err(InvalidShardCount::TooFewShards)
|
||||
} else {
|
||||
Ok(CustomShardCount(inner))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<CustomShardCount> for ShardCount {
|
||||
fn from(value: CustomShardCount) -> Self {
|
||||
ShardCount::new(value.0.get())
|
||||
}
|
||||
}
|
||||
|
||||
impl DescribeKeyCommand {
|
||||
pub(super) fn execute(self) {
|
||||
let DescribeKeyCommand {
|
||||
input,
|
||||
shard_count,
|
||||
stripe_size,
|
||||
} = self;
|
||||
|
||||
let material = KeyMaterial::try_from(input.as_slice()).unwrap();
|
||||
let kind = material.kind();
|
||||
let key = Key::from(material);
|
||||
|
||||
println!("parsed from {kind}: {key}:");
|
||||
println!();
|
||||
println!("{key:?}");
|
||||
|
||||
macro_rules! kind_query {
|
||||
([$($name:ident),*$(,)?]) => {{[$(kind_query!($name)),*]}};
|
||||
($name:ident) => {{
|
||||
let s: &'static str = stringify!($name);
|
||||
let s = s.strip_prefix("is_").unwrap_or(s);
|
||||
let s = s.strip_suffix("_key").unwrap_or(s);
|
||||
|
||||
#[allow(clippy::needless_borrow)]
|
||||
(s, key.$name())
|
||||
}};
|
||||
}
|
||||
|
||||
// the current characterization is a mess of these boolean queries and separate
|
||||
// "recognization". I think it accurately represents how strictly we model the Key
|
||||
// right now, but could of course be made less confusing.
|
||||
|
||||
let queries = kind_query!([
|
||||
is_rel_block_key,
|
||||
is_rel_vm_block_key,
|
||||
is_rel_fsm_block_key,
|
||||
is_slru_block_key,
|
||||
is_inherited_key,
|
||||
is_rel_size_key,
|
||||
is_slru_segment_size_key,
|
||||
]);
|
||||
|
||||
let recognized_kind = "recognized kind";
|
||||
let metadata_key = "metadata key";
|
||||
let shard_placement = "shard placement";
|
||||
|
||||
let longest = queries
|
||||
.iter()
|
||||
.map(|t| t.0)
|
||||
.chain([recognized_kind, metadata_key, shard_placement])
|
||||
.map(|s| s.len())
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
let colon = 1;
|
||||
let padding = 1;
|
||||
|
||||
for (name, is) in queries {
|
||||
let width = longest - name.len() + colon + padding;
|
||||
println!("{}{:width$}{}", name, ":", is);
|
||||
}
|
||||
|
||||
let width = longest - recognized_kind.len() + colon + padding;
|
||||
println!(
|
||||
"{}{:width$}{:?}",
|
||||
recognized_kind,
|
||||
":",
|
||||
RecognizedKeyKind::new(key),
|
||||
);
|
||||
|
||||
if let Some(shard_count) = shard_count {
|
||||
// seeing the sharding placement might be confusing, so leave it out unless shard
|
||||
// count was given.
|
||||
|
||||
let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default();
|
||||
println!(
|
||||
"# placement with shard_count: {} and stripe_size: {}:",
|
||||
shard_count.0, stripe_size.0
|
||||
);
|
||||
let width = longest - shard_placement.len() + colon + padding;
|
||||
println!(
|
||||
"{}{:width$}{:?}",
|
||||
shard_placement,
|
||||
":",
|
||||
pageserver_api::shard::describe(&key, shard_count.into(), stripe_size)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Hand-wavy "inputs we accept" for a key.
|
||||
#[derive(Debug)]
|
||||
pub(super) enum KeyMaterial {
|
||||
Hex(Key),
|
||||
String(SpanAttributesFromLogs),
|
||||
Split(RelTag, BlockNumber),
|
||||
}
|
||||
|
||||
impl KeyMaterial {
|
||||
fn kind(&self) -> &'static str {
|
||||
match self {
|
||||
KeyMaterial::Hex(_) => "hex",
|
||||
KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<KeyMaterial> for Key {
|
||||
fn from(value: KeyMaterial) -> Self {
|
||||
match value {
|
||||
KeyMaterial::Hex(key) => key,
|
||||
KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum))
|
||||
| KeyMaterial::Split(rt, blocknum) => {
|
||||
pageserver_api::key::rel_block_to_key(rt, blocknum)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsRef<str>> TryFrom<&[S]> for KeyMaterial {
|
||||
type Error = anyhow::Error;
|
||||
|
||||
fn try_from(value: &[S]) -> Result<Self, Self::Error> {
|
||||
match value {
|
||||
[] => anyhow::bail!(
|
||||
"need 1..N positional arguments describing the key, try hex or a log line"
|
||||
),
|
||||
[one] => {
|
||||
let one = one.as_ref();
|
||||
|
||||
let key = Key::from_hex(one).map(KeyMaterial::Hex);
|
||||
|
||||
let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String);
|
||||
|
||||
match (key, attrs) {
|
||||
(Ok(key), _) => Ok(key),
|
||||
(_, Ok(s)) => Ok(s),
|
||||
(Err(e1), Err(e2)) => anyhow::bail!(
|
||||
"failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}"
|
||||
),
|
||||
}
|
||||
}
|
||||
more => {
|
||||
// assume going left to right one of these is a reltag and then we find a blocknum
|
||||
// this works, because we don't have plain numbers at least right after reltag in
|
||||
// logs. for some definition of "works".
|
||||
|
||||
let Some((reltag_at, reltag)) = more
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.enumerate()
|
||||
.find_map(|(i, s)| {
|
||||
s.split_once("rel=")
|
||||
.map(|(_garbage, actual)| actual)
|
||||
.unwrap_or(s)
|
||||
.parse::<RelTag>()
|
||||
.ok()
|
||||
.map(|rt| (i, rt))
|
||||
})
|
||||
else {
|
||||
anyhow::bail!("found no RelTag in arguments");
|
||||
};
|
||||
|
||||
let Some(blocknum) = more
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.skip(reltag_at)
|
||||
.find_map(|s| {
|
||||
s.split_once("blkno=")
|
||||
.map(|(_garbage, actual)| actual)
|
||||
.unwrap_or(s)
|
||||
.parse::<BlockNumber>()
|
||||
.ok()
|
||||
})
|
||||
else {
|
||||
anyhow::bail!("found no blocknum in arguments");
|
||||
};
|
||||
|
||||
Ok(KeyMaterial::Split(reltag, blocknum))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber);
|
||||
|
||||
impl std::str::FromStr for SpanAttributesFromLogs {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
// accept the span separator but do not require or fail if either is missing
|
||||
// "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}"
|
||||
let (_, reltag) = s
|
||||
.split_once("rel=")
|
||||
.ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?;
|
||||
let reltag = reltag.split_whitespace().next().unwrap();
|
||||
|
||||
let (_, blocknum) = s
|
||||
.split_once("blkno=")
|
||||
.ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?;
|
||||
let blocknum = blocknum.split_whitespace().next().unwrap();
|
||||
|
||||
let reltag = reltag
|
||||
.parse()
|
||||
.with_context(|| format!("parse reltag from {reltag:?}"))?;
|
||||
let blocknum = blocknum
|
||||
.parse()
|
||||
.with_context(|| format!("parse blocknum from {blocknum:?}"))?;
|
||||
|
||||
Ok(Self(reltag, blocknum))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[allow(dead_code)] // debug print is used
|
||||
enum RecognizedKeyKind {
|
||||
DbDir,
|
||||
ControlFile,
|
||||
Checkpoint,
|
||||
AuxFilesV1,
|
||||
SlruDir(Result<SlruKind, u32>),
|
||||
RelMap(RelTagish<2>),
|
||||
RelDir(RelTagish<2>),
|
||||
AuxFileV2(Result<AuxFileV2, utils::Hex<[u8; 16]>>),
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[allow(unused)]
|
||||
enum AuxFileV2 {
|
||||
Recognized(&'static str, utils::Hex<[u8; 13]>),
|
||||
OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>),
|
||||
Other(utils::Hex<[u8; 13]>),
|
||||
}
|
||||
|
||||
impl RecognizedKeyKind {
|
||||
fn new(key: Key) -> Option<Self> {
|
||||
use RecognizedKeyKind::{
|
||||
AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir,
|
||||
};
|
||||
|
||||
let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key);
|
||||
|
||||
Some(match key {
|
||||
pageserver_api::key::DBDIR_KEY => DbDir,
|
||||
pageserver_api::key::CONTROLFILE_KEY => ControlFile,
|
||||
pageserver_api::key::CHECKPOINT_KEY => Checkpoint,
|
||||
pageserver_api::key::AUX_FILES_KEY => AuxFilesV1,
|
||||
_ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()),
|
||||
_ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => {
|
||||
RelMap([key.field2, key.field3].into())
|
||||
}
|
||||
_ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => {
|
||||
RelDir([key.field2, key.field3].into())
|
||||
}
|
||||
_ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2(
|
||||
AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())),
|
||||
),
|
||||
_ => return None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl AuxFileV2 {
|
||||
fn new(key: Key) -> Option<AuxFileV2> {
|
||||
const EMPTY_HASH: [u8; 13] = {
|
||||
let mut out = [0u8; 13];
|
||||
let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes();
|
||||
let mut i = 3;
|
||||
while i < 16 {
|
||||
out[i - 3] = hash[i];
|
||||
i += 1;
|
||||
}
|
||||
out
|
||||
};
|
||||
|
||||
let bytes = key.to_i128().to_be_bytes();
|
||||
let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap());
|
||||
|
||||
assert_eq!(EMPTY_HASH.len(), hash.0.len());
|
||||
|
||||
// TODO: we could probably find the preimages for the hashes
|
||||
|
||||
Some(match (bytes[1], bytes[2]) {
|
||||
(1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash),
|
||||
(1, 2) => AuxFileV2::Recognized("pg_logical/snapshots/", hash),
|
||||
(1, 3) if hash.0 == EMPTY_HASH => {
|
||||
AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash)
|
||||
}
|
||||
(2, 1) => AuxFileV2::Recognized("pg_replslot/", hash),
|
||||
(1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash),
|
||||
(0xff, 0xff) => AuxFileV2::Other(hash),
|
||||
_ => return None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Prefix of RelTag, currently only known use cases are the two item versions.
|
||||
///
|
||||
/// Renders like a reltag with `/`, nothing else.
|
||||
struct RelTagish<const N: usize>([u32; N]);
|
||||
|
||||
impl<const N: usize> From<[u32; N]> for RelTagish<N> {
|
||||
fn from(val: [u32; N]) -> Self {
|
||||
RelTagish(val)
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> std::fmt::Debug for RelTagish<N> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
use std::fmt::Write as _;
|
||||
let mut first = true;
|
||||
self.0.iter().try_for_each(|x| {
|
||||
if !first {
|
||||
f.write_char('/')?;
|
||||
}
|
||||
first = false;
|
||||
write!(f, "{}", x)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver::aux_file::encode_aux_file_key;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn hex_is_key_material() {
|
||||
let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap();
|
||||
assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn single_positional_spanalike_is_key_material() {
|
||||
// why is this needed? if you are checking many, then copypaste starts to appeal
|
||||
let strings = [
|
||||
(line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"),
|
||||
(line!(), "rel=1663/208101/2620_fsm blkno=2"),
|
||||
(line!(), "rel=1663/208101/2620.1 blkno=2"),
|
||||
];
|
||||
|
||||
let mut first: Option<Key> = None;
|
||||
|
||||
for (line, example) in strings {
|
||||
let m = KeyMaterial::try_from(&[example][..])
|
||||
.unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
|
||||
let key = Key::from(m);
|
||||
if let Some(first) = first {
|
||||
assert_eq!(first, key);
|
||||
} else {
|
||||
first = Some(key);
|
||||
}
|
||||
}
|
||||
|
||||
// not supporting this is rather accidential, but I think the input parsing is lenient
|
||||
// enough already
|
||||
KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_spanlike_args() {
|
||||
let strings = [
|
||||
(line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]),
|
||||
(line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]),
|
||||
(line!(), &["1663/208101/2620_fsm", "2"][..]),
|
||||
];
|
||||
|
||||
let mut first: Option<Key> = None;
|
||||
|
||||
for (line, example) in strings {
|
||||
let m = KeyMaterial::try_from(example)
|
||||
.unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
|
||||
let key = Key::from(m);
|
||||
if let Some(first) = first {
|
||||
assert_eq!(first, key);
|
||||
} else {
|
||||
first = Some(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn recognized_auxfiles() {
|
||||
use AuxFileV2::*;
|
||||
|
||||
let empty = [
|
||||
0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d,
|
||||
];
|
||||
let foobar = [
|
||||
0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18,
|
||||
];
|
||||
|
||||
#[rustfmt::skip]
|
||||
let examples = [
|
||||
(line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))),
|
||||
(line!(), "pg_logical/snapshots/foobar", Recognized("pg_logical/snapshots/", utils::Hex(foobar))),
|
||||
(line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))),
|
||||
(line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))),
|
||||
(line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))),
|
||||
(line!(), "foobar", Other(utils::Hex(foobar))),
|
||||
];
|
||||
|
||||
for (line, path, expected) in examples {
|
||||
let key = encode_aux_file_key(path);
|
||||
let recognized =
|
||||
AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed"));
|
||||
|
||||
assert_eq!(recognized, expected);
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()),
|
||||
None,
|
||||
"example key has one too few 0 after 6 before 1"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
mod draw_timeline_dir;
|
||||
mod index_part;
|
||||
mod key;
|
||||
mod layer_map_analyzer;
|
||||
mod layers;
|
||||
|
||||
@@ -61,6 +62,8 @@ enum Commands {
|
||||
AnalyzeLayerMap(AnalyzeLayerMapCmd),
|
||||
#[command(subcommand)]
|
||||
Layer(LayerCmd),
|
||||
/// Debug print a hex key found from logs
|
||||
Key(key::DescribeKeyCommand),
|
||||
}
|
||||
|
||||
/// Read and update pageserver metadata file
|
||||
@@ -183,6 +186,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
|
||||
.await?;
|
||||
}
|
||||
Commands::Key(dkc) => dkc.execute(),
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ use utils::lsn::Lsn;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
/// Ingest aux files into the pageserver.
|
||||
#[derive(clap::Parser)]
|
||||
@@ -88,11 +89,17 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
println!("ingested {file_cnt} files");
|
||||
}
|
||||
|
||||
let files = mgmt_api_client
|
||||
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
|
||||
.await?;
|
||||
|
||||
println!("{} files found", files.len());
|
||||
for _ in 0..100 {
|
||||
let start = Instant::now();
|
||||
let files = mgmt_api_client
|
||||
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
|
||||
.await?;
|
||||
println!(
|
||||
"{} files found in {}s",
|
||||
files.len(),
|
||||
start.elapsed().as_secs_f64()
|
||||
);
|
||||
}
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::PagestreamGetPageRequest;
|
||||
|
||||
@@ -187,7 +187,7 @@ async fn main_impl(
|
||||
for r in partitioning.keys.ranges.iter() {
|
||||
let mut i = r.start;
|
||||
while i != r.end {
|
||||
if is_rel_block_key(&i) {
|
||||
if i.is_rel_block_key() {
|
||||
filtered.add_key(i);
|
||||
}
|
||||
i = i.next();
|
||||
@@ -308,9 +308,10 @@ async fn main_impl(
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(is_rel_block_key(&key));
|
||||
let (rel_tag, block_no) =
|
||||
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
||||
assert!(key.is_rel_block_key());
|
||||
let (rel_tag, block_no) = key
|
||||
.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
request_lsn: if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
use anyhow::{anyhow, Context};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use fail::fail_point;
|
||||
use pageserver_api::key::{key_to_slru_block, Key};
|
||||
use pageserver_api::key::Key;
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::time::SystemTime;
|
||||
@@ -170,7 +170,7 @@ where
|
||||
}
|
||||
|
||||
async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
|
||||
let (kind, segno, _) = key_to_slru_block(*key)?;
|
||||
let (kind, segno, _) = key.to_slru_block()?;
|
||||
|
||||
match kind {
|
||||
SlruKind::Clog => {
|
||||
|
||||
@@ -358,7 +358,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
|
||||
// mean the synthetic size worker should terminate.
|
||||
let shutting_down = matches!(
|
||||
e.downcast_ref::<PageReconstructError>(),
|
||||
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
|
||||
Some(PageReconstructError::Cancelled)
|
||||
);
|
||||
|
||||
if !shutting_down {
|
||||
|
||||
@@ -311,7 +311,7 @@ impl DeletionList {
|
||||
result.extend(
|
||||
timeline_layers
|
||||
.into_iter()
|
||||
.map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))),
|
||||
.map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -74,6 +74,7 @@ use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::SpawnMode;
|
||||
@@ -183,9 +184,6 @@ impl From<PageReconstructError> for ApiError {
|
||||
PageReconstructError::Cancelled => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
||||
}
|
||||
PageReconstructError::AncestorStopping(_) => {
|
||||
ApiError::ResourceUnavailable(format!("{pre}").into())
|
||||
}
|
||||
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
|
||||
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
|
||||
}
|
||||
@@ -1813,11 +1811,22 @@ async fn timeline_checkpoint_handler(
|
||||
timeline
|
||||
.freeze_and_flush()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.map_err(|e| {
|
||||
match e {
|
||||
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
|
||||
other => ApiError::InternalServerError(other.into()),
|
||||
|
||||
}
|
||||
})?;
|
||||
timeline
|
||||
.compact(&cancel, flags, &ctx)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
.map_err(|e|
|
||||
match e {
|
||||
CompactionError::ShuttingDown => ApiError::ShuttingDown,
|
||||
CompactionError::Other(e) => ApiError::InternalServerError(e)
|
||||
}
|
||||
)?;
|
||||
|
||||
if wait_until_uploaded {
|
||||
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -66,6 +66,7 @@ use crate::tenant::mgr::GetTenantError;
|
||||
use crate::tenant::mgr::ShardResolveResult;
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
use crate::tenant::mgr::TenantManager;
|
||||
use crate::tenant::timeline::FlushLayerError;
|
||||
use crate::tenant::timeline::WaitLsnError;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
@@ -372,7 +373,7 @@ impl From<WaitLsnError> for PageStreamError {
|
||||
match value {
|
||||
e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
|
||||
WaitLsnError::Shutdown => Self::Shutdown,
|
||||
WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
|
||||
e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -382,7 +383,7 @@ impl From<WaitLsnError> for QueryError {
|
||||
match value {
|
||||
e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
|
||||
WaitLsnError::Shutdown => Self::Shutdown,
|
||||
WaitLsnError::BadState => Self::Reconnect,
|
||||
WaitLsnError::BadState { .. } => Self::Reconnect,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -830,7 +831,10 @@ impl PageServerHandler {
|
||||
// We only want to persist the data, and it doesn't matter if it's in the
|
||||
// shape of deltas or images.
|
||||
info!("flushing layers");
|
||||
timeline.freeze_and_flush().await?;
|
||||
timeline.freeze_and_flush().await.map_err(|e| match e {
|
||||
FlushLayerError::Cancelled => QueryError::Shutdown,
|
||||
other => QueryError::Other(other.into()),
|
||||
})?;
|
||||
|
||||
info!("done");
|
||||
Ok(())
|
||||
|
||||
@@ -17,10 +17,10 @@ use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
|
||||
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
|
||||
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
||||
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
||||
relmap_file_key, slru_block_to_key, slru_dir_to_key, slru_segment_key_range,
|
||||
slru_segment_size_to_key, twophase_file_key, twophase_key_range, AUX_FILES_KEY, CHECKPOINT_KEY,
|
||||
CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
@@ -78,11 +78,19 @@ pub enum LsnForTimestamp {
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum CalculateLogicalSizeError {
|
||||
pub(crate) enum CalculateLogicalSizeError {
|
||||
#[error("cancelled")]
|
||||
Cancelled,
|
||||
|
||||
/// Something went wrong while reading the metadata we use to calculate logical size
|
||||
/// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`]
|
||||
/// in the `From` implementation for this variant.
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
PageRead(PageReconstructError),
|
||||
|
||||
/// Something went wrong deserializing metadata that we read to calculate logical size
|
||||
#[error("decode error: {0}")]
|
||||
Decode(#[from] DeserializeError),
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -107,10 +115,8 @@ impl From<PageReconstructError> for CollectKeySpaceError {
|
||||
impl From<PageReconstructError> for CalculateLogicalSizeError {
|
||||
fn from(pre: PageReconstructError) -> Self {
|
||||
match pre {
|
||||
PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
|
||||
Self::Cancelled
|
||||
}
|
||||
_ => Self::Other(pre.into()),
|
||||
PageReconstructError::Cancelled => Self::Cancelled,
|
||||
_ => Self::PageRead(pre),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -763,7 +769,7 @@ impl Timeline {
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn get_current_logical_size_non_incremental(
|
||||
pub(crate) async fn get_current_logical_size_non_incremental(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -772,7 +778,7 @@ impl Timeline {
|
||||
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
|
||||
let dbdir = DbDirectory::des(&buf)?;
|
||||
|
||||
let mut total_size: u64 = 0;
|
||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
@@ -1552,7 +1558,7 @@ impl<'a> DatadirModification<'a> {
|
||||
self.tline.aux_file_size_estimator.on_add(content.len());
|
||||
new_files.push((path, content));
|
||||
}
|
||||
(None, true) => anyhow::bail!("removing non-existing aux file: {}", path),
|
||||
(None, true) => warn!("removing non-existing aux file: {}", path),
|
||||
}
|
||||
let new_val = aux_file::encode_file_value(&new_files)?;
|
||||
self.put(key, Value::Image(new_val.into()));
|
||||
@@ -1606,8 +1612,7 @@ impl<'a> DatadirModification<'a> {
|
||||
aux_files.dir = Some(dir);
|
||||
}
|
||||
Err(
|
||||
e @ (PageReconstructError::AncestorStopping(_)
|
||||
| PageReconstructError::Cancelled
|
||||
e @ (PageReconstructError::Cancelled
|
||||
| PageReconstructError::AncestorLsnTimeout(_)),
|
||||
) => {
|
||||
// Important that we do not interpret a shutdown error as "not found" and thereby
|
||||
@@ -1679,7 +1684,7 @@ impl<'a> DatadirModification<'a> {
|
||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||
for (key, values) in self.pending_updates.drain() {
|
||||
for (lsn, value) in values {
|
||||
if is_rel_block_key(&key) || is_slru_block_key(key) {
|
||||
if key.is_rel_block_key() || key.is_slru_block_key() {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put(key, lsn, &value, ctx).await?;
|
||||
|
||||
@@ -487,6 +487,33 @@ enum CreateTimelineCause {
|
||||
Delete,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum GcError {
|
||||
// The tenant is shutting down
|
||||
#[error("tenant shutting down")]
|
||||
TenantCancelled,
|
||||
|
||||
// The tenant is shutting down
|
||||
#[error("timeline shutting down")]
|
||||
TimelineCancelled,
|
||||
|
||||
// The tenant is in a state inelegible to run GC
|
||||
#[error("not active")]
|
||||
NotActive,
|
||||
|
||||
// A requested GC cutoff LSN was invalid, for example it tried to move backwards
|
||||
#[error("not active")]
|
||||
BadLsn { why: String },
|
||||
|
||||
// A remote storage error while scheduling updates after compaction
|
||||
#[error(transparent)]
|
||||
Remote(anyhow::Error),
|
||||
|
||||
// If GC was invoked for a particular timeline, this error means it didn't exist
|
||||
#[error("timeline not found")]
|
||||
TimelineNotFound,
|
||||
}
|
||||
|
||||
impl Tenant {
|
||||
/// Yet another helper for timeline initialization.
|
||||
///
|
||||
@@ -1393,6 +1420,36 @@ impl Tenant {
|
||||
Ok(tl)
|
||||
}
|
||||
|
||||
/// Helper for unit tests to create a timeline with some pre-loaded states.
|
||||
#[cfg(test)]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn create_test_timeline_with_layers(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
ctx: &RequestContext,
|
||||
delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
|
||||
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
|
||||
end_lsn: Lsn,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let tline = self
|
||||
.create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
|
||||
.await?;
|
||||
tline.force_advance_lsn(end_lsn);
|
||||
for deltas in delta_layer_desc {
|
||||
tline
|
||||
.force_create_delta_layer(deltas, Some(initdb_lsn), ctx)
|
||||
.await?;
|
||||
}
|
||||
for (lsn, images) in image_layer_desc {
|
||||
tline
|
||||
.force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
|
||||
.await?;
|
||||
}
|
||||
Ok(tline)
|
||||
}
|
||||
|
||||
/// Create a new timeline.
|
||||
///
|
||||
/// Returns the new timeline ID and reference to its Timeline object.
|
||||
@@ -1507,7 +1564,7 @@ impl Tenant {
|
||||
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
|
||||
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => {
|
||||
CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
|
||||
}
|
||||
WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
|
||||
@@ -1575,24 +1632,23 @@ impl Tenant {
|
||||
/// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever
|
||||
/// requires more history to be retained.
|
||||
//
|
||||
pub async fn gc_iteration(
|
||||
pub(crate) async fn gc_iteration(
|
||||
&self,
|
||||
target_timeline_id: Option<TimelineId>,
|
||||
horizon: u64,
|
||||
pitr: Duration,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<GcResult> {
|
||||
) -> Result<GcResult, GcError> {
|
||||
// Don't start doing work during shutdown
|
||||
if let TenantState::Stopping { .. } = self.current_state() {
|
||||
return Ok(GcResult::default());
|
||||
}
|
||||
|
||||
// there is a global allowed_error for this
|
||||
anyhow::ensure!(
|
||||
self.is_active(),
|
||||
"Cannot run GC iteration on inactive tenant"
|
||||
);
|
||||
if !self.is_active() {
|
||||
return Err(GcError::NotActive);
|
||||
}
|
||||
|
||||
{
|
||||
let conf = self.tenant_conf.load();
|
||||
@@ -2760,28 +2816,13 @@ impl Tenant {
|
||||
pitr: Duration,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<GcResult> {
|
||||
) -> Result<GcResult, GcError> {
|
||||
let mut totals: GcResult = Default::default();
|
||||
let now = Instant::now();
|
||||
|
||||
let gc_timelines = match self
|
||||
let gc_timelines = self
|
||||
.refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
|
||||
.await
|
||||
{
|
||||
Ok(result) => result,
|
||||
Err(e) => {
|
||||
if let Some(PageReconstructError::Cancelled) =
|
||||
e.downcast_ref::<PageReconstructError>()
|
||||
{
|
||||
// Handle cancellation
|
||||
totals.elapsed = now.elapsed();
|
||||
return Ok(totals);
|
||||
} else {
|
||||
// Propagate other errors
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
.await?;
|
||||
|
||||
failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
|
||||
|
||||
@@ -2806,7 +2847,19 @@ impl Tenant {
|
||||
// made.
|
||||
break;
|
||||
}
|
||||
let result = timeline.gc().await?;
|
||||
let result = match timeline.gc().await {
|
||||
Err(GcError::TimelineCancelled) => {
|
||||
if target_timeline_id.is_some() {
|
||||
// If we were targetting this specific timeline, surface cancellation to caller
|
||||
return Err(GcError::TimelineCancelled);
|
||||
} else {
|
||||
// A timeline may be shutting down independently of the tenant's lifecycle: we should
|
||||
// skip past this and proceed to try GC on other timelines.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
r => r?,
|
||||
};
|
||||
totals += result;
|
||||
}
|
||||
|
||||
@@ -2819,11 +2872,11 @@ impl Tenant {
|
||||
/// [`Tenant::get_gc_horizon`].
|
||||
///
|
||||
/// This is usually executed as part of periodic gc, but can now be triggered more often.
|
||||
pub async fn refresh_gc_info(
|
||||
pub(crate) async fn refresh_gc_info(
|
||||
&self,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<Arc<Timeline>>> {
|
||||
) -> Result<Vec<Arc<Timeline>>, GcError> {
|
||||
// since this method can now be called at different rates than the configured gc loop, it
|
||||
// might be that these configuration values get applied faster than what it was previously,
|
||||
// since these were only read from the gc task.
|
||||
@@ -2844,7 +2897,7 @@ impl Tenant {
|
||||
pitr: Duration,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<Arc<Timeline>>> {
|
||||
) -> Result<Vec<Arc<Timeline>>, GcError> {
|
||||
// before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for
|
||||
// currently visible timelines.
|
||||
let timelines = self
|
||||
@@ -2881,8 +2934,8 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
if !self.is_active() {
|
||||
anyhow::bail!("shutting down");
|
||||
if !self.is_active() || self.cancel.is_cancelled() {
|
||||
return Err(GcError::TenantCancelled);
|
||||
}
|
||||
|
||||
// grab mutex to prevent new timelines from being created here; avoid doing long operations
|
||||
@@ -2891,19 +2944,19 @@ impl Tenant {
|
||||
|
||||
// Scan all timelines. For each timeline, remember the timeline ID and
|
||||
// the branch point where it was created.
|
||||
let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = {
|
||||
let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let mut all_branchpoints = BTreeSet::new();
|
||||
let timeline_ids = {
|
||||
let timelines = {
|
||||
if let Some(target_timeline_id) = target_timeline_id.as_ref() {
|
||||
if timelines.get(target_timeline_id).is_none() {
|
||||
bail!("gc target timeline does not exist")
|
||||
return Err(GcError::TimelineNotFound);
|
||||
}
|
||||
};
|
||||
|
||||
timelines
|
||||
.iter()
|
||||
.map(|(timeline_id, timeline_entry)| {
|
||||
.map(|(_timeline_id, timeline_entry)| {
|
||||
if let Some(ancestor_timeline_id) =
|
||||
&timeline_entry.get_ancestor_timeline_id()
|
||||
{
|
||||
@@ -2925,33 +2978,28 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
*timeline_id
|
||||
timeline_entry.clone()
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
(all_branchpoints, timeline_ids)
|
||||
(all_branchpoints, timelines)
|
||||
};
|
||||
|
||||
// Ok, we now know all the branch points.
|
||||
// Update the GC information for each timeline.
|
||||
let mut gc_timelines = Vec::with_capacity(timeline_ids.len());
|
||||
for timeline_id in timeline_ids {
|
||||
// Timeline is known to be local and loaded.
|
||||
let timeline = self
|
||||
.get_timeline(timeline_id, false)
|
||||
.with_context(|| format!("Timeline {timeline_id} was not found"))?;
|
||||
|
||||
let mut gc_timelines = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
// If target_timeline is specified, ignore all other timelines
|
||||
if let Some(target_timeline_id) = target_timeline_id {
|
||||
if timeline_id != target_timeline_id {
|
||||
if timeline.timeline_id != target_timeline_id {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let branchpoints: Vec<Lsn> = all_branchpoints
|
||||
.range((
|
||||
Included((timeline_id, Lsn(0))),
|
||||
Included((timeline_id, Lsn(u64::MAX))),
|
||||
Included((timeline.timeline_id, Lsn(0))),
|
||||
Included((timeline.timeline_id, Lsn(u64::MAX))),
|
||||
))
|
||||
.map(|&x| x.1)
|
||||
.collect();
|
||||
@@ -2959,7 +3007,7 @@ impl Tenant {
|
||||
{
|
||||
let mut target = timeline.gc_info.write().unwrap();
|
||||
|
||||
match gc_cutoffs.remove(&timeline_id) {
|
||||
match gc_cutoffs.remove(&timeline.timeline_id) {
|
||||
Some(cutoffs) => {
|
||||
*target = GcInfo {
|
||||
retain_lsns: branchpoints,
|
||||
@@ -2992,17 +3040,53 @@ impl Tenant {
|
||||
&self,
|
||||
src_timeline: &Arc<Timeline>,
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
ancestor_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
|
||||
let tl = self
|
||||
.branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx)
|
||||
.branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx)
|
||||
.await?;
|
||||
tl.set_state(TimelineState::Active);
|
||||
Ok(tl)
|
||||
}
|
||||
|
||||
/// Helper for unit tests to branch a timeline with some pre-loaded states.
|
||||
#[cfg(test)]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn branch_timeline_test_with_layers(
|
||||
&self,
|
||||
src_timeline: &Arc<Timeline>,
|
||||
dst_id: TimelineId,
|
||||
ancestor_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
|
||||
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
|
||||
end_lsn: Lsn,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let tline = self
|
||||
.branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
|
||||
.await?;
|
||||
let ancestor_lsn = if let Some(ancestor_lsn) = ancestor_lsn {
|
||||
ancestor_lsn
|
||||
} else {
|
||||
tline.get_last_record_lsn()
|
||||
};
|
||||
assert!(end_lsn >= ancestor_lsn);
|
||||
tline.force_advance_lsn(end_lsn);
|
||||
for deltas in delta_layer_desc {
|
||||
tline
|
||||
.force_create_delta_layer(deltas, Some(ancestor_lsn), ctx)
|
||||
.await?;
|
||||
}
|
||||
for (lsn, images) in image_layer_desc {
|
||||
tline
|
||||
.force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
|
||||
.await?;
|
||||
}
|
||||
Ok(tline)
|
||||
}
|
||||
|
||||
/// Branch an existing timeline.
|
||||
///
|
||||
/// The caller is responsible for activating the returned timeline.
|
||||
@@ -4154,7 +4238,7 @@ mod tests {
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
}
|
||||
tline.freeze_and_flush().await
|
||||
tline.freeze_and_flush().await.map_err(|e| e.into())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -4308,9 +4392,10 @@ mod tests {
|
||||
|
||||
// This needs to traverse to the parent, and fails.
|
||||
let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
|
||||
assert!(err
|
||||
.to_string()
|
||||
.contains("will not become active. Current state: Broken"));
|
||||
assert!(err.to_string().starts_with(&format!(
|
||||
"Bad state on timeline {}: Broken",
|
||||
tline.timeline_id
|
||||
)));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -6205,75 +6290,36 @@ mod tests {
|
||||
async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
|
||||
let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
|
||||
|
||||
let mut lsn = Lsn(0x20);
|
||||
|
||||
{
|
||||
let mut writer = tline.writer().await;
|
||||
writer
|
||||
.put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
|
||||
tline.freeze_and_flush().await?; // this will create a image layer
|
||||
}
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // delta layers
|
||||
vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
|
||||
Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
|
||||
)
|
||||
.await?;
|
||||
|
||||
let child = tenant
|
||||
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
|
||||
.branch_timeline_test_with_layers(
|
||||
&tline,
|
||||
NEW_TIMELINE_ID,
|
||||
Some(Lsn(0x20)),
|
||||
&ctx,
|
||||
Vec::new(), // delta layers
|
||||
vec![(Lsn(0x30), vec![(base_key_child, test_img("data key 2"))])], // image layers
|
||||
Lsn(0x30),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
lsn.0 += 0x10;
|
||||
|
||||
{
|
||||
let mut writer = child.writer().await;
|
||||
writer
|
||||
.put(
|
||||
base_key_child,
|
||||
lsn,
|
||||
&Value::Image(test_img("data key 2")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
|
||||
child.freeze_and_flush().await?; // this will create a delta
|
||||
|
||||
{
|
||||
// update the partitioning to include the test key space, otherwise they
|
||||
// will be dropped by image layer creation
|
||||
let mut guard = child.partitioning.lock().await;
|
||||
let ((partitioning, _), partition_lsn) = &mut *guard;
|
||||
partitioning
|
||||
.parts
|
||||
.push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key
|
||||
*partition_lsn = lsn;
|
||||
}
|
||||
|
||||
child
|
||||
.compact(
|
||||
&cancel,
|
||||
{
|
||||
let mut set = EnumSet::empty();
|
||||
set.insert(CompactFlags::ForceImageLayerCreation);
|
||||
set
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await?; // force create an image layer for the keys, TODO: check if the image layer is created
|
||||
}
|
||||
|
||||
async fn get_vectored_impl_wrapper(
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
@@ -6295,6 +6341,8 @@ mod tests {
|
||||
}))
|
||||
}
|
||||
|
||||
let lsn = Lsn(0x30);
|
||||
|
||||
// test vectored get on parent timeline
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
|
||||
@@ -6332,94 +6380,42 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
|
||||
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
|
||||
assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // delta layers
|
||||
vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
|
||||
Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
|
||||
)
|
||||
.await?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
|
||||
let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
|
||||
base_key.field1 = AUX_KEY_PREFIX;
|
||||
base_key_child.field1 = AUX_KEY_PREFIX;
|
||||
base_key_nonexist.field1 = AUX_KEY_PREFIX;
|
||||
|
||||
let mut lsn = Lsn(0x20);
|
||||
|
||||
{
|
||||
let mut writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
base_key,
|
||||
lsn,
|
||||
&Value::Image(test_img("metadata key 1")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
|
||||
tline.freeze_and_flush().await?; // this will create an image layer
|
||||
|
||||
tline
|
||||
.compact(
|
||||
&cancel,
|
||||
{
|
||||
let mut set = EnumSet::empty();
|
||||
set.insert(CompactFlags::ForceImageLayerCreation);
|
||||
set.insert(CompactFlags::ForceRepartition);
|
||||
set
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await?; // force create an image layer for metadata keys
|
||||
tenant
|
||||
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
|
||||
.await?;
|
||||
}
|
||||
|
||||
let child = tenant
|
||||
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
|
||||
.branch_timeline_test_with_layers(
|
||||
&tline,
|
||||
NEW_TIMELINE_ID,
|
||||
Some(Lsn(0x20)),
|
||||
&ctx,
|
||||
Vec::new(), // delta layers
|
||||
vec![(
|
||||
Lsn(0x30),
|
||||
vec![(base_key_child, test_img("metadata key 2"))],
|
||||
)], // image layers
|
||||
Lsn(0x30),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
lsn.0 += 0x10;
|
||||
|
||||
{
|
||||
let mut writer = child.writer().await;
|
||||
writer
|
||||
.put(
|
||||
base_key_child,
|
||||
lsn,
|
||||
&Value::Image(test_img("metadata key 2")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
|
||||
child.freeze_and_flush().await?;
|
||||
|
||||
child
|
||||
.compact(
|
||||
&cancel,
|
||||
{
|
||||
let mut set = EnumSet::empty();
|
||||
set.insert(CompactFlags::ForceImageLayerCreation);
|
||||
set.insert(CompactFlags::ForceRepartition);
|
||||
set
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await?; // force create an image layer for metadata keys
|
||||
tenant
|
||||
.gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
|
||||
.await?;
|
||||
}
|
||||
|
||||
async fn get_vectored_impl_wrapper(
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
@@ -6441,6 +6437,8 @@ mod tests {
|
||||
}))
|
||||
}
|
||||
|
||||
let lsn = Lsn(0x30);
|
||||
|
||||
// test vectored get on parent timeline
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
|
||||
@@ -6471,4 +6469,208 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_vectored_impl_wrapper(
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<Bytes>, GetVectoredError> {
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let mut res = tline
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(key..key.next()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
Ok(res.pop_last().map(|(k, v)| {
|
||||
assert_eq!(k, key);
|
||||
v.unwrap()
|
||||
}))
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_reads")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
|
||||
let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
|
||||
|
||||
// We emulate the situation that the compaction algorithm creates an image layer that removes the tombstones
|
||||
// Lsn 0x30 key0, key3, no key1+key2
|
||||
// Lsn 0x20 key1+key2 tomestones
|
||||
// Lsn 0x10 key1 in image, key2 in delta
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
// delta layers
|
||||
vec![
|
||||
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
|
||||
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
|
||||
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
|
||||
],
|
||||
// image layers
|
||||
vec![
|
||||
(Lsn(0x10), vec![(key1, test_img("metadata key 1"))]),
|
||||
(
|
||||
Lsn(0x30),
|
||||
vec![
|
||||
(key0, test_img("metadata key 0")),
|
||||
(key3, test_img("metadata key 3")),
|
||||
],
|
||||
),
|
||||
],
|
||||
Lsn(0x30),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let lsn = Lsn(0x30);
|
||||
let old_lsn = Lsn(0x20);
|
||||
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, key0, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key 0"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, key1, lsn, &ctx).await?,
|
||||
None,
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, key2, lsn, &ctx).await?,
|
||||
None,
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, key1, old_lsn, &ctx).await?,
|
||||
Some(Bytes::new()),
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, key2, old_lsn, &ctx).await?,
|
||||
Some(Bytes::new()),
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, key3, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key 3"))
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_image_creation() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
|
||||
let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
// delta layers
|
||||
vec![
|
||||
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
|
||||
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
|
||||
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
|
||||
vec![
|
||||
(key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
|
||||
(key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
|
||||
],
|
||||
],
|
||||
// image layers
|
||||
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
|
||||
Lsn(0x30),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
tline
|
||||
.compact(
|
||||
&cancel,
|
||||
{
|
||||
let mut flags = EnumSet::new();
|
||||
flags.insert(CompactFlags::ForceImageLayerCreation);
|
||||
flags.insert(CompactFlags::ForceRepartition);
|
||||
flags
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Image layers are created at last_record_lsn
|
||||
let images = tline
|
||||
.inspect_image_layers(Lsn(0x30), &ctx)
|
||||
.await?
|
||||
.into_iter()
|
||||
.filter(|(k, _)| k.is_metadata_key())
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed.
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_empty_image_creation() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
// delta layers
|
||||
vec![
|
||||
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
|
||||
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
|
||||
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
|
||||
],
|
||||
// image layers
|
||||
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
|
||||
Lsn(0x30),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
tline
|
||||
.compact(
|
||||
&cancel,
|
||||
{
|
||||
let mut flags = EnumSet::new();
|
||||
flags.insert(CompactFlags::ForceImageLayerCreation);
|
||||
flags.insert(CompactFlags::ForceRepartition);
|
||||
flags
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Image layers are created at last_record_lsn
|
||||
let images = tline
|
||||
.inspect_image_layers(Lsn(0x30), &ctx)
|
||||
.await?
|
||||
.into_iter()
|
||||
.filter(|(k, _)| k.is_metadata_key())
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,7 +45,7 @@ use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
use crate::tenant::storage_layer::inmemory_layer;
|
||||
use crate::tenant::timeline::ShutdownMode;
|
||||
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
|
||||
use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
@@ -2833,7 +2833,13 @@ pub(crate) async fn immediate_gc(
|
||||
}
|
||||
}
|
||||
|
||||
result.map_err(ApiError::InternalServerError)
|
||||
result.map_err(|e| match e {
|
||||
GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
|
||||
GcError::TimelineNotFound => {
|
||||
ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
|
||||
}
|
||||
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -318,7 +318,7 @@ pub(crate) struct LayerFringe {
|
||||
#[derive(Debug)]
|
||||
struct LayerKeyspace {
|
||||
layer: ReadableLayer,
|
||||
target_keyspace: KeySpace,
|
||||
target_keyspace: Vec<KeySpace>,
|
||||
}
|
||||
|
||||
impl LayerFringe {
|
||||
@@ -336,6 +336,7 @@ impl LayerFringe {
|
||||
};
|
||||
|
||||
let removed = self.layers.remove_entry(&read_desc.layer_id);
|
||||
|
||||
match removed {
|
||||
Some((
|
||||
_,
|
||||
@@ -343,7 +344,15 @@ impl LayerFringe {
|
||||
layer,
|
||||
target_keyspace,
|
||||
},
|
||||
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
|
||||
)) => {
|
||||
let mut keyspace = KeySpaceRandomAccum::new();
|
||||
for ks in target_keyspace {
|
||||
for part in ks.ranges {
|
||||
keyspace.add_range(part);
|
||||
}
|
||||
}
|
||||
Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range))
|
||||
}
|
||||
None => unreachable!("fringe internals are always consistent"),
|
||||
}
|
||||
}
|
||||
@@ -358,7 +367,7 @@ impl LayerFringe {
|
||||
let entry = self.layers.entry(layer_id.clone());
|
||||
match entry {
|
||||
Entry::Occupied(mut entry) => {
|
||||
entry.get_mut().target_keyspace.merge(&keyspace);
|
||||
entry.get_mut().target_keyspace.push(keyspace);
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
self.planned_reads_by_lsn.push(ReadDesc {
|
||||
@@ -367,7 +376,7 @@ impl LayerFringe {
|
||||
});
|
||||
entry.insert(LayerKeyspace {
|
||||
layer,
|
||||
target_keyspace: keyspace,
|
||||
target_keyspace: vec![keyspace],
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -366,7 +366,10 @@ impl Layer {
|
||||
.0
|
||||
.get_or_maybe_download(true, Some(ctx))
|
||||
.await
|
||||
.map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;
|
||||
.map_err(|err| match err {
|
||||
DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
|
||||
other => GetVectoredError::Other(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
|
||||
self.0
|
||||
.access_stats
|
||||
@@ -1158,6 +1161,11 @@ impl LayerInner {
|
||||
let consecutive_failures =
|
||||
1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
if timeline.cancel.is_cancelled() {
|
||||
// If we're shutting down, drop out before logging the error
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
|
||||
|
||||
let backoff = utils::backoff::exponential_backoff_duration_seconds(
|
||||
|
||||
@@ -380,21 +380,28 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
let res = tenant
|
||||
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||
error_run_count + 1,
|
||||
1.0,
|
||||
MAX_BACKOFF_SECS,
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
error!(
|
||||
match res {
|
||||
Ok(_) => {
|
||||
error_run_count = 0;
|
||||
period
|
||||
}
|
||||
Err(crate::tenant::GcError::TenantCancelled) => {
|
||||
return;
|
||||
}
|
||||
Err(e) => {
|
||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||
error_run_count + 1,
|
||||
1.0,
|
||||
MAX_BACKOFF_SECS,
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
|
||||
error!(
|
||||
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
|
||||
);
|
||||
wait_duration
|
||||
} else {
|
||||
error_run_count = 0;
|
||||
period
|
||||
wait_duration
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -102,7 +102,6 @@ use crate::metrics::{
|
||||
};
|
||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
|
||||
@@ -131,14 +130,17 @@ use self::layer_manager::LayerManager;
|
||||
use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
|
||||
use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
|
||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
||||
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
|
||||
use super::{
|
||||
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
|
||||
GcError,
|
||||
};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub(super) enum FlushLoopState {
|
||||
pub(crate) enum FlushLoopState {
|
||||
NotStarted,
|
||||
Running {
|
||||
#[cfg(test)]
|
||||
@@ -496,15 +498,11 @@ pub(crate) enum PageReconstructError {
|
||||
Other(#[from] anyhow::Error),
|
||||
|
||||
#[error("Ancestor LSN wait error: {0}")]
|
||||
AncestorLsnTimeout(#[from] WaitLsnError),
|
||||
AncestorLsnTimeout(WaitLsnError),
|
||||
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
/// The ancestor of this is being stopped
|
||||
#[error("ancestor timeline {0} is being stopped")]
|
||||
AncestorStopping(TimelineId),
|
||||
|
||||
/// An error happened replaying WAL records
|
||||
#[error(transparent)]
|
||||
WalRedo(anyhow::Error),
|
||||
@@ -569,7 +567,7 @@ impl PageReconstructError {
|
||||
match self {
|
||||
Other(_) => false,
|
||||
AncestorLsnTimeout(_) => false,
|
||||
Cancelled | AncestorStopping(_) => true,
|
||||
Cancelled => true,
|
||||
WalRedo(_) => false,
|
||||
MissingKey { .. } => false,
|
||||
}
|
||||
@@ -577,7 +575,7 @@ impl PageReconstructError {
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum CreateImageLayersError {
|
||||
pub(crate) enum CreateImageLayersError {
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
@@ -591,17 +589,35 @@ enum CreateImageLayersError {
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum FlushLayerError {
|
||||
#[derive(thiserror::Error, Debug, Clone)]
|
||||
pub(crate) enum FlushLayerError {
|
||||
/// Timeline cancellation token was cancelled
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
/// We tried to flush a layer while the Timeline is in an unexpected state
|
||||
#[error("cannot flush frozen layers when flush_loop is not running, state is {0:?}")]
|
||||
NotRunning(FlushLoopState),
|
||||
|
||||
// Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush
|
||||
// loop via a watch channel, where we can only borrow it.
|
||||
#[error(transparent)]
|
||||
CreateImageLayersError(CreateImageLayersError),
|
||||
CreateImageLayersError(Arc<CreateImageLayersError>),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
Other(#[from] Arc<anyhow::Error>),
|
||||
}
|
||||
|
||||
impl FlushLayerError {
|
||||
// When crossing from generic anyhow errors to this error type, we explicitly check
|
||||
// for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err.
|
||||
fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self {
|
||||
if timeline.cancel.is_cancelled() {
|
||||
Self::Cancelled
|
||||
} else {
|
||||
Self::Other(Arc::new(err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
@@ -627,17 +643,17 @@ pub(crate) enum GetVectoredError {
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum GetReadyAncestorError {
|
||||
#[error("ancestor timeline {0} is being stopped")]
|
||||
AncestorStopping(TimelineId),
|
||||
|
||||
#[error("Ancestor LSN wait error: {0}")]
|
||||
AncestorLsnTimeout(#[from] WaitLsnError),
|
||||
|
||||
#[error("Bad state on timeline {timeline_id}: {state:?}")]
|
||||
BadState {
|
||||
timeline_id: TimelineId,
|
||||
state: TimelineState,
|
||||
},
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
@@ -672,8 +688,8 @@ pub(crate) enum WaitLsnError {
|
||||
Shutdown,
|
||||
|
||||
// Called on an timeline not in active state or shutting down
|
||||
#[error("Bad state (not active)")]
|
||||
BadState,
|
||||
#[error("Bad timeline state: {0:?}")]
|
||||
BadState(TimelineState),
|
||||
|
||||
// Timeout expired while waiting for LSN to catch up with goal.
|
||||
#[error("{0}")]
|
||||
@@ -696,7 +712,7 @@ impl From<CreateImageLayersError> for FlushLayerError {
|
||||
fn from(e: CreateImageLayersError) -> Self {
|
||||
match e {
|
||||
CreateImageLayersError::Cancelled => FlushLayerError::Cancelled,
|
||||
any => FlushLayerError::CreateImageLayersError(any),
|
||||
any => FlushLayerError::CreateImageLayersError(Arc::new(any)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -736,10 +752,9 @@ impl From<GetReadyAncestorError> for PageReconstructError {
|
||||
fn from(e: GetReadyAncestorError) -> Self {
|
||||
use GetReadyAncestorError::*;
|
||||
match e {
|
||||
AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid),
|
||||
AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err),
|
||||
bad_state @ BadState { .. } => PageReconstructError::Other(anyhow::anyhow!(bad_state)),
|
||||
Cancelled => PageReconstructError::Cancelled,
|
||||
Other(other) => PageReconstructError::Other(other),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1171,9 +1186,7 @@ impl Timeline {
|
||||
|
||||
use PageReconstructError::*;
|
||||
match block {
|
||||
Err(Cancelled | AncestorStopping(_)) => {
|
||||
return Err(GetVectoredError::Cancelled)
|
||||
}
|
||||
Err(Cancelled) => return Err(GetVectoredError::Cancelled),
|
||||
Err(MissingKey(_))
|
||||
if NON_INHERITED_RANGE.contains(&key)
|
||||
|| NON_INHERITED_SPARSE_RANGE.contains(&key) =>
|
||||
@@ -1448,10 +1461,11 @@ impl Timeline {
|
||||
who_is_waiting: WaitLsnWaiter<'_>,
|
||||
ctx: &RequestContext, /* Prepare for use by cancellation */
|
||||
) -> Result<(), WaitLsnError> {
|
||||
if self.cancel.is_cancelled() {
|
||||
let state = self.current_state();
|
||||
if self.cancel.is_cancelled() || matches!(state, TimelineState::Stopping) {
|
||||
return Err(WaitLsnError::Shutdown);
|
||||
} else if !self.is_active() {
|
||||
return Err(WaitLsnError::BadState);
|
||||
} else if !matches!(state, TimelineState::Active) {
|
||||
return Err(WaitLsnError::BadState(state));
|
||||
}
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
@@ -1547,13 +1561,13 @@ impl Timeline {
|
||||
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
|
||||
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
|
||||
pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
|
||||
self.freeze_and_flush0().await
|
||||
}
|
||||
|
||||
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
|
||||
// polluting the span hierarchy.
|
||||
pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> {
|
||||
pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
|
||||
let to_lsn = self.freeze_inmem_layer(false).await;
|
||||
self.flush_frozen_layers_and_wait(to_lsn).await
|
||||
}
|
||||
@@ -2735,11 +2749,6 @@ impl Timeline {
|
||||
self.current_logical_size.initialized.add_permits(1);
|
||||
}
|
||||
|
||||
enum BackgroundCalculationError {
|
||||
Cancelled,
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
let try_once = |attempt: usize| {
|
||||
let background_ctx = &background_ctx;
|
||||
let self_ref = &self;
|
||||
@@ -2757,10 +2766,10 @@ impl Timeline {
|
||||
(Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
|
||||
}
|
||||
_ = self_ref.cancel.cancelled() => {
|
||||
return Err(BackgroundCalculationError::Cancelled);
|
||||
return Err(CalculateLogicalSizeError::Cancelled);
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
return Err(BackgroundCalculationError::Cancelled);
|
||||
return Err(CalculateLogicalSizeError::Cancelled);
|
||||
},
|
||||
() = skip_concurrency_limiter.cancelled() => {
|
||||
// Some action that is part of a end user interaction requested logical size
|
||||
@@ -2787,18 +2796,7 @@ impl Timeline {
|
||||
.await
|
||||
{
|
||||
Ok(calculated_size) => Ok((calculated_size, metrics_guard)),
|
||||
Err(CalculateLogicalSizeError::Cancelled) => {
|
||||
Err(BackgroundCalculationError::Cancelled)
|
||||
}
|
||||
Err(CalculateLogicalSizeError::Other(err)) => {
|
||||
if let Some(PageReconstructError::AncestorStopping(_)) =
|
||||
err.root_cause().downcast_ref()
|
||||
{
|
||||
Err(BackgroundCalculationError::Cancelled)
|
||||
} else {
|
||||
Err(BackgroundCalculationError::Other(err))
|
||||
}
|
||||
}
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -2810,8 +2808,11 @@ impl Timeline {
|
||||
|
||||
match try_once(attempt).await {
|
||||
Ok(res) => return ControlFlow::Continue(res),
|
||||
Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()),
|
||||
Err(BackgroundCalculationError::Other(e)) => {
|
||||
Err(CalculateLogicalSizeError::Cancelled) => return ControlFlow::Break(()),
|
||||
Err(
|
||||
e @ (CalculateLogicalSizeError::Decode(_)
|
||||
| CalculateLogicalSizeError::PageRead(_)),
|
||||
) => {
|
||||
warn!(attempt, "initial size calculation failed: {e:?}");
|
||||
// exponential back-off doesn't make sense at these long intervals;
|
||||
// use fixed retry interval with generous jitter instead
|
||||
@@ -3188,17 +3189,21 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// Recurse into ancestor if needed
|
||||
if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
|
||||
trace!(
|
||||
"going into ancestor {}, cont_lsn is {}",
|
||||
timeline.ancestor_lsn,
|
||||
cont_lsn
|
||||
);
|
||||
if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() {
|
||||
if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
|
||||
trace!(
|
||||
"going into ancestor {}, cont_lsn is {}",
|
||||
timeline.ancestor_lsn,
|
||||
cont_lsn
|
||||
);
|
||||
|
||||
timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
|
||||
timeline = &*timeline_owned;
|
||||
prev_lsn = None;
|
||||
continue 'outer;
|
||||
timeline_owned = timeline
|
||||
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
|
||||
.await?;
|
||||
timeline = &*timeline_owned;
|
||||
prev_lsn = None;
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
|
||||
let guard = timeline.layers.read().await;
|
||||
@@ -3347,10 +3352,10 @@ impl Timeline {
|
||||
break None;
|
||||
}
|
||||
|
||||
// Not fully retrieved but no ancestor timeline.
|
||||
if timeline.ancestor_timeline.is_none() {
|
||||
let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else {
|
||||
// Not fully retrieved but no ancestor timeline.
|
||||
break Some(keyspace);
|
||||
}
|
||||
};
|
||||
|
||||
// Now we see if there are keys covered by the image layer but does not exist in the
|
||||
// image layer, which means that the key does not exist.
|
||||
@@ -3370,7 +3375,7 @@ impl Timeline {
|
||||
// Take the min to avoid reconstructing a page with data newer than request Lsn.
|
||||
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
|
||||
timeline_owned = timeline
|
||||
.get_ready_ancestor_timeline(ctx)
|
||||
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::GetReadyAncestorError)?;
|
||||
timeline = &*timeline_owned;
|
||||
@@ -3542,13 +3547,9 @@ impl Timeline {
|
||||
|
||||
async fn get_ready_ancestor_timeline(
|
||||
&self,
|
||||
ancestor: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, GetReadyAncestorError> {
|
||||
let ancestor = match self.get_ancestor_timeline() {
|
||||
Ok(timeline) => timeline,
|
||||
Err(e) => return Err(GetReadyAncestorError::from(e)),
|
||||
};
|
||||
|
||||
// It's possible that the ancestor timeline isn't active yet, or
|
||||
// is active but hasn't yet caught up to the branch point. Wait
|
||||
// for it.
|
||||
@@ -3576,16 +3577,14 @@ impl Timeline {
|
||||
match ancestor.wait_to_become_active(ctx).await {
|
||||
Ok(()) => {}
|
||||
Err(TimelineState::Stopping) => {
|
||||
return Err(GetReadyAncestorError::AncestorStopping(
|
||||
ancestor.timeline_id,
|
||||
));
|
||||
// If an ancestor is stopping, it means the tenant is stopping: handle this the same as if this timeline was stopping.
|
||||
return Err(GetReadyAncestorError::Cancelled);
|
||||
}
|
||||
Err(state) => {
|
||||
return Err(GetReadyAncestorError::Other(anyhow::anyhow!(
|
||||
"Timeline {} will not become active. Current state: {:?}",
|
||||
ancestor.timeline_id,
|
||||
&state,
|
||||
)));
|
||||
return Err(GetReadyAncestorError::BadState {
|
||||
timeline_id: ancestor.timeline_id,
|
||||
state,
|
||||
});
|
||||
}
|
||||
}
|
||||
ancestor
|
||||
@@ -3594,21 +3593,17 @@ impl Timeline {
|
||||
.map_err(|e| match e {
|
||||
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
|
||||
WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled,
|
||||
e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)),
|
||||
WaitLsnError::BadState(state) => GetReadyAncestorError::BadState {
|
||||
timeline_id: ancestor.timeline_id,
|
||||
state,
|
||||
},
|
||||
})?;
|
||||
|
||||
Ok(ancestor)
|
||||
Ok(ancestor.clone())
|
||||
}
|
||||
|
||||
pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
|
||||
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
|
||||
format!(
|
||||
"Ancestor is missing. Timeline id: {} Ancestor id {:?}",
|
||||
self.timeline_id,
|
||||
self.get_ancestor_timeline_id(),
|
||||
)
|
||||
})?;
|
||||
Ok(Arc::clone(ancestor))
|
||||
pub(crate) fn get_ancestor_timeline(&self) -> Option<Arc<Timeline>> {
|
||||
self.ancestor_timeline.clone()
|
||||
}
|
||||
|
||||
pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
|
||||
@@ -3717,7 +3712,9 @@ impl Timeline {
|
||||
return;
|
||||
}
|
||||
err @ Err(
|
||||
FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
|
||||
FlushLayerError::NotRunning(_)
|
||||
| FlushLayerError::Other(_)
|
||||
| FlushLayerError::CreateImageLayersError(_),
|
||||
) => {
|
||||
error!("could not flush frozen layer: {err:?}");
|
||||
break err.map(|_| ());
|
||||
@@ -3763,7 +3760,10 @@ impl Timeline {
|
||||
/// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
|
||||
/// it means no data will be written between the top of the highest frozen layer and to_lsn,
|
||||
/// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
|
||||
async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> {
|
||||
async fn flush_frozen_layers_and_wait(
|
||||
&self,
|
||||
last_record_lsn: Lsn,
|
||||
) -> Result<(), FlushLayerError> {
|
||||
let mut rx = self.layer_flush_done_tx.subscribe();
|
||||
|
||||
// Increment the flush cycle counter and wake up the flush task.
|
||||
@@ -3774,7 +3774,7 @@ impl Timeline {
|
||||
|
||||
let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
|
||||
if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
|
||||
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
|
||||
return Err(FlushLayerError::NotRunning(flush_loop_state));
|
||||
}
|
||||
|
||||
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
|
||||
@@ -3787,14 +3787,11 @@ impl Timeline {
|
||||
{
|
||||
let (last_result_counter, last_result) = &*rx.borrow();
|
||||
if *last_result_counter >= my_flush_request {
|
||||
if let Err(_err) = last_result {
|
||||
if let Err(err) = last_result {
|
||||
// We already logged the original error in
|
||||
// flush_loop. We cannot propagate it to the caller
|
||||
// here, because it might not be Cloneable
|
||||
anyhow::bail!(
|
||||
"Could not flush frozen layer. Request id: {}",
|
||||
my_flush_request
|
||||
);
|
||||
return Err(err.clone());
|
||||
} else {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -3803,7 +3800,7 @@ impl Timeline {
|
||||
trace!("waiting for flush to complete");
|
||||
tokio::select! {
|
||||
rx_e = rx.changed() => {
|
||||
rx_e?;
|
||||
rx_e.map_err(|_| FlushLayerError::NotRunning(*self.flush_loop_state.lock().unwrap()))?;
|
||||
},
|
||||
// Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
|
||||
// the notification from [`flush_loop`] that it completed.
|
||||
@@ -3875,7 +3872,8 @@ impl Timeline {
|
||||
EnumSet::empty(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e))?;
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
@@ -3899,7 +3897,8 @@ impl Timeline {
|
||||
Some(metadata_keyspace.0.ranges[0].clone()),
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
.await
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e))?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -3926,7 +3925,11 @@ impl Timeline {
|
||||
// Normal case, write out a L0 delta layer file.
|
||||
// `create_delta_layer` will not modify the layer map.
|
||||
// We will remove frozen layer and add delta layer in one atomic operation later.
|
||||
let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else {
|
||||
let Some(layer) = self
|
||||
.create_delta_layer(&frozen_layer, None, ctx)
|
||||
.await
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e))?
|
||||
else {
|
||||
panic!("delta layer cannot be empty if no filter is applied");
|
||||
};
|
||||
(
|
||||
@@ -3959,7 +3962,8 @@ impl Timeline {
|
||||
|
||||
if self.set_disk_consistent_lsn(disk_consistent_lsn) {
|
||||
// Schedule remote uploads that will reflect our new disk_consistent_lsn
|
||||
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
|
||||
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e))?;
|
||||
}
|
||||
// release lock on 'layers'
|
||||
};
|
||||
@@ -4257,7 +4261,7 @@ impl Timeline {
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) {
|
||||
if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() {
|
||||
warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
@@ -4307,6 +4311,7 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
img_range: Range<Key>,
|
||||
mode: ImageLayerCreationMode,
|
||||
start: Key,
|
||||
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
|
||||
assert!(!matches!(mode, ImageLayerCreationMode::Initial));
|
||||
|
||||
@@ -4315,39 +4320,43 @@ impl Timeline {
|
||||
let data = self
|
||||
.get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
let (data, total_kb_retrieved, total_key_retrieved) = {
|
||||
let (data, total_kb_retrieved, total_keys_retrieved) = {
|
||||
let mut new_data = BTreeMap::new();
|
||||
let mut total_kb_retrieved = 0;
|
||||
let mut total_key_retrieved = 0;
|
||||
let mut total_keys_retrieved = 0;
|
||||
for (k, v) in data {
|
||||
let v = v.map_err(CreateImageLayersError::PageReconstructError)?;
|
||||
total_kb_retrieved += KEY_SIZE + v.len();
|
||||
total_key_retrieved += 1;
|
||||
total_keys_retrieved += 1;
|
||||
new_data.insert(k, v);
|
||||
}
|
||||
(new_data, total_kb_retrieved / 1024, total_key_retrieved)
|
||||
(new_data, total_kb_retrieved / 1024, total_keys_retrieved)
|
||||
};
|
||||
let delta_file_accessed = reconstruct_state.get_delta_layers_visited();
|
||||
let delta_files_accessed = reconstruct_state.get_delta_layers_visited();
|
||||
|
||||
let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
|
||||
let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
|
||||
debug!(
|
||||
"generate image layers for metadata keys: trigger_generation={trigger_generation}, \
|
||||
delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
|
||||
total_key_retrieved={total_key_retrieved}"
|
||||
trigger_generation,
|
||||
delta_files_accessed,
|
||||
total_kb_retrieved,
|
||||
total_keys_retrieved,
|
||||
"generate metadata images"
|
||||
);
|
||||
|
||||
if !trigger_generation && mode == ImageLayerCreationMode::Try {
|
||||
return Ok(ImageLayerCreationOutcome {
|
||||
image: None,
|
||||
next_start_key: img_range.end,
|
||||
});
|
||||
}
|
||||
let has_keys = !data.is_empty();
|
||||
let mut wrote_any_image = false;
|
||||
for (k, v) in data {
|
||||
// Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get
|
||||
// considers this situation properly.
|
||||
// if v.is_empty() {
|
||||
// continue;
|
||||
// }
|
||||
if v.is_empty() {
|
||||
// the key has been deleted, it does not need an image
|
||||
// in metadata keyspace, an empty image == tombstone
|
||||
continue;
|
||||
}
|
||||
wrote_any_image = true;
|
||||
|
||||
// No need to handle sharding b/c metadata keys are always on the 0-th shard.
|
||||
|
||||
@@ -4355,16 +4364,26 @@ impl Timeline {
|
||||
// on the normal data path either.
|
||||
image_layer_writer.put_image(k, v, ctx).await?;
|
||||
}
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: if has_keys {
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
Some(image_layer)
|
||||
} else {
|
||||
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
|
||||
None
|
||||
},
|
||||
next_start_key: img_range.end,
|
||||
})
|
||||
|
||||
if wrote_any_image {
|
||||
// Normal path: we have written some data into the new image layer for this
|
||||
// partition, so flush it to disk.
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: Some(image_layer),
|
||||
next_start_key: img_range.end,
|
||||
})
|
||||
} else {
|
||||
// Special case: the image layer may be empty if this is a sharded tenant and the
|
||||
// partition does not cover any keys owned by this shard. In this case, to ensure
|
||||
// we don't leave gaps between image layers, leave `start` where it is, so that the next
|
||||
// layer we write will cover the key range that we just scanned.
|
||||
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: None,
|
||||
next_start_key: start,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all, fields(%lsn, %mode))]
|
||||
@@ -4474,6 +4493,7 @@ impl Timeline {
|
||||
ctx,
|
||||
img_range,
|
||||
mode,
|
||||
start,
|
||||
)
|
||||
.await?;
|
||||
start = next_start_key;
|
||||
@@ -4835,7 +4855,7 @@ impl Timeline {
|
||||
/// Currently, we don't make any attempt at removing unneeded page versions
|
||||
/// within a layer file. We can only remove the whole file if it's fully
|
||||
/// obsolete.
|
||||
pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
|
||||
pub(super) async fn gc(&self) -> Result<GcResult, GcError> {
|
||||
// this is most likely the background tasks, but it might be the spawned task from
|
||||
// immediate_gc
|
||||
let _g = tokio::select! {
|
||||
@@ -4848,7 +4868,7 @@ impl Timeline {
|
||||
|
||||
// Is the timeline being deleted?
|
||||
if self.is_stopping() {
|
||||
anyhow::bail!("timeline is Stopping");
|
||||
return Err(GcError::TimelineCancelled);
|
||||
}
|
||||
|
||||
let (horizon_cutoff, pitr_cutoff, retain_lsns) = {
|
||||
@@ -4906,7 +4926,7 @@ impl Timeline {
|
||||
pitr_cutoff: Lsn,
|
||||
retain_lsns: Vec<Lsn>,
|
||||
new_gc_cutoff: Lsn,
|
||||
) -> anyhow::Result<GcResult> {
|
||||
) -> Result<GcResult, GcError> {
|
||||
// FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc
|
||||
|
||||
let now = SystemTime::now();
|
||||
@@ -4928,12 +4948,15 @@ impl Timeline {
|
||||
// The GC cutoff should only ever move forwards.
|
||||
let waitlist = {
|
||||
let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
|
||||
ensure!(
|
||||
*write_guard <= new_gc_cutoff,
|
||||
"Cannot move GC cutoff LSN backwards (was {}, new {})",
|
||||
*write_guard,
|
||||
new_gc_cutoff
|
||||
);
|
||||
if *write_guard > new_gc_cutoff {
|
||||
return Err(GcError::BadLsn {
|
||||
why: format!(
|
||||
"Cannot move GC cutoff LSN backwards (was {}, new {})",
|
||||
*write_guard, new_gc_cutoff
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
write_guard.store_and_unlock(new_gc_cutoff)
|
||||
};
|
||||
waitlist.wait().await;
|
||||
@@ -5042,7 +5065,14 @@ impl Timeline {
|
||||
// This unconditionally schedules also an index_part.json update, even though, we will
|
||||
// be doing one a bit later with the unlinked gc'd layers.
|
||||
let disk_consistent_lsn = self.disk_consistent_lsn.load();
|
||||
self.schedule_uploads(disk_consistent_lsn, None)?;
|
||||
self.schedule_uploads(disk_consistent_lsn, None)
|
||||
.map_err(|e| {
|
||||
if self.cancel.is_cancelled() {
|
||||
GcError::TimelineCancelled
|
||||
} else {
|
||||
GcError::Remote(e)
|
||||
}
|
||||
})?;
|
||||
|
||||
let gc_layers = layers_to_remove
|
||||
.iter()
|
||||
@@ -5051,7 +5081,15 @@ impl Timeline {
|
||||
|
||||
result.layers_removed = gc_layers.len() as u64;
|
||||
|
||||
self.remote_client.schedule_gc_update(&gc_layers)?;
|
||||
self.remote_client
|
||||
.schedule_gc_update(&gc_layers)
|
||||
.map_err(|e| {
|
||||
if self.cancel.is_cancelled() {
|
||||
GcError::TimelineCancelled
|
||||
} else {
|
||||
GcError::Remote(e)
|
||||
}
|
||||
})?;
|
||||
|
||||
guard.finish_gc_timeline(&gc_layers);
|
||||
|
||||
@@ -5066,7 +5104,7 @@ impl Timeline {
|
||||
result.layers_removed, new_gc_cutoff
|
||||
);
|
||||
|
||||
result.elapsed = now.elapsed()?;
|
||||
result.elapsed = now.elapsed().unwrap_or(Duration::ZERO);
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
@@ -5358,6 +5396,133 @@ impl Timeline {
|
||||
shard_count: self.tenant_shard_id.shard_count,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(super) fn force_advance_lsn(self: &Arc<Timeline>, new_lsn: Lsn) {
|
||||
self.last_record_lsn.advance(new_lsn);
|
||||
}
|
||||
|
||||
/// Force create an image layer and place it into the layer map.
|
||||
///
|
||||
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
|
||||
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn force_create_image_layer(
|
||||
self: &Arc<Timeline>,
|
||||
lsn: Lsn,
|
||||
mut images: Vec<(Key, Bytes)>,
|
||||
check_start_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
assert!(
|
||||
lsn <= last_record_lsn,
|
||||
"advance last record lsn before inserting a layer, lsn={lsn}, last_record_lsn={last_record_lsn}"
|
||||
);
|
||||
if let Some(check_start_lsn) = check_start_lsn {
|
||||
assert!(lsn >= check_start_lsn);
|
||||
}
|
||||
images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb));
|
||||
let min_key = *images.first().map(|(k, _)| k).unwrap();
|
||||
let max_key = images.last().map(|(k, _)| k).unwrap().next();
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&(min_key..max_key),
|
||||
lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
for (key, img) in images {
|
||||
image_layer_writer.put_image(key, img, ctx).await?;
|
||||
}
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.force_insert_layer(image_layer);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Force create a delta layer and place it into the layer map.
|
||||
///
|
||||
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
|
||||
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn force_create_delta_layer(
|
||||
self: &Arc<Timeline>,
|
||||
mut deltas: Vec<(Key, Lsn, Value)>,
|
||||
check_start_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
|
||||
let min_key = *deltas.first().map(|(k, _, _)| k).unwrap();
|
||||
let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next();
|
||||
let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
|
||||
let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
|
||||
assert!(
|
||||
max_lsn <= last_record_lsn,
|
||||
"advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}"
|
||||
);
|
||||
let end_lsn = Lsn(max_lsn.0 + 1);
|
||||
if let Some(check_start_lsn) = check_start_lsn {
|
||||
assert!(min_lsn >= check_start_lsn);
|
||||
}
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
min_key,
|
||||
min_lsn..end_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
for (key, lsn, val) in deltas {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?;
|
||||
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.force_insert_layer(delta_layer);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return all keys at the LSN in the image layers
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn inspect_image_layers(
|
||||
self: &Arc<Timeline>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Bytes)>> {
|
||||
let mut all_data = Vec::new();
|
||||
let guard = self.layers.read().await;
|
||||
for layer in guard.layer_map().iter_historic_layers() {
|
||||
if !layer.is_delta() && layer.image_layer_lsn() == lsn {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
let mut reconstruct_data = ValuesReconstructState::default();
|
||||
layer
|
||||
.get_values_reconstruct_data(
|
||||
KeySpace::single(Key::MIN..Key::MAX),
|
||||
lsn..Lsn(lsn.0 + 1),
|
||||
&mut reconstruct_data,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
for (k, v) in reconstruct_data.keys {
|
||||
all_data.push((k, v?.img.unwrap().1));
|
||||
}
|
||||
}
|
||||
}
|
||||
all_data.sort();
|
||||
Ok(all_data)
|
||||
}
|
||||
}
|
||||
|
||||
type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::{layer_manager::LayerManager, Timeline};
|
||||
use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
task_mgr::TaskKind,
|
||||
@@ -23,7 +23,7 @@ pub(crate) enum Error {
|
||||
#[error("shutting down, please retry later")]
|
||||
ShuttingDown,
|
||||
#[error("flushing failed")]
|
||||
FlushAncestor(#[source] anyhow::Error),
|
||||
FlushAncestor(#[source] FlushLayerError),
|
||||
#[error("layer download failed")]
|
||||
RewrittenDeltaDownloadFailed(#[source] anyhow::Error),
|
||||
#[error("copying LSN prefix locally failed")]
|
||||
|
||||
@@ -255,6 +255,13 @@ impl LayerManager {
|
||||
updates.flush()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
|
||||
updates.flush()
|
||||
}
|
||||
|
||||
/// Helper function to insert a layer into the layer map and file manager.
|
||||
fn insert_historic_layer(
|
||||
layer: Layer,
|
||||
|
||||
@@ -34,7 +34,6 @@ use crate::repository::Key;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::key::key_to_rel_block;
|
||||
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::sync::Arc;
|
||||
@@ -208,7 +207,7 @@ impl PostgresRedoManager {
|
||||
) -> anyhow::Result<Bytes> {
|
||||
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
|
||||
|
||||
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
|
||||
let (rel, blknum) = key.to_rel_block().context("invalid record")?;
|
||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||
let mut n_attempts = 0u32;
|
||||
loop {
|
||||
|
||||
@@ -3,7 +3,7 @@ use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
|
||||
@@ -48,7 +48,7 @@ pub(crate) fn apply_in_neon(
|
||||
flags,
|
||||
} => {
|
||||
// sanity check that this is modifying the correct relation
|
||||
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
|
||||
let (rel, blknum) = key.to_rel_block().context("invalid record")?;
|
||||
assert!(
|
||||
rel.forknum == VISIBILITYMAP_FORKNUM,
|
||||
"ClearVisibilityMapFlags record on unexpected rel {}",
|
||||
@@ -85,7 +85,7 @@ pub(crate) fn apply_in_neon(
|
||||
// Non-relational WAL records are handled here, with custom code that has the
|
||||
// same effects as the corresponding Postgres WAL redo function.
|
||||
NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
|
||||
let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
|
||||
let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::Clog,
|
||||
@@ -130,7 +130,7 @@ pub(crate) fn apply_in_neon(
|
||||
}
|
||||
}
|
||||
NeonWalRecord::ClogSetAborted { xids } => {
|
||||
let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
|
||||
let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::Clog,
|
||||
@@ -160,7 +160,7 @@ pub(crate) fn apply_in_neon(
|
||||
}
|
||||
}
|
||||
NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
|
||||
let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
|
||||
let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::MultiXactOffsets,
|
||||
@@ -192,7 +192,7 @@ pub(crate) fn apply_in_neon(
|
||||
LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
|
||||
}
|
||||
NeonWalRecord::MultixactMembersCreate { moff, members } => {
|
||||
let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
|
||||
let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::MultiXactMembers,
|
||||
|
||||
@@ -184,8 +184,8 @@ NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, Ti
|
||||
}
|
||||
else if (state->wre_errno == ENOENT)
|
||||
{
|
||||
nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
|
||||
LSN_FORMAT_ARGS(startptr));
|
||||
nwr_log(LOG, "local read at %X/%X len %zu failed as segment file doesn't exist, attempting remote",
|
||||
LSN_FORMAT_ARGS(startptr), count);
|
||||
return NeonWALReadRemote(state, buf, startptr, count, tli);
|
||||
}
|
||||
else
|
||||
@@ -614,6 +614,7 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
|
||||
uint32 startoff;
|
||||
int segbytes;
|
||||
int readbytes;
|
||||
XLogSegNo lastRemovedSegNo;
|
||||
|
||||
startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
|
||||
|
||||
@@ -689,6 +690,23 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Recheck that the segment hasn't been removed while we were reading
|
||||
* it.
|
||||
*/
|
||||
lastRemovedSegNo = XLogGetLastRemovedSegno();
|
||||
if (state->seg.ws_segno <= lastRemovedSegNo)
|
||||
{
|
||||
char fname[MAXFNAMELEN];
|
||||
|
||||
state->wre_errno = ENOENT;
|
||||
|
||||
XLogFileName(fname, tli, state->seg.ws_segno, state->segcxt.ws_segsize);
|
||||
snprintf(state->err_msg, sizeof(state->err_msg), "WAL segment %s has been removed during the read, lastRemovedSegNo " UINT64_FORMAT,
|
||||
fname, lastRemovedSegNo);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Update state for read */
|
||||
recptr += readbytes;
|
||||
nbytes -= readbytes;
|
||||
|
||||
@@ -295,10 +295,16 @@ extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum);
|
||||
/* utils for neon relsize cache */
|
||||
extern void relsize_hash_init(void);
|
||||
extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size);
|
||||
extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
|
||||
extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber new_size);
|
||||
extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
|
||||
extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);
|
||||
|
||||
extern void start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum);
|
||||
extern bool is_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum);
|
||||
extern void stop_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum);
|
||||
|
||||
|
||||
|
||||
/* functions for local file cache */
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
@@ -10,10 +10,6 @@
|
||||
* Temporary and unlogged tables are stored locally, by md.c. The functions
|
||||
* here just pass the calls through to corresponding md.c functions.
|
||||
*
|
||||
* Index build operations that use the buffer cache are also handled locally,
|
||||
* just like unlogged tables. Such operations must be marked by calling
|
||||
* smgr_start_unlogged_build() and friends.
|
||||
*
|
||||
* In order to know what relations are permanent and which ones are not, we
|
||||
* have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set
|
||||
* by smgropen() callers, when they have the relcache entry at hand. However,
|
||||
@@ -64,6 +60,7 @@
|
||||
#include "storage/fsm_internals.h"
|
||||
#include "storage/md.h"
|
||||
#include "storage/smgr.h"
|
||||
#include "utils/rel.h"
|
||||
|
||||
#include "pagestore_client.h"
|
||||
|
||||
@@ -100,17 +97,7 @@ const int SmgrTrace = DEBUG5;
|
||||
|
||||
page_server_api *page_server;
|
||||
|
||||
/* unlogged relation build states */
|
||||
typedef enum
|
||||
{
|
||||
UNLOGGED_BUILD_NOT_IN_PROGRESS = 0,
|
||||
UNLOGGED_BUILD_PHASE_1,
|
||||
UNLOGGED_BUILD_PHASE_2,
|
||||
UNLOGGED_BUILD_NOT_PERMANENT
|
||||
} UnloggedBuildPhase;
|
||||
|
||||
static SMgrRelation unlogged_build_rel = NULL;
|
||||
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
const PGAlignedBlock zero_buffer;
|
||||
|
||||
static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
|
||||
static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
|
||||
@@ -1402,10 +1389,6 @@ PageIsEmptyHeapPage(char *buffer)
|
||||
return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* A page is being evicted from the shared buffer cache. Update the
|
||||
* last-written LSN of the page, and WAL-log it if needed.
|
||||
*/
|
||||
static void
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
|
||||
@@ -1413,6 +1396,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force)
|
||||
#endif
|
||||
{
|
||||
BlockNumber relsize;
|
||||
XLogRecPtr lsn = PageGetLSN((Page) buffer);
|
||||
bool log_page;
|
||||
|
||||
@@ -1429,13 +1413,28 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
Assert(XLogInsertAllowed());
|
||||
log_page = true;
|
||||
}
|
||||
else if (XLogInsertAllowed() &&
|
||||
!ShutdownRequestPending &&
|
||||
(forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
|
||||
else if (XLogInsertAllowed() && !ShutdownRequestPending)
|
||||
{
|
||||
log_page = true;
|
||||
if (forknum == MAIN_FORKNUM)
|
||||
{
|
||||
if (!PageIsNew((Page) buffer))
|
||||
{
|
||||
if (lsn < FirstNormalUnloggedLSN)
|
||||
{
|
||||
start_unlogged_build(InfoFromSMgrRel(reln), forknum, blocknum);
|
||||
log_page = true;
|
||||
}
|
||||
else if (is_unlogged_build(InfoFromSMgrRel(reln), forknum))
|
||||
{
|
||||
log_page = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
log_page = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (log_page)
|
||||
{
|
||||
XLogRecPtr recptr;
|
||||
@@ -1508,14 +1507,6 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ereport(SmgrTrace,
|
||||
(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||
}
|
||||
|
||||
/*
|
||||
* Remember the LSN on this page. When we read the page again, we must
|
||||
@@ -1524,6 +1515,19 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forknum, blocknum);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if unlogged build is in progress for specified relation
|
||||
* and stop it if so. It is used as callback for log_newpage_range( function
|
||||
* which is called at the end of unlogged build.
|
||||
*/
|
||||
static void
|
||||
neon_log_newpage_range_callback(Relation rel, ForkNumber forknum)
|
||||
{
|
||||
SMgrRelation smgr = RelationGetSmgr(rel);
|
||||
stop_unlogged_build(InfoFromSMgrRel(smgr), forknum);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* neon_init() -- Initialize private state
|
||||
*/
|
||||
@@ -1559,6 +1563,8 @@ neon_init(void)
|
||||
old_redo_read_buffer_filter = redo_read_buffer_filter;
|
||||
redo_read_buffer_filter = neon_redo_read_buffer_filter;
|
||||
|
||||
log_newpage_range_callback = neon_log_newpage_range_callback;
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
mdinit();
|
||||
#endif
|
||||
@@ -2132,6 +2138,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);
|
||||
|
||||
neon_wallog_page(reln, forkNum, blkno, buffer, false);
|
||||
|
||||
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
|
||||
|
||||
lsn = PageGetLSN((Page) buffer);
|
||||
@@ -2167,8 +2174,7 @@ void
|
||||
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
int nblocks, bool skipFsync)
|
||||
{
|
||||
const PGAlignedBlock buffer = {0};
|
||||
int remblocks = nblocks;
|
||||
BlockNumber remblocks = nblocks;
|
||||
XLogRecPtr lsn = 0;
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
@@ -2218,8 +2224,24 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
if (!XLogInsertAllowed())
|
||||
return;
|
||||
|
||||
/* ensure we have enough xlog buffers to log max-sized records */
|
||||
XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);
|
||||
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum + nblocks);
|
||||
|
||||
if (forkNum != MAIN_FORKNUM) /* no need to wal-log zero pages except VM/FSM forks */
|
||||
{
|
||||
/* ensure we have enough xlog buffers to log max-sized records */
|
||||
XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* smgr_extend is often called with an all-zeroes page, so
|
||||
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
|
||||
* later, after it has been initialized with the real page contents, and
|
||||
* it is eventually evicted from the buffer cache. But we need a valid LSN
|
||||
* to the relation metadata update now.
|
||||
*/
|
||||
lsn = GetXLogInsertRecPtr();
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate over all the pages. They are collected into batches of
|
||||
@@ -2230,17 +2252,19 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
{
|
||||
int count = Min(remblocks, XLR_MAX_BLOCK_ID);
|
||||
|
||||
XLogBeginInsert();
|
||||
if (forkNum != MAIN_FORKNUM) /* no need to wal-log zero pages except VM/FSM forks */
|
||||
{
|
||||
XLogBeginInsert();
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i,
|
||||
(char *) buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
|
||||
|
||||
lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);
|
||||
for (int i = 0; i < count; i++)
|
||||
XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i,
|
||||
(char *) zero_buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
|
||||
|
||||
lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);
|
||||
}
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
|
||||
lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, zero_buffer.data);
|
||||
SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forkNum,
|
||||
blocknum + i);
|
||||
}
|
||||
@@ -2252,7 +2276,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
Assert(lsn != 0);
|
||||
|
||||
SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forkNum);
|
||||
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -2519,6 +2542,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
#endif
|
||||
{
|
||||
neon_request_lsns request_lsns;
|
||||
BlockNumber relsize;
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -2939,150 +2963,6 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* neon_start_unlogged_build() -- Starting build operation on a rel.
|
||||
*
|
||||
* Some indexes are built in two phases, by first populating the table with
|
||||
* regular inserts, using the shared buffer cache but skipping WAL-logging,
|
||||
* and WAL-logging the whole relation after it's done. Neon relies on the
|
||||
* WAL to reconstruct pages, so we cannot use the page server in the
|
||||
* first phase when the changes are not logged.
|
||||
*/
|
||||
static void
|
||||
neon_start_unlogged_build(SMgrRelation reln)
|
||||
{
|
||||
/*
|
||||
* Currently, there can be only one unlogged relation build operation in
|
||||
* progress at a time. That's enough for the current usage.
|
||||
*/
|
||||
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
|
||||
neon_log(ERROR, "unlogged relation build is already in progress");
|
||||
Assert(unlogged_build_rel == NULL);
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
neon_log(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_TEMP:
|
||||
case RELPERSISTENCE_UNLOGGED:
|
||||
unlogged_build_rel = reln;
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
|
||||
return;
|
||||
|
||||
default:
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
|
||||
neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
|
||||
|
||||
unlogged_build_rel = reln;
|
||||
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
|
||||
|
||||
/* Make the relation look like it's unlogged */
|
||||
reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
|
||||
|
||||
/*
|
||||
* Create the local file. In a parallel build, the leader is expected to
|
||||
* call this first and do it.
|
||||
*
|
||||
* FIXME: should we pass isRedo true to create the tablespace dir if it
|
||||
* doesn't exist? Is it needed?
|
||||
*/
|
||||
if (!IsParallelWorker())
|
||||
mdcreate(reln, MAIN_FORKNUM, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* neon_finish_unlogged_build_phase_1()
|
||||
*
|
||||
* Call this after you have finished populating a relation in unlogged mode,
|
||||
* before you start WAL-logging it.
|
||||
*/
|
||||
static void
|
||||
neon_finish_unlogged_build_phase_1(SMgrRelation reln)
|
||||
{
|
||||
Assert(unlogged_build_rel == reln);
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
|
||||
|
||||
if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
|
||||
return;
|
||||
|
||||
Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
|
||||
Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
|
||||
|
||||
/*
|
||||
* In a parallel build, (only) the leader process performs the 2nd
|
||||
* phase.
|
||||
*/
|
||||
if (IsParallelWorker())
|
||||
{
|
||||
unlogged_build_rel = NULL;
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
}
|
||||
else
|
||||
unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
|
||||
}
|
||||
|
||||
/*
|
||||
* neon_end_unlogged_build() -- Finish an unlogged rel build.
|
||||
*
|
||||
* Call this after you have finished WAL-logging an relation that was
|
||||
* first populated without WAL-logging.
|
||||
*
|
||||
* This removes the local copy of the rel, since it's now been fully
|
||||
* WAL-logged and is present in the page server.
|
||||
*/
|
||||
static void
|
||||
neon_end_unlogged_build(SMgrRelation reln)
|
||||
{
|
||||
NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);
|
||||
|
||||
Assert(unlogged_build_rel == reln);
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
|
||||
RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
|
||||
|
||||
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
|
||||
{
|
||||
Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
|
||||
Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
|
||||
|
||||
/* Make the relation look permanent again */
|
||||
reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
|
||||
|
||||
/* Remove local copy */
|
||||
rinfob = InfoBFromSMgrRel(reln);
|
||||
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
|
||||
{
|
||||
neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
|
||||
RelFileInfoFmt(InfoFromNInfoB(rinfob)),
|
||||
forknum);
|
||||
|
||||
forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
|
||||
mdclose(reln, forknum);
|
||||
/* use isRedo == true, so that we drop it immediately */
|
||||
mdunlink(rinfob, forknum, true);
|
||||
}
|
||||
}
|
||||
|
||||
unlogged_build_rel = NULL;
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
}
|
||||
|
||||
#define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0)
|
||||
|
||||
static int
|
||||
@@ -3176,40 +3056,6 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
|
||||
return n_blocks;
|
||||
}
|
||||
|
||||
static void
|
||||
AtEOXact_neon(XactEvent event, void *arg)
|
||||
{
|
||||
switch (event)
|
||||
{
|
||||
case XACT_EVENT_ABORT:
|
||||
case XACT_EVENT_PARALLEL_ABORT:
|
||||
|
||||
/*
|
||||
* Forget about any build we might have had in progress. The local
|
||||
* file will be unlinked by smgrDoPendingDeletes()
|
||||
*/
|
||||
unlogged_build_rel = NULL;
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
break;
|
||||
|
||||
case XACT_EVENT_COMMIT:
|
||||
case XACT_EVENT_PARALLEL_COMMIT:
|
||||
case XACT_EVENT_PREPARE:
|
||||
case XACT_EVENT_PRE_COMMIT:
|
||||
case XACT_EVENT_PARALLEL_PRE_COMMIT:
|
||||
case XACT_EVENT_PRE_PREPARE:
|
||||
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
|
||||
{
|
||||
unlogged_build_rel = NULL;
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INTERNAL_ERROR),
|
||||
(errmsg(NEON_TAG "unlogged index build was not properly finished"))));
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static const struct f_smgr neon_smgr =
|
||||
{
|
||||
.smgr_init = neon_init,
|
||||
@@ -3231,10 +3077,6 @@ static const struct f_smgr neon_smgr =
|
||||
.smgr_truncate = neon_truncate,
|
||||
.smgr_immedsync = neon_immedsync,
|
||||
|
||||
.smgr_start_unlogged_build = neon_start_unlogged_build,
|
||||
.smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1,
|
||||
.smgr_end_unlogged_build = neon_end_unlogged_build,
|
||||
|
||||
.smgr_read_slru_segment = neon_read_slru_segment,
|
||||
};
|
||||
|
||||
@@ -3252,8 +3094,6 @@ smgr_neon(BackendId backend, NRelFileInfo rinfo)
|
||||
void
|
||||
smgr_init_neon(void)
|
||||
{
|
||||
RegisterXactCallback(AtEOXact_neon, NULL);
|
||||
|
||||
smgr_init_standard();
|
||||
neon_init();
|
||||
}
|
||||
|
||||
@@ -39,7 +39,8 @@ typedef struct
|
||||
typedef struct
|
||||
{
|
||||
RelTag tag;
|
||||
BlockNumber size;
|
||||
BlockNumber size : 31;
|
||||
BlockNumber unlogged : 1;
|
||||
dlist_node lru_node; /* LRU list node */
|
||||
} RelSizeEntry;
|
||||
|
||||
@@ -117,9 +118,12 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
|
||||
*size = entry->size;
|
||||
relsize_ctl->hits += 1;
|
||||
found = true;
|
||||
/* Move entry to the LRU list tail */
|
||||
dlist_delete(&entry->lru_node);
|
||||
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
|
||||
if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */
|
||||
{
|
||||
/* Move entry to the LRU list tail */
|
||||
dlist_delete(&entry->lru_node);
|
||||
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -130,6 +134,9 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
|
||||
return found;
|
||||
}
|
||||
|
||||
/*
|
||||
* Cache relation size.
|
||||
*/
|
||||
void
|
||||
set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
|
||||
{
|
||||
@@ -148,31 +155,53 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
|
||||
*/
|
||||
while ((entry = hash_search(relsize_hash, &tag, HASH_ENTER_NULL, &found)) == NULL)
|
||||
{
|
||||
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
|
||||
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
|
||||
Assert(relsize_ctl->size > 0);
|
||||
relsize_ctl->size -= 1;
|
||||
if (dlist_is_empty(&relsize_ctl->lru))
|
||||
{
|
||||
elog(FATAL, "No more free relsize cache entries");
|
||||
}
|
||||
else
|
||||
{
|
||||
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
|
||||
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
|
||||
Assert(relsize_ctl->size > 0);
|
||||
relsize_ctl->size -= 1;
|
||||
}
|
||||
}
|
||||
entry->size = size;
|
||||
if (!found)
|
||||
{
|
||||
if (++relsize_ctl->size == relsize_hash_size)
|
||||
entry->unlogged = false;
|
||||
if (relsize_ctl->size+1 == relsize_hash_size)
|
||||
{
|
||||
/*
|
||||
* Remove least recently used elment from the hash.
|
||||
* Hash size after is becomes `relsize_hash_size-1`.
|
||||
* But it is not considered to be a problem, because size of this hash is expecrted large enough and +-1 doesn't matter.
|
||||
*/
|
||||
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
|
||||
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
|
||||
relsize_ctl->size -= 1;
|
||||
if (dlist_is_empty(&relsize_ctl->lru))
|
||||
{
|
||||
elog(FATAL, "No more free relsize cache entries");
|
||||
}
|
||||
else
|
||||
{
|
||||
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
|
||||
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
relsize_ctl->size += 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (entry->unlogged) /* entries of relation involved in unlogged build are pinned */
|
||||
{
|
||||
dlist_delete(&entry->lru_node);
|
||||
}
|
||||
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
|
||||
|
||||
if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */
|
||||
{
|
||||
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
|
||||
}
|
||||
relsize_ctl->writes += 1;
|
||||
LWLockRelease(relsize_lock);
|
||||
}
|
||||
@@ -191,23 +220,42 @@ update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
|
||||
tag.forknum = forknum;
|
||||
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
|
||||
entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
|
||||
if (!found || entry->size < size)
|
||||
if (!found) {
|
||||
entry->unlogged = false;
|
||||
entry->size = size;
|
||||
if (!found)
|
||||
{
|
||||
if (++relsize_ctl->size == relsize_hash_size)
|
||||
|
||||
if (relsize_ctl->size+1 == relsize_hash_size)
|
||||
{
|
||||
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
|
||||
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
|
||||
relsize_ctl->size -= 1;
|
||||
if (dlist_is_empty(&relsize_ctl->lru))
|
||||
{
|
||||
elog(FATAL, "No more free relsize cache entries");
|
||||
}
|
||||
else
|
||||
{
|
||||
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
|
||||
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
relsize_ctl->size += 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
dlist_delete(&entry->lru_node);
|
||||
if (entry->size < size)
|
||||
entry->size = size;
|
||||
|
||||
if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */
|
||||
{
|
||||
dlist_delete(&entry->lru_node);
|
||||
}
|
||||
}
|
||||
relsize_ctl->writes += 1;
|
||||
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
|
||||
if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */
|
||||
{
|
||||
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
|
||||
}
|
||||
LWLockRelease(relsize_lock);
|
||||
}
|
||||
}
|
||||
@@ -225,13 +273,154 @@ forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum)
|
||||
entry = hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);
|
||||
if (entry)
|
||||
{
|
||||
dlist_delete(&entry->lru_node);
|
||||
if (!entry->unlogged)
|
||||
{
|
||||
/* Entried of relations involved in unlogged build are pinned */
|
||||
dlist_delete(&entry->lru_node);
|
||||
}
|
||||
relsize_ctl->size -= 1;
|
||||
}
|
||||
LWLockRelease(relsize_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This function starts unlogged build if it was not yet started.
|
||||
* The criteria for starting iunlogged build is writing page without normal LSN.
|
||||
* It can happen in any backend when page is evicted from shared buffers.
|
||||
* Or can not happen at all if index fits in shared buffers.
|
||||
*/
|
||||
void
|
||||
start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum)
|
||||
{
|
||||
if (relsize_hash_size > 0)
|
||||
{
|
||||
RelTag tag;
|
||||
RelSizeEntry *entry;
|
||||
bool found;
|
||||
bool start = false;
|
||||
|
||||
tag.rinfo = rinfo;
|
||||
tag.forknum = forknum;
|
||||
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
|
||||
entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
|
||||
if (!found) {
|
||||
entry->size = blocknum + 1;
|
||||
start = true;
|
||||
|
||||
if (relsize_ctl->size+1 == relsize_hash_size)
|
||||
{
|
||||
if (dlist_is_empty(&relsize_ctl->lru))
|
||||
{
|
||||
elog(FATAL, "No more free relsize cache entries");
|
||||
}
|
||||
else
|
||||
{
|
||||
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
|
||||
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
relsize_ctl->size += 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
start = !entry->unlogged;
|
||||
|
||||
if (entry->size <= blocknum)
|
||||
{
|
||||
entry->size = blocknum + 1;
|
||||
}
|
||||
|
||||
if (start)
|
||||
{
|
||||
/* relation involved in unlogged build are pinned until the end of the build */
|
||||
dlist_delete(&entry->lru_node);
|
||||
}
|
||||
}
|
||||
entry->unlogged = true;
|
||||
relsize_ctl->writes += 1;
|
||||
|
||||
/*
|
||||
* We are not putting entry in LRU least to prevent it fro eviction until the end of unlogged build
|
||||
*/
|
||||
|
||||
if (start)
|
||||
elog(LOG, "Start unlogged build for %u/%u/%u.%u",
|
||||
RelFileInfoFmt(rinfo), forknum);
|
||||
LWLockRelease(relsize_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if unlogged build is in progress.
|
||||
*/
|
||||
bool
|
||||
is_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum)
|
||||
{
|
||||
bool unlogged = false;
|
||||
|
||||
if (relsize_hash_size > 0)
|
||||
{
|
||||
RelTag tag;
|
||||
RelSizeEntry *entry;
|
||||
|
||||
tag.rinfo = rinfo;
|
||||
tag.forknum = forknum;
|
||||
LWLockAcquire(relsize_lock, LW_SHARED);
|
||||
entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
|
||||
if (entry != NULL)
|
||||
{
|
||||
unlogged = entry->unlogged;
|
||||
relsize_ctl->hits += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
relsize_ctl->misses += 1;
|
||||
}
|
||||
LWLockRelease(relsize_lock);
|
||||
}
|
||||
return unlogged;
|
||||
}
|
||||
|
||||
/*
|
||||
* Clear unlogged build if it was set.
|
||||
*/
|
||||
void
|
||||
stop_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum)
|
||||
{
|
||||
if (relsize_hash_size > 0)
|
||||
{
|
||||
RelTag tag;
|
||||
RelSizeEntry *entry;
|
||||
|
||||
tag.rinfo = rinfo;
|
||||
tag.forknum = forknum;
|
||||
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
|
||||
entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
|
||||
if (entry != NULL)
|
||||
{
|
||||
bool unlogged = entry->unlogged;
|
||||
entry->unlogged = false;
|
||||
relsize_ctl->hits += 1;
|
||||
if (unlogged)
|
||||
{
|
||||
elog(LOG, "Stop unlogged build for %u/%u/%u.%u",
|
||||
RelFileInfoFmt(rinfo), forknum);
|
||||
/* Return entry to the LRU list */
|
||||
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
relsize_ctl->misses += 1;
|
||||
}
|
||||
LWLockRelease(relsize_lock);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
relsize_hash_init(void)
|
||||
{
|
||||
|
||||
@@ -35,7 +35,7 @@ use crate::{
|
||||
},
|
||||
stream, url,
|
||||
};
|
||||
use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName};
|
||||
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
|
||||
|
||||
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
|
||||
pub enum MaybeOwned<'a, T> {
|
||||
|
||||
@@ -100,6 +100,7 @@ pub(super) async fn authenticate(
|
||||
.dbname(&db_info.dbname)
|
||||
.user(&db_info.user);
|
||||
|
||||
ctx.set_dbname(db_info.dbname.into());
|
||||
ctx.set_user(db_info.user.into());
|
||||
ctx.set_project(db_info.aux.clone());
|
||||
info!("woken up a compute node");
|
||||
|
||||
@@ -11,7 +11,6 @@ use crate::{
|
||||
};
|
||||
use itertools::Itertools;
|
||||
use pq_proto::StartupMessageParams;
|
||||
use smol_str::SmolStr;
|
||||
use std::{collections::HashSet, net::IpAddr, str::FromStr};
|
||||
use thiserror::Error;
|
||||
use tracing::{info, warn};
|
||||
@@ -96,13 +95,6 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
let get_param = |key| params.get(key).ok_or(MissingKey(key));
|
||||
let user: RoleName = get_param("user")?.into();
|
||||
|
||||
// record the values if we have them
|
||||
ctx.set_application(params.get("application_name").map(SmolStr::from));
|
||||
ctx.set_user(user.clone());
|
||||
if let Some(dbname) = params.get("database") {
|
||||
ctx.set_dbname(dbname.into());
|
||||
}
|
||||
|
||||
// Project name might be passed via PG's command-line options.
|
||||
let endpoint_option = params
|
||||
.options_raw()
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::{
|
||||
http,
|
||||
metrics::{CacheOutcome, Metrics},
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
scram, EndpointCacheKey, Normalize,
|
||||
scram, EndpointCacheKey,
|
||||
};
|
||||
use crate::{cache::Cached, context::RequestMonitoring};
|
||||
use futures::TryFutureExt;
|
||||
@@ -281,14 +281,6 @@ impl super::Api for Api {
|
||||
return Ok(cached);
|
||||
}
|
||||
|
||||
// check rate limit
|
||||
if !self
|
||||
.wake_compute_endpoint_rate_limiter
|
||||
.check(user_info.endpoint.normalize().into(), 1)
|
||||
{
|
||||
return Err(WakeComputeError::TooManyConnections);
|
||||
}
|
||||
|
||||
let permit = self.locks.get_permit(&key).await?;
|
||||
|
||||
// after getting back a permit - it's possible the cache was filled
|
||||
@@ -301,6 +293,15 @@ impl super::Api for Api {
|
||||
}
|
||||
}
|
||||
|
||||
// check rate limit
|
||||
if !self
|
||||
.wake_compute_endpoint_rate_limiter
|
||||
.check(user_info.endpoint.normalize_intern(), 1)
|
||||
{
|
||||
info!(key = &*key, "found cached compute node info");
|
||||
return Err(WakeComputeError::TooManyConnections);
|
||||
}
|
||||
|
||||
let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?;
|
||||
ctx.set_project(node.aux.clone());
|
||||
let cold_start_info = node.aux.cold_start_info;
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
use chrono::Utc;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pq_proto::StartupMessageParams;
|
||||
use smol_str::SmolStr;
|
||||
use std::net::IpAddr;
|
||||
use tokio::sync::mpsc;
|
||||
@@ -46,6 +47,7 @@ pub struct RequestMonitoring {
|
||||
pub(crate) auth_method: Option<AuthMethod>,
|
||||
success: bool,
|
||||
pub(crate) cold_start_info: ColdStartInfo,
|
||||
pg_options: Option<StartupMessageParams>,
|
||||
|
||||
// extra
|
||||
// This sender is here to keep the request monitoring channel open while requests are taking place.
|
||||
@@ -102,6 +104,7 @@ impl RequestMonitoring {
|
||||
success: false,
|
||||
rejected: None,
|
||||
cold_start_info: ColdStartInfo::Unknown,
|
||||
pg_options: None,
|
||||
|
||||
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
|
||||
disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()),
|
||||
@@ -132,6 +135,18 @@ impl RequestMonitoring {
|
||||
self.latency_timer.cold_start_info(info);
|
||||
}
|
||||
|
||||
pub fn set_db_options(&mut self, options: StartupMessageParams) {
|
||||
self.set_application(options.get("application_name").map(SmolStr::from));
|
||||
if let Some(user) = options.get("user") {
|
||||
self.set_user(user.into());
|
||||
}
|
||||
if let Some(dbname) = options.get("database") {
|
||||
self.set_dbname(dbname.into());
|
||||
}
|
||||
|
||||
self.pg_options = Some(options);
|
||||
}
|
||||
|
||||
pub fn set_project(&mut self, x: MetricsAuxInfo) {
|
||||
if self.endpoint_id.is_none() {
|
||||
self.set_endpoint_id(x.endpoint_id.as_str().into())
|
||||
@@ -155,8 +170,10 @@ impl RequestMonitoring {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_application(&mut self, app: Option<SmolStr>) {
|
||||
self.application = app.or_else(|| self.application.clone());
|
||||
fn set_application(&mut self, app: Option<SmolStr>) {
|
||||
if let Some(app) = app {
|
||||
self.application = Some(app);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_dbname(&mut self, dbname: DbName) {
|
||||
|
||||
@@ -13,7 +13,9 @@ use parquet::{
|
||||
},
|
||||
record::RecordWriter,
|
||||
};
|
||||
use pq_proto::StartupMessageParams;
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use serde::ser::SerializeMap;
|
||||
use tokio::{sync::mpsc, time};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, Span};
|
||||
@@ -87,6 +89,7 @@ pub struct RequestData {
|
||||
database: Option<String>,
|
||||
project: Option<String>,
|
||||
branch: Option<String>,
|
||||
pg_options: Option<String>,
|
||||
auth_method: Option<&'static str>,
|
||||
error: Option<&'static str>,
|
||||
/// Success is counted if we form a HTTP response with sql rows inside
|
||||
@@ -101,6 +104,23 @@ pub struct RequestData {
|
||||
disconnect_timestamp: Option<chrono::NaiveDateTime>,
|
||||
}
|
||||
|
||||
struct Options<'a> {
|
||||
options: &'a StartupMessageParams,
|
||||
}
|
||||
|
||||
impl<'a> serde::Serialize for Options<'a> {
|
||||
fn serialize<S>(&self, s: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
let mut state = s.serialize_map(None)?;
|
||||
for (k, v) in self.options.iter() {
|
||||
state.serialize_entry(k, v)?;
|
||||
}
|
||||
state.end()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&RequestMonitoring> for RequestData {
|
||||
fn from(value: &RequestMonitoring) -> Self {
|
||||
Self {
|
||||
@@ -113,6 +133,10 @@ impl From<&RequestMonitoring> for RequestData {
|
||||
database: value.dbname.as_deref().map(String::from),
|
||||
project: value.project.as_deref().map(String::from),
|
||||
branch: value.branch.as_deref().map(String::from),
|
||||
pg_options: value
|
||||
.pg_options
|
||||
.as_ref()
|
||||
.and_then(|options| serde_json::to_string(&Options { options }).ok()),
|
||||
auth_method: value.auth_method.as_ref().map(|x| match x {
|
||||
super::AuthMethod::Web => "web",
|
||||
super::AuthMethod::ScramSha256 => "scram_sha_256",
|
||||
@@ -494,6 +518,7 @@ mod tests {
|
||||
database: Some(hex::encode(rng.gen::<[u8; 16]>())),
|
||||
project: Some(hex::encode(rng.gen::<[u8; 16]>())),
|
||||
branch: Some(hex::encode(rng.gen::<[u8; 16]>())),
|
||||
pg_options: None,
|
||||
auth_method: None,
|
||||
protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
|
||||
region: "us-east-1",
|
||||
@@ -570,15 +595,15 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1315314, 3, 6000),
|
||||
(1315307, 3, 6000),
|
||||
(1315367, 3, 6000),
|
||||
(1315324, 3, 6000),
|
||||
(1315454, 3, 6000),
|
||||
(1315296, 3, 6000),
|
||||
(1315088, 3, 6000),
|
||||
(1315324, 3, 6000),
|
||||
(438713, 1, 2000)
|
||||
(1315874, 3, 6000),
|
||||
(1315867, 3, 6000),
|
||||
(1315927, 3, 6000),
|
||||
(1315884, 3, 6000),
|
||||
(1316014, 3, 6000),
|
||||
(1315856, 3, 6000),
|
||||
(1315648, 3, 6000),
|
||||
(1315884, 3, 6000),
|
||||
(438913, 1, 2000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -608,11 +633,11 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1222212, 5, 10000),
|
||||
(1228362, 5, 10000),
|
||||
(1230156, 5, 10000),
|
||||
(1229518, 5, 10000),
|
||||
(1220796, 5, 10000)
|
||||
(1223214, 5, 10000),
|
||||
(1229364, 5, 10000),
|
||||
(1231158, 5, 10000),
|
||||
(1230520, 5, 10000),
|
||||
(1221798, 5, 10000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -644,11 +669,11 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1207859, 5, 10000),
|
||||
(1207590, 5, 10000),
|
||||
(1207883, 5, 10000),
|
||||
(1207871, 5, 10000),
|
||||
(1208126, 5, 10000)
|
||||
(1208861, 5, 10000),
|
||||
(1208592, 5, 10000),
|
||||
(1208885, 5, 10000),
|
||||
(1208873, 5, 10000),
|
||||
(1209128, 5, 10000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -673,15 +698,15 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1315314, 3, 6000),
|
||||
(1315307, 3, 6000),
|
||||
(1315367, 3, 6000),
|
||||
(1315324, 3, 6000),
|
||||
(1315454, 3, 6000),
|
||||
(1315296, 3, 6000),
|
||||
(1315088, 3, 6000),
|
||||
(1315324, 3, 6000),
|
||||
(438713, 1, 2000)
|
||||
(1315874, 3, 6000),
|
||||
(1315867, 3, 6000),
|
||||
(1315927, 3, 6000),
|
||||
(1315884, 3, 6000),
|
||||
(1316014, 3, 6000),
|
||||
(1315856, 3, 6000),
|
||||
(1315648, 3, 6000),
|
||||
(1315884, 3, 6000),
|
||||
(438913, 1, 2000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -718,7 +743,7 @@ mod tests {
|
||||
// files are smaller than the size threshold, but they took too long to fill so were flushed early
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)]
|
||||
[(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)]
|
||||
);
|
||||
|
||||
tmpdir.close().unwrap();
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
use std::convert::Infallible;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use intern::{EndpointIdInt, EndpointIdTag, InternId};
|
||||
use tokio::task::JoinError;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::warn;
|
||||
@@ -129,20 +130,22 @@ macro_rules! smol_str_wrapper {
|
||||
|
||||
const POOLER_SUFFIX: &str = "-pooler";
|
||||
|
||||
pub trait Normalize {
|
||||
fn normalize(&self) -> Self;
|
||||
}
|
||||
|
||||
impl<S: Clone + AsRef<str> + From<String>> Normalize for S {
|
||||
impl EndpointId {
|
||||
fn normalize(&self) -> Self {
|
||||
if self.as_ref().ends_with(POOLER_SUFFIX) {
|
||||
let mut s = self.as_ref().to_string();
|
||||
s.truncate(s.len() - POOLER_SUFFIX.len());
|
||||
s.into()
|
||||
if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
|
||||
stripped.into()
|
||||
} else {
|
||||
self.clone()
|
||||
}
|
||||
}
|
||||
|
||||
fn normalize_intern(&self) -> EndpointIdInt {
|
||||
if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
|
||||
EndpointIdTag::get_interner().get_or_intern(stripped)
|
||||
} else {
|
||||
self.into()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 90% of role name strings are 20 characters or less.
|
||||
|
||||
@@ -267,6 +267,8 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
};
|
||||
drop(pause);
|
||||
|
||||
ctx.set_db_options(params.clone());
|
||||
|
||||
let hostname = mode.hostname(stream.get_ref());
|
||||
|
||||
let common_names = tls.map(|tls| &tls.common_names);
|
||||
|
||||
@@ -17,6 +17,7 @@ use hyper1::http::HeaderValue;
|
||||
use hyper1::Response;
|
||||
use hyper1::StatusCode;
|
||||
use hyper1::{HeaderMap, Request};
|
||||
use pq_proto::StartupMessageParamsBuilder;
|
||||
use serde_json::json;
|
||||
use serde_json::Value;
|
||||
use tokio::time;
|
||||
@@ -192,13 +193,13 @@ fn get_conn_info(
|
||||
|
||||
let mut options = Option::None;
|
||||
|
||||
let mut params = StartupMessageParamsBuilder::default();
|
||||
params.insert("user", &username);
|
||||
params.insert("database", &dbname);
|
||||
for (key, value) in pairs {
|
||||
match &*key {
|
||||
"options" => {
|
||||
options = Some(NeonOptions::parse_options_raw(&value));
|
||||
}
|
||||
"application_name" => ctx.set_application(Some(value.into())),
|
||||
_ => {}
|
||||
params.insert(&key, &value);
|
||||
if key == "options" {
|
||||
options = Some(NeonOptions::parse_options_raw(&value));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -29,13 +29,12 @@ use safekeeper::defaults::{
|
||||
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||
DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
|
||||
};
|
||||
use safekeeper::remove_wal;
|
||||
use safekeeper::http;
|
||||
use safekeeper::wal_service;
|
||||
use safekeeper::GlobalTimelines;
|
||||
use safekeeper::SafeKeeperConf;
|
||||
use safekeeper::{broker, WAL_SERVICE_RUNTIME};
|
||||
use safekeeper::{control_file, BROKER_RUNTIME};
|
||||
use safekeeper::{http, WAL_REMOVER_RUNTIME};
|
||||
use safekeeper::{wal_backup, HTTP_RUNTIME};
|
||||
use storage_broker::DEFAULT_ENDPOINT;
|
||||
use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
|
||||
@@ -441,14 +440,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
.map(|res| ("broker main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(broker_task_handle));
|
||||
|
||||
let conf_ = conf.clone();
|
||||
let wal_remover_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
|
||||
.spawn(remove_wal::task_main(conf_))
|
||||
.map(|res| ("WAL remover".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_remover_handle));
|
||||
|
||||
set_build_info_metric(GIT_VERSION, BUILD_TAG);
|
||||
|
||||
// TODO: update tokio-stream, convert to real async Stream with
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use camino::Utf8PathBuf;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use tokio::fs::File;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use utils::crashsafe::durable_rename;
|
||||
@@ -12,9 +12,9 @@ use std::ops::Deref;
|
||||
use std::path::Path;
|
||||
use std::time::Instant;
|
||||
|
||||
use crate::control_file_upgrade::upgrade_control_file;
|
||||
use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
|
||||
use crate::state::TimelinePersistentState;
|
||||
use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
|
||||
use utils::{bin_ser::LeSer, id::TenantTimelineId};
|
||||
|
||||
use crate::SafeKeeperConf;
|
||||
@@ -43,7 +43,7 @@ pub trait Storage: Deref<Target = TimelinePersistentState> {
|
||||
pub struct FileStorage {
|
||||
// save timeline dir to avoid reconstructing it every time
|
||||
timeline_dir: Utf8PathBuf,
|
||||
conf: SafeKeeperConf,
|
||||
no_sync: bool,
|
||||
|
||||
/// Last state persisted to disk.
|
||||
state: TimelinePersistentState,
|
||||
@@ -54,13 +54,12 @@ pub struct FileStorage {
|
||||
impl FileStorage {
|
||||
/// Initialize storage by loading state from disk.
|
||||
pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result<FileStorage> {
|
||||
let timeline_dir = conf.timeline_dir(ttid);
|
||||
|
||||
let state = Self::load_control_file_conf(conf, ttid)?;
|
||||
let timeline_dir = get_timeline_dir(conf, ttid);
|
||||
let state = Self::load_control_file_from_dir(&timeline_dir)?;
|
||||
|
||||
Ok(FileStorage {
|
||||
timeline_dir,
|
||||
conf: conf.clone(),
|
||||
no_sync: conf.no_sync,
|
||||
state,
|
||||
last_persist_at: Instant::now(),
|
||||
})
|
||||
@@ -74,7 +73,7 @@ impl FileStorage {
|
||||
) -> Result<FileStorage> {
|
||||
let store = FileStorage {
|
||||
timeline_dir,
|
||||
conf: conf.clone(),
|
||||
no_sync: conf.no_sync,
|
||||
state,
|
||||
last_persist_at: Instant::now(),
|
||||
};
|
||||
@@ -102,12 +101,9 @@ impl FileStorage {
|
||||
upgrade_control_file(buf, version)
|
||||
}
|
||||
|
||||
/// Load control file for given ttid at path specified by conf.
|
||||
pub fn load_control_file_conf(
|
||||
conf: &SafeKeeperConf,
|
||||
ttid: &TenantTimelineId,
|
||||
) -> Result<TimelinePersistentState> {
|
||||
let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME);
|
||||
/// Load control file from given directory.
|
||||
pub fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result<TimelinePersistentState> {
|
||||
let path = timeline_dir.join(CONTROL_FILE_NAME);
|
||||
Self::load_control_file(path)
|
||||
}
|
||||
|
||||
@@ -203,7 +199,7 @@ impl Storage for FileStorage {
|
||||
})?;
|
||||
|
||||
let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
|
||||
durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?;
|
||||
durable_rename(&control_partial_path, &control_path, !self.no_sync).await?;
|
||||
|
||||
// update internal state
|
||||
self.state = s.clone();
|
||||
@@ -233,12 +229,13 @@ mod test {
|
||||
conf: &SafeKeeperConf,
|
||||
ttid: &TenantTimelineId,
|
||||
) -> Result<(FileStorage, TimelinePersistentState)> {
|
||||
fs::create_dir_all(conf.timeline_dir(ttid))
|
||||
let timeline_dir = get_timeline_dir(conf, ttid);
|
||||
fs::create_dir_all(&timeline_dir)
|
||||
.await
|
||||
.expect("failed to create timeline dir");
|
||||
Ok((
|
||||
FileStorage::restore_new(ttid, conf)?,
|
||||
FileStorage::load_control_file_conf(conf, ttid)?,
|
||||
FileStorage::load_control_file_from_dir(&timeline_dir)?,
|
||||
))
|
||||
}
|
||||
|
||||
@@ -246,11 +243,11 @@ mod test {
|
||||
conf: &SafeKeeperConf,
|
||||
ttid: &TenantTimelineId,
|
||||
) -> Result<(FileStorage, TimelinePersistentState)> {
|
||||
fs::create_dir_all(conf.timeline_dir(ttid))
|
||||
let timeline_dir = get_timeline_dir(conf, ttid);
|
||||
fs::create_dir_all(&timeline_dir)
|
||||
.await
|
||||
.expect("failed to create timeline dir");
|
||||
let state = TimelinePersistentState::empty();
|
||||
let timeline_dir = conf.timeline_dir(ttid);
|
||||
let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?;
|
||||
Ok((storage, state))
|
||||
}
|
||||
@@ -291,7 +288,7 @@ mod test {
|
||||
.await
|
||||
.expect("failed to persist state");
|
||||
}
|
||||
let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME);
|
||||
let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME);
|
||||
let mut data = fs::read(&control_path).await.unwrap();
|
||||
data[0] += 1; // change the first byte of the file to fail checksum validation
|
||||
fs::write(&control_path, &data)
|
||||
|
||||
@@ -15,10 +15,10 @@ use crate::{
|
||||
control_file::{FileStorage, Storage},
|
||||
pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline},
|
||||
state::TimelinePersistentState,
|
||||
timeline::{Timeline, TimelineError},
|
||||
timeline::{FullAccessTimeline, Timeline, TimelineError},
|
||||
wal_backup::copy_s3_segments,
|
||||
wal_storage::{wal_file_paths, WalReader},
|
||||
GlobalTimelines, SafeKeeperConf,
|
||||
GlobalTimelines,
|
||||
};
|
||||
|
||||
// we don't want to have more than 10 segments on disk after copy, because they take space
|
||||
@@ -46,12 +46,14 @@ pub async fn handle_request(request: Request) -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
let source_tli = request.source.full_access_guard().await?;
|
||||
|
||||
let conf = &GlobalTimelines::get_global_config();
|
||||
let ttid = request.destination_ttid;
|
||||
|
||||
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
|
||||
|
||||
let (mem_state, state) = request.source.get_state().await;
|
||||
let (mem_state, state) = source_tli.get_state().await;
|
||||
let start_lsn = state.timeline_start_lsn;
|
||||
if start_lsn == Lsn::INVALID {
|
||||
bail!("timeline is not initialized");
|
||||
@@ -60,7 +62,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
|
||||
|
||||
{
|
||||
let commit_lsn = mem_state.commit_lsn;
|
||||
let flush_lsn = request.source.get_flush_lsn().await;
|
||||
let flush_lsn = source_tli.get_flush_lsn().await;
|
||||
|
||||
info!(
|
||||
"collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}",
|
||||
@@ -127,10 +129,8 @@ pub async fn handle_request(request: Request) -> Result<()> {
|
||||
.await?;
|
||||
|
||||
copy_disk_segments(
|
||||
conf,
|
||||
&state,
|
||||
&source_tli,
|
||||
wal_seg_size,
|
||||
&request.source.ttid,
|
||||
new_backup_lsn,
|
||||
request.until_lsn,
|
||||
&tli_dir_path,
|
||||
@@ -159,21 +159,13 @@ pub async fn handle_request(request: Request) -> Result<()> {
|
||||
}
|
||||
|
||||
async fn copy_disk_segments(
|
||||
conf: &SafeKeeperConf,
|
||||
persisted_state: &TimelinePersistentState,
|
||||
tli: &FullAccessTimeline,
|
||||
wal_seg_size: usize,
|
||||
source_ttid: &TenantTimelineId,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
tli_dir_path: &Utf8PathBuf,
|
||||
) -> Result<()> {
|
||||
let mut wal_reader = WalReader::new(
|
||||
conf.workdir.clone(),
|
||||
conf.timeline_dir(source_ttid),
|
||||
persisted_state,
|
||||
start_lsn,
|
||||
true,
|
||||
)?;
|
||||
let mut wal_reader = tli.get_walreader(start_lsn).await?;
|
||||
|
||||
let mut buf = [0u8; MAX_SEND_SIZE];
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ use std::sync::Arc;
|
||||
use anyhow::bail;
|
||||
use anyhow::Result;
|
||||
use camino::Utf8Path;
|
||||
use camino::Utf8PathBuf;
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres_ffi::XLogSegNo;
|
||||
use postgres_ffi::MAX_SEND_SIZE;
|
||||
@@ -26,7 +27,8 @@ use crate::safekeeper::TermHistory;
|
||||
use crate::send_wal::WalSenderState;
|
||||
use crate::state::TimelineMemState;
|
||||
use crate::state::TimelinePersistentState;
|
||||
use crate::wal_storage::WalReader;
|
||||
use crate::timeline::get_timeline_dir;
|
||||
use crate::timeline::FullAccessTimeline;
|
||||
use crate::GlobalTimelines;
|
||||
use crate::SafeKeeperConf;
|
||||
|
||||
@@ -68,6 +70,7 @@ pub struct Response {
|
||||
pub struct TimelineDumpSer {
|
||||
pub tli: Arc<crate::timeline::Timeline>,
|
||||
pub args: Args,
|
||||
pub timeline_dir: Utf8PathBuf,
|
||||
pub runtime: Arc<tokio::runtime::Runtime>,
|
||||
}
|
||||
|
||||
@@ -85,14 +88,20 @@ impl Serialize for TimelineDumpSer {
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
let dump = self
|
||||
.runtime
|
||||
.block_on(build_from_tli_dump(self.tli.clone(), self.args.clone()));
|
||||
let dump = self.runtime.block_on(build_from_tli_dump(
|
||||
&self.tli,
|
||||
&self.args,
|
||||
&self.timeline_dir,
|
||||
));
|
||||
dump.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
async fn build_from_tli_dump(timeline: Arc<crate::timeline::Timeline>, args: Args) -> Timeline {
|
||||
async fn build_from_tli_dump(
|
||||
timeline: &Arc<crate::timeline::Timeline>,
|
||||
args: &Args,
|
||||
timeline_dir: &Utf8Path,
|
||||
) -> Timeline {
|
||||
let control_file = if args.dump_control_file {
|
||||
let mut state = timeline.get_state().await.1;
|
||||
if !args.dump_term_history {
|
||||
@@ -112,7 +121,8 @@ async fn build_from_tli_dump(timeline: Arc<crate::timeline::Timeline>, args: Arg
|
||||
let disk_content = if args.dump_disk_content {
|
||||
// build_disk_content can fail, but we don't want to fail the whole
|
||||
// request because of that.
|
||||
build_disk_content(&timeline.timeline_dir).ok()
|
||||
// Note: timeline can be in offloaded state, this is not a problem.
|
||||
build_disk_content(timeline_dir).ok()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -186,6 +196,7 @@ pub struct FileInfo {
|
||||
pub async fn build(args: Args) -> Result<Response> {
|
||||
let start_time = Utc::now();
|
||||
let timelines_count = GlobalTimelines::timelines_count();
|
||||
let config = GlobalTimelines::get_global_config();
|
||||
|
||||
let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() {
|
||||
// If both tenant_id and timeline_id are specified, we can just get the
|
||||
@@ -223,12 +234,11 @@ pub async fn build(args: Args) -> Result<Response> {
|
||||
timelines.push(TimelineDumpSer {
|
||||
tli,
|
||||
args: args.clone(),
|
||||
timeline_dir: get_timeline_dir(&config, &ttid),
|
||||
runtime: runtime.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
let config = GlobalTimelines::get_global_config();
|
||||
|
||||
Ok(Response {
|
||||
start_time,
|
||||
finish_time: Utc::now(),
|
||||
@@ -316,27 +326,19 @@ pub struct TimelineDigest {
|
||||
}
|
||||
|
||||
pub async fn calculate_digest(
|
||||
tli: &Arc<crate::timeline::Timeline>,
|
||||
tli: &FullAccessTimeline,
|
||||
request: TimelineDigestRequest,
|
||||
) -> Result<TimelineDigest> {
|
||||
if request.from_lsn > request.until_lsn {
|
||||
bail!("from_lsn is greater than until_lsn");
|
||||
}
|
||||
|
||||
let conf = GlobalTimelines::get_global_config();
|
||||
let (_, persisted_state) = tli.get_state().await;
|
||||
|
||||
if persisted_state.timeline_start_lsn > request.from_lsn {
|
||||
bail!("requested LSN is before the start of the timeline");
|
||||
}
|
||||
|
||||
let mut wal_reader = WalReader::new(
|
||||
conf.workdir.clone(),
|
||||
tli.timeline_dir.clone(),
|
||||
&persisted_state,
|
||||
request.from_lsn,
|
||||
true,
|
||||
)?;
|
||||
let mut wal_reader = tli.get_walreader(request.from_lsn).await?;
|
||||
|
||||
let mut hasher = Sha256::new();
|
||||
let mut buf = [0u8; MAX_SEND_SIZE];
|
||||
|
||||
@@ -85,11 +85,11 @@ impl From<TermSwitchApiEntry> for TermLsn {
|
||||
}
|
||||
}
|
||||
|
||||
/// Augment AcceptorState with epoch for convenience
|
||||
/// Augment AcceptorState with last_log_term for convenience
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct AcceptorStateStatus {
|
||||
pub term: Term,
|
||||
pub epoch: Term,
|
||||
pub epoch: Term, // aka last_log_term
|
||||
pub term_history: Vec<TermSwitchApiEntry>,
|
||||
}
|
||||
|
||||
@@ -130,7 +130,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let (inmem, state) = tli.get_state().await;
|
||||
let flush_lsn = tli.get_flush_lsn().await;
|
||||
|
||||
let epoch = state.acceptor_state.get_epoch(flush_lsn);
|
||||
let last_log_term = state.acceptor_state.get_last_log_term(flush_lsn);
|
||||
let term_history = state
|
||||
.acceptor_state
|
||||
.term_history
|
||||
@@ -143,7 +143,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
.collect();
|
||||
let acc_state = AcceptorStateStatus {
|
||||
term: state.acceptor_state.term,
|
||||
epoch,
|
||||
epoch: last_log_term,
|
||||
term_history,
|
||||
};
|
||||
|
||||
@@ -249,6 +249,10 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
|
||||
};
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let tli = tli
|
||||
.full_access_guard()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let response = debug_dump::calculate_digest(&tli, request)
|
||||
.await
|
||||
@@ -268,8 +272,12 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
|
||||
let filename: String = parse_request_param(&request, "filename")?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let tli = tli
|
||||
.full_access_guard()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let filepath = tli.timeline_dir.join(filename);
|
||||
let filepath = tli.get_timeline_dir().join(filename);
|
||||
let mut file = File::open(&filepath)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
@@ -287,7 +295,7 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))
|
||||
}
|
||||
|
||||
/// Force persist control file and remove old WAL.
|
||||
/// Force persist control file.
|
||||
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
@@ -297,13 +305,13 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
);
|
||||
|
||||
let tli = GlobalTimelines::get(ttid)?;
|
||||
tli.maybe_persist_control_file(true)
|
||||
tli.write_shared_state()
|
||||
.await
|
||||
.sk
|
||||
.state
|
||||
.flush()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
tli.remove_old_wal()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
|
||||
@@ -6,8 +6,6 @@
|
||||
//! modifications in tests.
|
||||
//!
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use postgres_backend::QueryError;
|
||||
@@ -23,7 +21,7 @@ use crate::safekeeper::{
|
||||
};
|
||||
use crate::safekeeper::{Term, TermHistory, TermLsn};
|
||||
use crate::state::TimelinePersistentState;
|
||||
use crate::timeline::Timeline;
|
||||
use crate::timeline::FullAccessTimeline;
|
||||
use crate::GlobalTimelines;
|
||||
use postgres_backend::PostgresBackend;
|
||||
use postgres_ffi::encode_logical_message;
|
||||
@@ -104,8 +102,8 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
async fn prepare_safekeeper(
|
||||
ttid: TenantTimelineId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
GlobalTimelines::create(
|
||||
) -> anyhow::Result<FullAccessTimeline> {
|
||||
let tli = GlobalTimelines::create(
|
||||
ttid,
|
||||
ServerInfo {
|
||||
pg_version,
|
||||
@@ -115,10 +113,16 @@ async fn prepare_safekeeper(
|
||||
Lsn::INVALID,
|
||||
Lsn::INVALID,
|
||||
)
|
||||
.await
|
||||
.await?;
|
||||
|
||||
tli.full_access_guard().await
|
||||
}
|
||||
|
||||
async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
|
||||
async fn send_proposer_elected(
|
||||
tli: &FullAccessTimeline,
|
||||
term: Term,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
// add new term to existing history
|
||||
let history = tli.get_state().await.1.acceptor_state.term_history;
|
||||
let history = history.up_to(lsn.checked_sub(1u64).unwrap());
|
||||
@@ -147,7 +151,7 @@ pub struct InsertedWAL {
|
||||
/// Extend local WAL with new LogicalMessage record. To do that,
|
||||
/// create AppendRequest with new WAL and pass it to safekeeper.
|
||||
pub async fn append_logical_message(
|
||||
tli: &Arc<Timeline>,
|
||||
tli: &FullAccessTimeline,
|
||||
msg: &AppendLogicalMessage,
|
||||
) -> anyhow::Result<InsertedWAL> {
|
||||
let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
|
||||
@@ -165,7 +169,7 @@ pub async fn append_logical_message(
|
||||
let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest {
|
||||
h: AppendRequestHeader {
|
||||
term: msg.term,
|
||||
epoch_start_lsn: begin_lsn,
|
||||
term_start_lsn: begin_lsn,
|
||||
begin_lsn,
|
||||
end_lsn,
|
||||
commit_lsn,
|
||||
|
||||
@@ -7,10 +7,7 @@ use tokio::runtime::Runtime;
|
||||
use std::time::Duration;
|
||||
use storage_broker::Uri;
|
||||
|
||||
use utils::{
|
||||
auth::SwappableJwtAuth,
|
||||
id::{NodeId, TenantId, TenantTimelineId},
|
||||
};
|
||||
use utils::{auth::SwappableJwtAuth, id::NodeId};
|
||||
|
||||
mod auth;
|
||||
pub mod broker;
|
||||
@@ -89,15 +86,6 @@ pub struct SafeKeeperConf {
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
pub fn tenant_dir(&self, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||
self.workdir.join(tenant_id.to_string())
|
||||
}
|
||||
|
||||
pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> Utf8PathBuf {
|
||||
self.tenant_dir(&ttid.tenant_id)
|
||||
.join(ttid.timeline_id.to_string())
|
||||
}
|
||||
|
||||
pub fn is_wal_backup_enabled(&self) -> bool {
|
||||
self.remote_storage.is_some() && self.wal_backup_enabled
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ use utils::{
|
||||
use crate::{
|
||||
control_file, debug_dump,
|
||||
http::routes::TimelineStatus,
|
||||
timeline::{Timeline, TimelineError},
|
||||
timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError},
|
||||
wal_storage::{self, Storage},
|
||||
GlobalTimelines, SafeKeeperConf,
|
||||
};
|
||||
@@ -283,13 +283,13 @@ pub async fn load_temp_timeline(
|
||||
}
|
||||
|
||||
// Move timeline dir to the correct location
|
||||
let timeline_path = conf.timeline_dir(&ttid);
|
||||
let timeline_path = get_timeline_dir(conf, &ttid);
|
||||
|
||||
info!(
|
||||
"moving timeline {} from {} to {}",
|
||||
ttid, tmp_path, timeline_path
|
||||
);
|
||||
tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
|
||||
tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?;
|
||||
tokio::fs::rename(tmp_path, &timeline_path).await?;
|
||||
|
||||
let tli = GlobalTimelines::load_timeline(&guard, ttid)
|
||||
|
||||
@@ -6,7 +6,7 @@ use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::safekeeper::AcceptorProposerMessage;
|
||||
use crate::safekeeper::ProposerAcceptorMessage;
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::timeline::Timeline;
|
||||
use crate::timeline::FullAccessTimeline;
|
||||
use crate::wal_service::ConnectionId;
|
||||
use crate::GlobalTimelines;
|
||||
use anyhow::{anyhow, Context};
|
||||
@@ -213,7 +213,7 @@ impl SafekeeperPostgresHandler {
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
) -> Result<(), QueryError> {
|
||||
let mut tli: Option<Arc<Timeline>> = None;
|
||||
let mut tli: Option<FullAccessTimeline> = None;
|
||||
if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
|
||||
// Log the result and probably send it to the client, closing the stream.
|
||||
let handle_end_fut = pgb.handle_copy_stream_end(end);
|
||||
@@ -233,7 +233,7 @@ impl SafekeeperPostgresHandler {
|
||||
pub async fn handle_start_wal_push_guts<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tli: &mut Option<Arc<Timeline>>,
|
||||
tli: &mut Option<FullAccessTimeline>,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
// Notify the libpq client that it's allowed to send `CopyData` messages
|
||||
pgb.write_message(&BeMessage::CopyBothResponse).await?;
|
||||
@@ -323,7 +323,7 @@ struct NetworkReader<'a, IO> {
|
||||
impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
async fn read_first_message(
|
||||
&mut self,
|
||||
) -> Result<(Arc<Timeline>, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
|
||||
) -> Result<(FullAccessTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
|
||||
// Receive information about server to create timeline, if not yet.
|
||||
let next_msg = read_message(self.pgb_reader).await?;
|
||||
let tli = match next_msg {
|
||||
@@ -337,7 +337,10 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
system_id: greeting.system_id,
|
||||
wal_seg_size: greeting.wal_seg_size,
|
||||
};
|
||||
GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await?
|
||||
let tli =
|
||||
GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
|
||||
.await?;
|
||||
tli.full_access_guard().await?
|
||||
}
|
||||
_ => {
|
||||
return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!(
|
||||
@@ -353,7 +356,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
tli: Arc<Timeline>,
|
||||
tli: FullAccessTimeline,
|
||||
next_msg: ProposerAcceptorMessage,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
*self.acceptor_handle = Some(WalAcceptor::spawn(
|
||||
@@ -448,7 +451,7 @@ const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
|
||||
/// replies to reply_tx; reading from socket and writing to disk in parallel is
|
||||
/// beneficial for performance, this struct provides writing to disk part.
|
||||
pub struct WalAcceptor {
|
||||
tli: Arc<Timeline>,
|
||||
tli: FullAccessTimeline,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
conn_id: Option<ConnectionId>,
|
||||
@@ -461,7 +464,7 @@ impl WalAcceptor {
|
||||
///
|
||||
/// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper.
|
||||
pub fn spawn(
|
||||
tli: Arc<Timeline>,
|
||||
tli: FullAccessTimeline,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
conn_id: Option<ConnectionId>,
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! provide it, i.e. safekeeper lags too much.
|
||||
|
||||
use std::time::SystemTime;
|
||||
use std::{fmt, pin::pin, sync::Arc};
|
||||
use std::{fmt, pin::pin};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use futures::StreamExt;
|
||||
@@ -21,6 +21,7 @@ use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}
|
||||
|
||||
use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
|
||||
use crate::safekeeper::{AppendRequest, AppendRequestHeader};
|
||||
use crate::timeline::FullAccessTimeline;
|
||||
use crate::{
|
||||
http::routes::TimelineStatus,
|
||||
receive_wal::MSG_QUEUE_SIZE,
|
||||
@@ -28,14 +29,14 @@ use crate::{
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
|
||||
TermLsn, VoteRequest,
|
||||
},
|
||||
timeline::{PeerInfo, Timeline},
|
||||
timeline::PeerInfo,
|
||||
SafeKeeperConf,
|
||||
};
|
||||
|
||||
/// Entrypoint for per timeline task which always runs, checking whether
|
||||
/// recovery for this safekeeper is needed and starting it if so.
|
||||
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) {
|
||||
info!("started");
|
||||
|
||||
let cancel = tli.cancel.clone();
|
||||
@@ -47,6 +48,87 @@ pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Should we start fetching WAL from a peer safekeeper, and if yes, from
|
||||
/// which? Answer is yes, i.e. .donors is not empty if 1) there is something
|
||||
/// to fetch, and we can do that without running elections; 2) there is no
|
||||
/// actively streaming compute, as we don't want to compete with it.
|
||||
///
|
||||
/// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal
|
||||
/// to its last_log_term so we are sure such a leader ever had been elected.
|
||||
///
|
||||
/// All possible donors are returned so that we could keep connection to the
|
||||
/// current one if it is good even if it slightly lags behind.
|
||||
///
|
||||
/// Note that term conditions above might be not met, but safekeepers are
|
||||
/// still not aligned on last flush_lsn. Generally in this case until
|
||||
/// elections are run it is not possible to say which safekeeper should
|
||||
/// recover from which one -- history which would be committed is different
|
||||
/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
|
||||
/// Thus we don't try to predict it here.
|
||||
async fn recovery_needed(
|
||||
tli: &FullAccessTimeline,
|
||||
heartbeat_timeout: Duration,
|
||||
) -> RecoveryNeededInfo {
|
||||
let ss = tli.read_shared_state().await;
|
||||
let term = ss.sk.state.acceptor_state.term;
|
||||
let last_log_term = ss.sk.get_last_log_term();
|
||||
let flush_lsn = ss.sk.flush_lsn();
|
||||
// note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
|
||||
let mut peers = ss.get_peers(heartbeat_timeout);
|
||||
// Sort by <last log term, lsn> pairs.
|
||||
peers.sort_by(|p1, p2| {
|
||||
let tl1 = TermLsn {
|
||||
term: p1.last_log_term,
|
||||
lsn: p1.flush_lsn,
|
||||
};
|
||||
let tl2 = TermLsn {
|
||||
term: p2.last_log_term,
|
||||
lsn: p2.flush_lsn,
|
||||
};
|
||||
tl2.cmp(&tl1) // desc
|
||||
});
|
||||
let num_streaming_computes = tli.get_walreceivers().get_num_streaming();
|
||||
let donors = if num_streaming_computes > 0 {
|
||||
vec![] // If there is a streaming compute, don't try to recover to not intervene.
|
||||
} else {
|
||||
peers
|
||||
.iter()
|
||||
.filter_map(|candidate| {
|
||||
// Are we interested in this candidate?
|
||||
let candidate_tl = TermLsn {
|
||||
term: candidate.last_log_term,
|
||||
lsn: candidate.flush_lsn,
|
||||
};
|
||||
let my_tl = TermLsn {
|
||||
term: last_log_term,
|
||||
lsn: flush_lsn,
|
||||
};
|
||||
if my_tl < candidate_tl {
|
||||
// Yes, we are interested. Can we pull from it without
|
||||
// (re)running elections? It is possible if 1) his term
|
||||
// is equal to his last_log_term so we could act on
|
||||
// behalf of leader of this term (we must be sure he was
|
||||
// ever elected) and 2) our term is not higher, or we'll refuse data.
|
||||
if candidate.term == candidate.last_log_term && candidate.term >= term {
|
||||
Some(Donor::from(candidate))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
RecoveryNeededInfo {
|
||||
term,
|
||||
last_log_term,
|
||||
flush_lsn,
|
||||
peers,
|
||||
num_streaming_computes,
|
||||
donors,
|
||||
}
|
||||
}
|
||||
/// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and
|
||||
/// fields to explain the choice.
|
||||
#[derive(Debug)]
|
||||
@@ -113,10 +195,10 @@ impl From<&PeerInfo> for Donor {
|
||||
const CHECK_INTERVAL_MS: u64 = 2000;
|
||||
|
||||
/// Check regularly whether we need to start recovery.
|
||||
async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) {
|
||||
let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
|
||||
loop {
|
||||
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
|
||||
let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
|
||||
match recovery_needed_info.donors.first() {
|
||||
Some(donor) => {
|
||||
info!(
|
||||
@@ -146,7 +228,7 @@ async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
/// Recover from the specified donor. Returns message explaining normal finish
|
||||
/// reason or error.
|
||||
async fn recover(
|
||||
tli: Arc<Timeline>,
|
||||
tli: FullAccessTimeline,
|
||||
donor: &Donor,
|
||||
conf: &SafeKeeperConf,
|
||||
) -> anyhow::Result<String> {
|
||||
@@ -232,7 +314,7 @@ async fn recover(
|
||||
|
||||
// Pull WAL from donor, assuming handshake is already done.
|
||||
async fn recovery_stream(
|
||||
tli: Arc<Timeline>,
|
||||
tli: FullAccessTimeline,
|
||||
donor: &Donor,
|
||||
start_streaming_at: Lsn,
|
||||
conf: &SafeKeeperConf,
|
||||
@@ -316,7 +398,7 @@ async fn network_io(
|
||||
physical_stream: ReplicationStream,
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
donor: Donor,
|
||||
tli: Arc<Timeline>,
|
||||
tli: FullAccessTimeline,
|
||||
conf: SafeKeeperConf,
|
||||
) -> anyhow::Result<Option<String>> {
|
||||
let mut physical_stream = pin!(physical_stream);
|
||||
@@ -337,7 +419,7 @@ async fn network_io(
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
let ar_hdr = AppendRequestHeader {
|
||||
term: donor.term,
|
||||
epoch_start_lsn: Lsn::INVALID, // unused
|
||||
term_start_lsn: Lsn::INVALID, // unused
|
||||
begin_lsn: Lsn(xlog_data.wal_start()),
|
||||
end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
|
||||
commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
|
||||
@@ -365,7 +447,7 @@ async fn network_io(
|
||||
}
|
||||
ReplicationMessage::PrimaryKeepAlive(_) => {
|
||||
// keepalive means nothing is being streamed for a while. Check whether we need to stop.
|
||||
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
|
||||
let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
|
||||
// do current donors still contain one we currently connected to?
|
||||
if !recovery_needed_info
|
||||
.donors
|
||||
|
||||
@@ -1,41 +1,25 @@
|
||||
//! Thread removing old WAL.
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use std::time::Duration;
|
||||
use crate::timeline_manager::StateSnapshot;
|
||||
|
||||
use tokio::time::sleep;
|
||||
use tracing::*;
|
||||
/// Get oldest LSN we still need to keep. We hold WAL till it is consumed
|
||||
/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
|
||||
/// offloading.
|
||||
/// While it is safe to use inmem values for determining horizon,
|
||||
/// we use persistent to make possible normal states less surprising.
|
||||
/// All segments covering LSNs before horizon_lsn can be removed.
|
||||
pub fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option<Lsn>) -> Lsn {
|
||||
use std::cmp::min;
|
||||
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
let wal_removal_interval = Duration::from_millis(5000);
|
||||
loop {
|
||||
let now = tokio::time::Instant::now();
|
||||
let tlis = GlobalTimelines::get_all();
|
||||
for tli in &tlis {
|
||||
let ttid = tli.ttid;
|
||||
async {
|
||||
if let Err(e) = tli.maybe_persist_control_file(false).await {
|
||||
warn!("failed to persist control file: {e}");
|
||||
}
|
||||
if let Err(e) = tli.remove_old_wal().await {
|
||||
error!("failed to remove WAL: {}", e);
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("WAL removal", ttid = %ttid))
|
||||
.await;
|
||||
}
|
||||
|
||||
let elapsed = now.elapsed();
|
||||
let total_timelines = tlis.len();
|
||||
|
||||
if elapsed > wal_removal_interval {
|
||||
info!(
|
||||
"WAL removal is too long, processed {} timelines in {:?}",
|
||||
total_timelines, elapsed
|
||||
);
|
||||
}
|
||||
|
||||
sleep(wal_removal_interval).await;
|
||||
let mut horizon_lsn = min(
|
||||
state.cfile_remote_consistent_lsn,
|
||||
state.cfile_peer_horizon_lsn,
|
||||
);
|
||||
// we don't want to remove WAL that is not yet offloaded to s3
|
||||
horizon_lsn = min(horizon_lsn, state.cfile_backup_lsn);
|
||||
if let Some(extra_horizon_lsn) = extra_horizon_lsn {
|
||||
horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
|
||||
}
|
||||
|
||||
horizon_lsn
|
||||
}
|
||||
|
||||
@@ -10,7 +10,6 @@ use std::cmp::max;
|
||||
use std::cmp::min;
|
||||
use std::fmt;
|
||||
use std::io::Read;
|
||||
use std::time::Duration;
|
||||
use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
|
||||
use tracing::*;
|
||||
@@ -188,8 +187,8 @@ pub struct AcceptorState {
|
||||
}
|
||||
|
||||
impl AcceptorState {
|
||||
/// acceptor's epoch is the term of the highest entry in the log
|
||||
pub fn get_epoch(&self, flush_lsn: Lsn) -> Term {
|
||||
/// acceptor's last_log_term is the term of the highest entry in the log
|
||||
pub fn get_last_log_term(&self, flush_lsn: Lsn) -> Term {
|
||||
let th = self.term_history.up_to(flush_lsn);
|
||||
match th.0.last() {
|
||||
Some(e) => e.term,
|
||||
@@ -305,9 +304,9 @@ pub struct AppendRequest {
|
||||
pub struct AppendRequestHeader {
|
||||
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
|
||||
pub term: Term,
|
||||
// TODO: remove this field, it in unused -- LSN of term switch can be taken
|
||||
// from ProposerElected (as well as from term history).
|
||||
pub epoch_start_lsn: Lsn,
|
||||
// TODO: remove this field from the protocol, it in unused -- LSN of term
|
||||
// switch can be taken from ProposerElected (as well as from term history).
|
||||
pub term_start_lsn: Lsn,
|
||||
/// start position of message in WAL
|
||||
pub begin_lsn: Lsn,
|
||||
/// end position of message in WAL
|
||||
@@ -326,9 +325,10 @@ pub struct AppendResponse {
|
||||
// Current term of the safekeeper; if it is higher than proposer's, the
|
||||
// compute is out of date.
|
||||
pub term: Term,
|
||||
// NOTE: this is physical end of wal on safekeeper; currently it doesn't
|
||||
// make much sense without taking epoch into account, as history can be
|
||||
// diverged.
|
||||
// Flushed end of wal on safekeeper; one should be always mindful from what
|
||||
// term history this value comes, either checking history directly or
|
||||
// observing term being set to one for which WAL truncation is known to have
|
||||
// happened.
|
||||
pub flush_lsn: Lsn,
|
||||
// We report back our awareness about which WAL is committed, as this is
|
||||
// a criterion for walproposer --sync mode exit
|
||||
@@ -482,8 +482,8 @@ impl AcceptorProposerMessage {
|
||||
/// - messages from broker peers
|
||||
pub struct SafeKeeper<CTRL: control_file::Storage, WAL: wal_storage::Storage> {
|
||||
/// LSN since the proposer safekeeper currently talking to appends WAL;
|
||||
/// determines epoch switch point.
|
||||
pub epoch_start_lsn: Lsn,
|
||||
/// determines last_log_term switch point.
|
||||
pub term_start_lsn: Lsn,
|
||||
|
||||
pub state: TimelineState<CTRL>, // persistent state storage
|
||||
pub wal_store: WAL,
|
||||
@@ -511,7 +511,7 @@ where
|
||||
}
|
||||
|
||||
Ok(SafeKeeper {
|
||||
epoch_start_lsn: Lsn(0),
|
||||
term_start_lsn: Lsn(0),
|
||||
state: TimelineState::new(state),
|
||||
wal_store,
|
||||
node_id,
|
||||
@@ -531,8 +531,10 @@ where
|
||||
self.state.acceptor_state.term
|
||||
}
|
||||
|
||||
pub fn get_epoch(&self) -> Term {
|
||||
self.state.acceptor_state.get_epoch(self.flush_lsn())
|
||||
pub fn get_last_log_term(&self) -> Term {
|
||||
self.state
|
||||
.acceptor_state
|
||||
.get_last_log_term(self.flush_lsn())
|
||||
}
|
||||
|
||||
/// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet.
|
||||
@@ -713,7 +715,7 @@ where
|
||||
// proceed, but to prevent commit_lsn surprisingly going down we should
|
||||
// either refuse the session (simpler) or skip the part we already have
|
||||
// from the stream (can be implemented).
|
||||
if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at {
|
||||
if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at {
|
||||
bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
|
||||
msg.term, self.flush_lsn(), msg.start_streaming_at)
|
||||
}
|
||||
@@ -788,7 +790,7 @@ where
|
||||
// Cache LSN where term starts to immediately fsync control file with
|
||||
// commit_lsn once we reach it -- sync-safekeepers finishes when
|
||||
// persisted commit_lsn on majority of safekeepers aligns.
|
||||
self.epoch_start_lsn = match msg.term_history.0.last() {
|
||||
self.term_start_lsn = match msg.term_history.0.last() {
|
||||
None => bail!("proposer elected with empty term history"),
|
||||
Some(term_lsn_start) => term_lsn_start.lsn,
|
||||
};
|
||||
@@ -814,35 +816,17 @@ where
|
||||
|
||||
self.state.inmem.commit_lsn = commit_lsn;
|
||||
|
||||
// If new commit_lsn reached epoch switch, force sync of control
|
||||
// If new commit_lsn reached term switch, force sync of control
|
||||
// file: walproposer in sync mode is very interested when this
|
||||
// happens. Note: this is for sync-safekeepers mode only, as
|
||||
// otherwise commit_lsn might jump over epoch_start_lsn.
|
||||
if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn {
|
||||
// otherwise commit_lsn might jump over term_start_lsn.
|
||||
if commit_lsn >= self.term_start_lsn && self.state.commit_lsn < self.term_start_lsn {
|
||||
self.state.flush().await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Persist control file if there is something to save and enough time
|
||||
/// passed after the last save.
|
||||
pub async fn maybe_persist_inmem_control_file(&mut self, force: bool) -> Result<bool> {
|
||||
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
|
||||
if !force && self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
|
||||
return Ok(false);
|
||||
}
|
||||
let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn
|
||||
|| self.state.inmem.backup_lsn > self.state.backup_lsn
|
||||
|| self.state.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn
|
||||
|| self.state.inmem.remote_consistent_lsn > self.state.remote_consistent_lsn;
|
||||
if need_persist {
|
||||
self.state.flush().await?;
|
||||
trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
|
||||
}
|
||||
Ok(need_persist)
|
||||
}
|
||||
|
||||
/// Handle request to append WAL.
|
||||
#[allow(clippy::comparison_chain)]
|
||||
async fn handle_append_request(
|
||||
@@ -933,7 +917,7 @@ where
|
||||
// Note: the check is too restrictive, generally we can update local
|
||||
// commit_lsn if our history matches (is part of) history of advanced
|
||||
// commit_lsn provider.
|
||||
if sk_info.last_log_term == self.get_epoch() {
|
||||
if sk_info.last_log_term == self.get_last_log_term() {
|
||||
self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?;
|
||||
}
|
||||
}
|
||||
@@ -1079,7 +1063,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_epoch_switch() {
|
||||
async fn test_last_log_term_switch() {
|
||||
let storage = InMemoryState {
|
||||
persisted_state: test_sk_state(),
|
||||
};
|
||||
@@ -1089,7 +1073,7 @@ mod tests {
|
||||
|
||||
let mut ar_hdr = AppendRequestHeader {
|
||||
term: 1,
|
||||
epoch_start_lsn: Lsn(3),
|
||||
term_start_lsn: Lsn(3),
|
||||
begin_lsn: Lsn(1),
|
||||
end_lsn: Lsn(2),
|
||||
commit_lsn: Lsn(0),
|
||||
@@ -1114,14 +1098,14 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// check that AppendRequest before epochStartLsn doesn't switch epoch
|
||||
// check that AppendRequest before term_start_lsn doesn't switch last_log_term.
|
||||
let resp = sk
|
||||
.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
|
||||
.await;
|
||||
assert!(resp.is_ok());
|
||||
assert_eq!(sk.get_epoch(), 0);
|
||||
assert_eq!(sk.get_last_log_term(), 0);
|
||||
|
||||
// but record at epochStartLsn does the switch
|
||||
// but record at term_start_lsn does the switch
|
||||
ar_hdr.begin_lsn = Lsn(2);
|
||||
ar_hdr.end_lsn = Lsn(3);
|
||||
append_request = AppendRequest {
|
||||
@@ -1133,7 +1117,7 @@ mod tests {
|
||||
.await;
|
||||
assert!(resp.is_ok());
|
||||
sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
|
||||
assert_eq!(sk.get_epoch(), 1);
|
||||
assert_eq!(sk.get_last_log_term(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::metrics::RECEIVED_PS_FEEDBACKS;
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::safekeeper::{Term, TermLsn};
|
||||
use crate::timeline::Timeline;
|
||||
use crate::timeline::FullAccessTimeline;
|
||||
use crate::wal_service::ConnectionId;
|
||||
use crate::wal_storage::WalReader;
|
||||
use crate::GlobalTimelines;
|
||||
@@ -387,8 +387,10 @@ impl SafekeeperPostgresHandler {
|
||||
term: Option<Term>,
|
||||
) -> Result<(), QueryError> {
|
||||
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
|
||||
let full_access = tli.full_access_guard().await?;
|
||||
|
||||
if let Err(end) = self
|
||||
.handle_start_replication_guts(pgb, start_pos, term, tli.clone())
|
||||
.handle_start_replication_guts(pgb, start_pos, term, full_access)
|
||||
.await
|
||||
{
|
||||
let info = tli.get_safekeeper_info(&self.conf).await;
|
||||
@@ -405,7 +407,7 @@ impl SafekeeperPostgresHandler {
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
start_pos: Lsn,
|
||||
term: Option<Term>,
|
||||
tli: Arc<Timeline>,
|
||||
tli: FullAccessTimeline,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let appname = self.appname.clone();
|
||||
|
||||
@@ -448,14 +450,7 @@ impl SafekeeperPostgresHandler {
|
||||
// switch to copy
|
||||
pgb.write_message(&BeMessage::CopyBothResponse).await?;
|
||||
|
||||
let (_, persisted_state) = tli.get_state().await;
|
||||
let wal_reader = WalReader::new(
|
||||
self.conf.workdir.clone(),
|
||||
self.conf.timeline_dir(&tli.ttid),
|
||||
&persisted_state,
|
||||
start_pos,
|
||||
self.conf.is_wal_backup_enabled(),
|
||||
)?;
|
||||
let wal_reader = tli.get_walreader(start_pos).await?;
|
||||
|
||||
// Split to concurrently receive and send data; replies are generally
|
||||
// not synchronized with sends, so this avoids deadlocks.
|
||||
@@ -532,7 +527,7 @@ impl EndWatch {
|
||||
/// A half driving sending WAL.
|
||||
struct WalSender<'a, IO> {
|
||||
pgb: &'a mut PostgresBackend<IO>,
|
||||
tli: Arc<Timeline>,
|
||||
tli: FullAccessTimeline,
|
||||
appname: Option<String>,
|
||||
// Position since which we are sending next chunk.
|
||||
start_pos: Lsn,
|
||||
@@ -741,7 +736,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
struct ReplyReader<IO> {
|
||||
reader: PostgresBackendReader<IO>,
|
||||
ws_guard: Arc<WalSenderGuard>,
|
||||
tli: Arc<Timeline>,
|
||||
tli: FullAccessTimeline,
|
||||
}
|
||||
|
||||
impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
|
||||
|
||||
@@ -3,14 +3,14 @@
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_ffi::XLogSegNo;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::fs;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::TenantId;
|
||||
|
||||
use std::cmp::max;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
@@ -26,7 +26,6 @@ use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo};
|
||||
use crate::safekeeper::{
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn,
|
||||
INVALID_TERM,
|
||||
@@ -38,8 +37,8 @@ use crate::wal_backup::{self};
|
||||
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
|
||||
|
||||
use crate::metrics::FullTimelineInfo;
|
||||
use crate::wal_storage::Storage as wal_storage_iface;
|
||||
use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage};
|
||||
use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
|
||||
use crate::{debug_dump, timeline_manager, wal_storage};
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
/// Things safekeeper should know about timeline state on peers.
|
||||
@@ -169,7 +168,6 @@ pub struct SharedState {
|
||||
pub(crate) sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
|
||||
/// In memory list containing state of peers sent in latest messages from them.
|
||||
pub(crate) peers_info: PeersInfo,
|
||||
pub(crate) last_removed_segno: XLogSegNo,
|
||||
}
|
||||
|
||||
impl SharedState {
|
||||
@@ -197,33 +195,33 @@ impl SharedState {
|
||||
|
||||
// We don't want to write anything to disk, because we may have existing timeline there.
|
||||
// These functions should not change anything on disk.
|
||||
let timeline_dir = conf.timeline_dir(ttid);
|
||||
let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?;
|
||||
let timeline_dir = get_timeline_dir(conf, ttid);
|
||||
let control_store =
|
||||
control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?;
|
||||
let wal_store =
|
||||
wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
|
||||
wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
|
||||
let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
|
||||
|
||||
Ok(Self {
|
||||
sk,
|
||||
peers_info: PeersInfo(vec![]),
|
||||
last_removed_segno: 0,
|
||||
})
|
||||
}
|
||||
|
||||
/// Restore SharedState from control file. If file doesn't exist, bails out.
|
||||
fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
|
||||
let timeline_dir = get_timeline_dir(conf, ttid);
|
||||
let control_store = control_file::FileStorage::restore_new(ttid, conf)?;
|
||||
if control_store.server.wal_seg_size == 0 {
|
||||
bail!(TimelineError::UninitializedWalSegSize(*ttid));
|
||||
}
|
||||
|
||||
let wal_store =
|
||||
wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
|
||||
wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
|
||||
|
||||
Ok(Self {
|
||||
sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
|
||||
peers_info: PeersInfo(vec![]),
|
||||
last_removed_segno: 0,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -244,7 +242,7 @@ impl SharedState {
|
||||
timeline_id: ttid.timeline_id.as_ref().to_owned(),
|
||||
}),
|
||||
term: self.sk.state.acceptor_state.term,
|
||||
last_log_term: self.sk.get_epoch(),
|
||||
last_log_term: self.sk.get_last_log_term(),
|
||||
flush_lsn: self.sk.flush_lsn().0,
|
||||
// note: this value is not flushed to control file yet and can be lost
|
||||
commit_lsn: self.sk.state.inmem.commit_lsn.0,
|
||||
@@ -275,24 +273,6 @@ impl SharedState {
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Get oldest segno we still need to keep. We hold WAL till it is consumed
|
||||
/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
|
||||
/// offloading.
|
||||
/// While it is safe to use inmem values for determining horizon,
|
||||
/// we use persistent to make possible normal states less surprising.
|
||||
fn get_horizon_segno(&self, extra_horizon_lsn: Option<Lsn>) -> XLogSegNo {
|
||||
let state = &self.sk.state;
|
||||
|
||||
use std::cmp::min;
|
||||
let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn);
|
||||
// we don't want to remove WAL that is not yet offloaded to s3
|
||||
horizon_lsn = min(horizon_lsn, state.backup_lsn);
|
||||
if let Some(extra_horizon_lsn) = extra_horizon_lsn {
|
||||
horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
|
||||
}
|
||||
horizon_lsn.segment_number(state.server.wal_seg_size as usize)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -349,22 +329,15 @@ pub struct Timeline {
|
||||
mutex: RwLock<SharedState>,
|
||||
walsenders: Arc<WalSenders>,
|
||||
walreceivers: Arc<WalReceivers>,
|
||||
timeline_dir: Utf8PathBuf,
|
||||
|
||||
/// Delete/cancel will trigger this, background tasks should drop out as soon as it fires
|
||||
pub(crate) cancel: CancellationToken,
|
||||
|
||||
/// Directory where timeline state is stored.
|
||||
pub timeline_dir: Utf8PathBuf,
|
||||
|
||||
/// Should we keep WAL on disk for active replication connections.
|
||||
/// Especially useful for sharding, when different shards process WAL
|
||||
/// with different speed.
|
||||
// TODO: add `Arc<SafeKeeperConf>` here instead of adding each field separately.
|
||||
walsenders_keep_horizon: bool,
|
||||
|
||||
// timeline_manager controlled state
|
||||
pub(crate) broker_active: AtomicBool,
|
||||
pub(crate) wal_backup_active: AtomicBool,
|
||||
pub(crate) last_removed_segno: AtomicU64,
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
@@ -394,10 +367,10 @@ impl Timeline {
|
||||
walsenders: WalSenders::new(walreceivers.clone()),
|
||||
walreceivers,
|
||||
cancel: CancellationToken::default(),
|
||||
timeline_dir: conf.timeline_dir(&ttid),
|
||||
walsenders_keep_horizon: conf.walsenders_keep_horizon,
|
||||
timeline_dir: get_timeline_dir(conf, &ttid),
|
||||
broker_active: AtomicBool::new(false),
|
||||
wal_backup_active: AtomicBool::new(false),
|
||||
last_removed_segno: AtomicU64::new(0),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -430,10 +403,10 @@ impl Timeline {
|
||||
walsenders: WalSenders::new(walreceivers.clone()),
|
||||
walreceivers,
|
||||
cancel: CancellationToken::default(),
|
||||
timeline_dir: conf.timeline_dir(&ttid),
|
||||
walsenders_keep_horizon: conf.walsenders_keep_horizon,
|
||||
timeline_dir: get_timeline_dir(conf, &ttid),
|
||||
broker_active: AtomicBool::new(false),
|
||||
wal_backup_active: AtomicBool::new(false),
|
||||
last_removed_segno: AtomicU64::new(0),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -494,15 +467,6 @@ impl Timeline {
|
||||
conf.clone(),
|
||||
broker_active_set,
|
||||
));
|
||||
|
||||
// Start recovery task which always runs on the timeline.
|
||||
if conf.peer_recovery_enabled {
|
||||
tokio::spawn(recovery_main(self.clone(), conf.clone()));
|
||||
}
|
||||
// TODO: migrate to timeline_manager
|
||||
if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
|
||||
tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
/// Delete timeline from disk completely, by removing timeline directory.
|
||||
@@ -555,36 +519,6 @@ impl Timeline {
|
||||
self.mutex.read().await
|
||||
}
|
||||
|
||||
/// Returns true if walsender should stop sending WAL to pageserver. We
|
||||
/// terminate it if remote_consistent_lsn reached commit_lsn and there is no
|
||||
/// computes. While there might be nothing to stream already, we learn about
|
||||
/// remote_consistent_lsn update through replication feedback, and we want
|
||||
/// to stop pushing to the broker if pageserver is fully caughtup.
|
||||
pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
|
||||
if self.is_cancelled() {
|
||||
return true;
|
||||
}
|
||||
let shared_state = self.read_shared_state().await;
|
||||
if self.walreceivers.get_num() == 0 {
|
||||
return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
|
||||
reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// Ensure that current term is t, erroring otherwise, and lock the state.
|
||||
pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
|
||||
let ss = self.read_shared_state().await;
|
||||
if ss.sk.state.acceptor_state.term != t {
|
||||
bail!(
|
||||
"failed to acquire term {}, current term {}",
|
||||
t,
|
||||
ss.sk.state.acceptor_state.term
|
||||
);
|
||||
}
|
||||
Ok(ss)
|
||||
}
|
||||
|
||||
/// Returns commit_lsn watch channel.
|
||||
pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
|
||||
self.commit_lsn_watch_rx.clone()
|
||||
@@ -600,28 +534,6 @@ impl Timeline {
|
||||
self.shared_state_version_rx.clone()
|
||||
}
|
||||
|
||||
/// Pass arrived message to the safekeeper.
|
||||
pub async fn process_msg(
|
||||
self: &Arc<Self>,
|
||||
msg: &ProposerAcceptorMessage,
|
||||
) -> Result<Option<AcceptorProposerMessage>> {
|
||||
if self.is_cancelled() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
|
||||
let mut rmsg: Option<AcceptorProposerMessage>;
|
||||
{
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
rmsg = shared_state.sk.process_msg(msg).await?;
|
||||
|
||||
// if this is AppendResponse, fill in proper hot standby feedback.
|
||||
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
|
||||
resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
|
||||
}
|
||||
}
|
||||
Ok(rmsg)
|
||||
}
|
||||
|
||||
/// Returns wal_seg_size.
|
||||
pub async fn get_wal_seg_size(&self) -> usize {
|
||||
self.read_shared_state().await.get_wal_seg_size()
|
||||
@@ -672,97 +584,11 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update in memory remote consistent lsn.
|
||||
pub async fn update_remote_consistent_lsn(self: &Arc<Self>, candidate: Lsn) {
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
shared_state.sk.state.inmem.remote_consistent_lsn =
|
||||
max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
|
||||
}
|
||||
|
||||
pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
|
||||
let shared_state = self.read_shared_state().await;
|
||||
shared_state.get_peers(conf.heartbeat_timeout)
|
||||
}
|
||||
|
||||
/// Should we start fetching WAL from a peer safekeeper, and if yes, from
|
||||
/// which? Answer is yes, i.e. .donors is not empty if 1) there is something
|
||||
/// to fetch, and we can do that without running elections; 2) there is no
|
||||
/// actively streaming compute, as we don't want to compete with it.
|
||||
///
|
||||
/// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal
|
||||
/// to its last_log_term so we are sure such a leader ever had been elected.
|
||||
///
|
||||
/// All possible donors are returned so that we could keep connection to the
|
||||
/// current one if it is good even if it slightly lags behind.
|
||||
///
|
||||
/// Note that term conditions above might be not met, but safekeepers are
|
||||
/// still not aligned on last flush_lsn. Generally in this case until
|
||||
/// elections are run it is not possible to say which safekeeper should
|
||||
/// recover from which one -- history which would be committed is different
|
||||
/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
|
||||
/// Thus we don't try to predict it here.
|
||||
pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
|
||||
let ss = self.read_shared_state().await;
|
||||
let term = ss.sk.state.acceptor_state.term;
|
||||
let last_log_term = ss.sk.get_epoch();
|
||||
let flush_lsn = ss.sk.flush_lsn();
|
||||
// note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
|
||||
let mut peers = ss.get_peers(heartbeat_timeout);
|
||||
// Sort by <last log term, lsn> pairs.
|
||||
peers.sort_by(|p1, p2| {
|
||||
let tl1 = TermLsn {
|
||||
term: p1.last_log_term,
|
||||
lsn: p1.flush_lsn,
|
||||
};
|
||||
let tl2 = TermLsn {
|
||||
term: p2.last_log_term,
|
||||
lsn: p2.flush_lsn,
|
||||
};
|
||||
tl2.cmp(&tl1) // desc
|
||||
});
|
||||
let num_streaming_computes = self.walreceivers.get_num_streaming();
|
||||
let donors = if num_streaming_computes > 0 {
|
||||
vec![] // If there is a streaming compute, don't try to recover to not intervene.
|
||||
} else {
|
||||
peers
|
||||
.iter()
|
||||
.filter_map(|candidate| {
|
||||
// Are we interested in this candidate?
|
||||
let candidate_tl = TermLsn {
|
||||
term: candidate.last_log_term,
|
||||
lsn: candidate.flush_lsn,
|
||||
};
|
||||
let my_tl = TermLsn {
|
||||
term: last_log_term,
|
||||
lsn: flush_lsn,
|
||||
};
|
||||
if my_tl < candidate_tl {
|
||||
// Yes, we are interested. Can we pull from it without
|
||||
// (re)running elections? It is possible if 1) his term
|
||||
// is equal to his last_log_term so we could act on
|
||||
// behalf of leader of this term (we must be sure he was
|
||||
// ever elected) and 2) our term is not higher, or we'll refuse data.
|
||||
if candidate.term == candidate.last_log_term && candidate.term >= term {
|
||||
Some(Donor::from(candidate))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
RecoveryNeededInfo {
|
||||
term,
|
||||
last_log_term,
|
||||
flush_lsn,
|
||||
peers,
|
||||
num_streaming_computes,
|
||||
donors,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_walsenders(&self) -> &Arc<WalSenders> {
|
||||
&self.walsenders
|
||||
}
|
||||
@@ -776,58 +602,6 @@ impl Timeline {
|
||||
self.read_shared_state().await.sk.wal_store.flush_lsn()
|
||||
}
|
||||
|
||||
/// Delete WAL segments from disk that are no longer needed. This is determined
|
||||
/// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
|
||||
pub async fn remove_old_wal(self: &Arc<Self>) -> Result<()> {
|
||||
if self.is_cancelled() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
|
||||
// If enabled, we use LSN of the most lagging walsender as a WAL removal horizon.
|
||||
// This allows to get better read speed for pageservers that are lagging behind,
|
||||
// at the cost of keeping more WAL on disk.
|
||||
let replication_horizon_lsn = if self.walsenders_keep_horizon {
|
||||
self.walsenders.laggard_lsn()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let horizon_segno: XLogSegNo;
|
||||
let remover = {
|
||||
let shared_state = self.read_shared_state().await;
|
||||
horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn);
|
||||
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
|
||||
return Ok(()); // nothing to do
|
||||
}
|
||||
|
||||
// release the lock before removing
|
||||
shared_state.sk.wal_store.remove_up_to(horizon_segno - 1)
|
||||
};
|
||||
|
||||
// delete old WAL files
|
||||
remover.await?;
|
||||
|
||||
// update last_removed_segno
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
if shared_state.last_removed_segno != horizon_segno {
|
||||
shared_state.last_removed_segno = horizon_segno;
|
||||
} else {
|
||||
shared_state.skip_update = true;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Persist control file if there is something to save and enough time
|
||||
/// passed after the last save. This helps to keep remote_consistent_lsn up
|
||||
/// to date so that storage nodes restart doesn't cause many pageserver ->
|
||||
/// safekeeper reconnections.
|
||||
pub async fn maybe_persist_control_file(self: &Arc<Self>, force: bool) -> Result<()> {
|
||||
let mut guard = self.write_shared_state().await;
|
||||
let changed = guard.sk.maybe_persist_inmem_control_file(force).await?;
|
||||
guard.skip_update = !changed;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gather timeline data for metrics.
|
||||
pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
|
||||
if self.is_cancelled() {
|
||||
@@ -843,8 +617,8 @@ impl Timeline {
|
||||
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
|
||||
timeline_is_active: self.broker_active.load(Ordering::Relaxed),
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
last_removed_segno: state.last_removed_segno,
|
||||
epoch_start_lsn: state.sk.epoch_start_lsn,
|
||||
last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
|
||||
epoch_start_lsn: state.sk.term_start_lsn,
|
||||
mem_state: state.sk.state.inmem.clone(),
|
||||
persisted_state: state.sk.state.clone(),
|
||||
flush_lsn: state.sk.wal_store.flush_lsn(),
|
||||
@@ -866,8 +640,8 @@ impl Timeline {
|
||||
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
|
||||
active: self.broker_active.load(Ordering::Relaxed),
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
last_removed_segno: state.last_removed_segno,
|
||||
epoch_start_lsn: state.sk.epoch_start_lsn,
|
||||
last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
|
||||
epoch_start_lsn: state.sk.term_start_lsn,
|
||||
mem_state: state.sk.state.inmem.clone(),
|
||||
write_lsn,
|
||||
write_record_lsn,
|
||||
@@ -889,6 +663,110 @@ impl Timeline {
|
||||
state.sk.state.finish_change(&persistent_state).await?;
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
/// Get the timeline guard for reading/writing WAL files.
|
||||
/// TODO: if WAL files are not present on disk (evicted), they will be
|
||||
/// downloaded from S3. Also there will logic for preventing eviction
|
||||
/// while someone is holding FullAccessTimeline guard.
|
||||
pub async fn full_access_guard(self: &Arc<Self>) -> Result<FullAccessTimeline> {
|
||||
if self.is_cancelled() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
Ok(FullAccessTimeline { tli: self.clone() })
|
||||
}
|
||||
}
|
||||
|
||||
/// This is a guard that allows to read/write disk timeline state.
|
||||
/// All tasks that are using the disk should use this guard.
|
||||
#[derive(Clone)]
|
||||
pub struct FullAccessTimeline {
|
||||
pub tli: Arc<Timeline>,
|
||||
}
|
||||
|
||||
impl Deref for FullAccessTimeline {
|
||||
type Target = Arc<Timeline>;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.tli
|
||||
}
|
||||
}
|
||||
|
||||
impl FullAccessTimeline {
|
||||
/// Returns true if walsender should stop sending WAL to pageserver. We
|
||||
/// terminate it if remote_consistent_lsn reached commit_lsn and there is no
|
||||
/// computes. While there might be nothing to stream already, we learn about
|
||||
/// remote_consistent_lsn update through replication feedback, and we want
|
||||
/// to stop pushing to the broker if pageserver is fully caughtup.
|
||||
pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
|
||||
if self.is_cancelled() {
|
||||
return true;
|
||||
}
|
||||
let shared_state = self.read_shared_state().await;
|
||||
if self.walreceivers.get_num() == 0 {
|
||||
return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
|
||||
reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// Ensure that current term is t, erroring otherwise, and lock the state.
|
||||
pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
|
||||
let ss = self.read_shared_state().await;
|
||||
if ss.sk.state.acceptor_state.term != t {
|
||||
bail!(
|
||||
"failed to acquire term {}, current term {}",
|
||||
t,
|
||||
ss.sk.state.acceptor_state.term
|
||||
);
|
||||
}
|
||||
Ok(ss)
|
||||
}
|
||||
|
||||
/// Pass arrived message to the safekeeper.
|
||||
pub async fn process_msg(
|
||||
&self,
|
||||
msg: &ProposerAcceptorMessage,
|
||||
) -> Result<Option<AcceptorProposerMessage>> {
|
||||
if self.is_cancelled() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
|
||||
let mut rmsg: Option<AcceptorProposerMessage>;
|
||||
{
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
rmsg = shared_state.sk.process_msg(msg).await?;
|
||||
|
||||
// if this is AppendResponse, fill in proper hot standby feedback.
|
||||
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
|
||||
resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
|
||||
}
|
||||
}
|
||||
Ok(rmsg)
|
||||
}
|
||||
|
||||
pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
|
||||
let (_, persisted_state) = self.get_state().await;
|
||||
let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled();
|
||||
|
||||
WalReader::new(
|
||||
&self.ttid,
|
||||
self.timeline_dir.clone(),
|
||||
&persisted_state,
|
||||
start_lsn,
|
||||
enable_remote_read,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn get_timeline_dir(&self) -> Utf8PathBuf {
|
||||
self.timeline_dir.clone()
|
||||
}
|
||||
|
||||
/// Update in memory remote consistent lsn.
|
||||
pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) {
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
shared_state.sk.state.inmem.remote_consistent_lsn =
|
||||
max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
|
||||
}
|
||||
}
|
||||
|
||||
/// Deletes directory and it's contents. Returns false if directory does not exist.
|
||||
@@ -899,3 +777,16 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result<bool> {
|
||||
Err(e) => Err(e.into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a path to the tenant directory. If you just need to get a timeline directory,
|
||||
/// use FullAccessTimeline::get_timeline_dir instead.
|
||||
pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||
conf.workdir.join(tenant_id.to_string())
|
||||
}
|
||||
|
||||
/// Get a path to the timeline directory. If you need to read WAL files from disk,
|
||||
/// use FullAccessTimeline::get_timeline_dir instead. This function does not check
|
||||
/// timeline eviction status and WAL files might not be present on disk.
|
||||
pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf {
|
||||
get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string())
|
||||
}
|
||||
|
||||
@@ -3,23 +3,42 @@
|
||||
//! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
|
||||
//! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
|
||||
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use std::{
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use tracing::{info, instrument, warn};
|
||||
use postgres_ffi::XLogSegNo;
|
||||
use tokio::task::{JoinError, JoinHandle};
|
||||
use tracing::{info, info_span, instrument, warn, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
control_file::Storage,
|
||||
metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL},
|
||||
recovery::recovery_main,
|
||||
remove_wal::calc_horizon_lsn,
|
||||
send_wal::WalSenders,
|
||||
timeline::{PeerInfo, ReadGuardSharedState, Timeline},
|
||||
timelines_set::TimelinesSet,
|
||||
timelines_set::{TimelineSetGuard, TimelinesSet},
|
||||
wal_backup::{self, WalBackupTaskHandle},
|
||||
SafeKeeperConf,
|
||||
wal_backup_partial, SafeKeeperConf,
|
||||
};
|
||||
|
||||
pub struct StateSnapshot {
|
||||
// inmem values
|
||||
pub commit_lsn: Lsn,
|
||||
pub backup_lsn: Lsn,
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
|
||||
// persistent control file values
|
||||
pub cfile_peer_horizon_lsn: Lsn,
|
||||
pub cfile_remote_consistent_lsn: Lsn,
|
||||
pub cfile_backup_lsn: Lsn,
|
||||
|
||||
// misc
|
||||
pub cfile_last_persist_at: Instant,
|
||||
pub inmem_flush_pending: bool,
|
||||
pub peers: Vec<PeerInfo>,
|
||||
}
|
||||
|
||||
@@ -30,17 +49,34 @@ impl StateSnapshot {
|
||||
commit_lsn: read_guard.sk.state.inmem.commit_lsn,
|
||||
backup_lsn: read_guard.sk.state.inmem.backup_lsn,
|
||||
remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn,
|
||||
cfile_peer_horizon_lsn: read_guard.sk.state.peer_horizon_lsn,
|
||||
cfile_remote_consistent_lsn: read_guard.sk.state.remote_consistent_lsn,
|
||||
cfile_backup_lsn: read_guard.sk.state.backup_lsn,
|
||||
cfile_last_persist_at: read_guard.sk.state.pers.last_persist_at(),
|
||||
inmem_flush_pending: Self::has_unflushed_inmem_state(&read_guard),
|
||||
peers: read_guard.get_peers(heartbeat_timeout),
|
||||
}
|
||||
}
|
||||
|
||||
fn has_unflushed_inmem_state(read_guard: &ReadGuardSharedState) -> bool {
|
||||
let state = &read_guard.sk.state;
|
||||
state.inmem.commit_lsn > state.commit_lsn
|
||||
|| state.inmem.backup_lsn > state.backup_lsn
|
||||
|| state.inmem.peer_horizon_lsn > state.peer_horizon_lsn
|
||||
|| state.inmem.remote_consistent_lsn > state.remote_consistent_lsn
|
||||
}
|
||||
}
|
||||
|
||||
/// Control how often the manager task should wake up to check updates.
|
||||
/// There is no need to check for updates more often than this.
|
||||
const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
|
||||
|
||||
/// How often to save the control file if the is no other activity.
|
||||
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
|
||||
|
||||
/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
|
||||
/// background tasks.
|
||||
/// Be careful, this task is not respawned on panic, so it should not panic.
|
||||
#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn main_task(
|
||||
tli: Arc<Timeline>,
|
||||
@@ -55,20 +91,50 @@ pub async fn main_task(
|
||||
}
|
||||
};
|
||||
|
||||
// sets whether timeline is active for broker pushes or not
|
||||
let mut tli_broker_active = broker_active_set.guard(tli.clone());
|
||||
|
||||
let ttid = tli.ttid;
|
||||
// configuration & dependencies
|
||||
let wal_seg_size = tli.get_wal_seg_size().await;
|
||||
let heartbeat_timeout = conf.heartbeat_timeout;
|
||||
|
||||
let mut state_version_rx = tli.get_state_version_rx();
|
||||
|
||||
let walsenders = tli.get_walsenders();
|
||||
let walreceivers = tli.get_walreceivers();
|
||||
|
||||
// current state
|
||||
let mut state_version_rx = tli.get_state_version_rx();
|
||||
let mut num_computes_rx = walreceivers.get_num_rx();
|
||||
let mut tli_broker_active = broker_active_set.guard(tli.clone());
|
||||
let mut last_removed_segno = 0 as XLogSegNo;
|
||||
|
||||
// list of background tasks
|
||||
let mut backup_task: Option<WalBackupTaskHandle> = None;
|
||||
let mut recovery_task: Option<JoinHandle<()>> = None;
|
||||
let mut partial_backup_task: Option<JoinHandle<()>> = None;
|
||||
let mut wal_removal_task: Option<JoinHandle<anyhow::Result<u64>>> = None;
|
||||
|
||||
// Start recovery task which always runs on the timeline.
|
||||
if conf.peer_recovery_enabled {
|
||||
match tli.full_access_guard().await {
|
||||
Ok(tli) => {
|
||||
recovery_task = Some(tokio::spawn(recovery_main(tli, conf.clone())));
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("failed to start recovery task: {:?}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Start partial backup task which always runs on the timeline.
|
||||
if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
|
||||
match tli.full_access_guard().await {
|
||||
Ok(tli) => {
|
||||
partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
|
||||
tli,
|
||||
conf.clone(),
|
||||
)));
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("failed to start partial backup task: {:?}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let last_state = 'outer: loop {
|
||||
MANAGER_ITERATIONS_TOTAL.inc();
|
||||
@@ -76,47 +142,36 @@ pub async fn main_task(
|
||||
let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout);
|
||||
let num_computes = *num_computes_rx.borrow();
|
||||
|
||||
let is_wal_backup_required =
|
||||
wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot);
|
||||
let is_wal_backup_required = update_backup(
|
||||
&conf,
|
||||
&tli,
|
||||
wal_seg_size,
|
||||
num_computes,
|
||||
&state_snapshot,
|
||||
&mut backup_task,
|
||||
)
|
||||
.await;
|
||||
|
||||
if conf.is_wal_backup_enabled() {
|
||||
wal_backup::update_task(
|
||||
&conf,
|
||||
ttid,
|
||||
is_wal_backup_required,
|
||||
&state_snapshot,
|
||||
&mut backup_task,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
let _is_active = update_is_active(
|
||||
is_wal_backup_required,
|
||||
num_computes,
|
||||
&state_snapshot,
|
||||
&mut tli_broker_active,
|
||||
&tli,
|
||||
);
|
||||
|
||||
let is_active = is_wal_backup_required
|
||||
|| num_computes > 0
|
||||
|| state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn;
|
||||
let next_cfile_save = update_control_file_save(&state_snapshot, &tli).await;
|
||||
|
||||
// update the broker timeline set
|
||||
if tli_broker_active.set(is_active) {
|
||||
// write log if state has changed
|
||||
info!(
|
||||
"timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
|
||||
is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn,
|
||||
);
|
||||
|
||||
MANAGER_ACTIVE_CHANGES.inc();
|
||||
|
||||
if !is_active {
|
||||
// TODO: maybe use tokio::spawn?
|
||||
if let Err(e) = tli.maybe_persist_control_file(false).await {
|
||||
warn!("control file save in update_status failed: {:?}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// update the state in Arc<Timeline>
|
||||
tli.wal_backup_active
|
||||
.store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
|
||||
tli.broker_active
|
||||
.store(is_active, std::sync::atomic::Ordering::Relaxed);
|
||||
update_wal_removal(
|
||||
&conf,
|
||||
walsenders,
|
||||
&tli,
|
||||
wal_seg_size,
|
||||
&state_snapshot,
|
||||
last_removed_segno,
|
||||
&mut wal_removal_task,
|
||||
)
|
||||
.await;
|
||||
|
||||
// wait until something changes. tx channels are stored under Arc, so they will not be
|
||||
// dropped until the manager task is finished.
|
||||
@@ -135,11 +190,192 @@ pub async fn main_task(
|
||||
_ = num_computes_rx.changed() => {
|
||||
// number of connected computes was updated
|
||||
}
|
||||
_ = async {
|
||||
if let Some(timeout) = next_cfile_save {
|
||||
tokio::time::sleep_until(timeout).await
|
||||
} else {
|
||||
futures::future::pending().await
|
||||
}
|
||||
} => {
|
||||
// it's time to save the control file
|
||||
}
|
||||
res = async {
|
||||
if let Some(task) = &mut wal_removal_task {
|
||||
task.await
|
||||
} else {
|
||||
futures::future::pending().await
|
||||
}
|
||||
} => {
|
||||
// WAL removal task finished
|
||||
wal_removal_task = None;
|
||||
update_wal_removal_end(res, &tli, &mut last_removed_segno);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// shutdown background tasks
|
||||
if conf.is_wal_backup_enabled() {
|
||||
wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await;
|
||||
wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await;
|
||||
}
|
||||
|
||||
if let Some(recovery_task) = recovery_task {
|
||||
if let Err(e) = recovery_task.await {
|
||||
warn!("recovery task failed: {:?}", e);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(partial_backup_task) = partial_backup_task {
|
||||
if let Err(e) = partial_backup_task.await {
|
||||
warn!("partial backup task failed: {:?}", e);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(wal_removal_task) = wal_removal_task {
|
||||
let res = wal_removal_task.await;
|
||||
update_wal_removal_end(res, &tli, &mut last_removed_segno);
|
||||
}
|
||||
}
|
||||
|
||||
/// Spawns/kills backup task and returns true if backup is required.
|
||||
async fn update_backup(
|
||||
conf: &SafeKeeperConf,
|
||||
tli: &Arc<Timeline>,
|
||||
wal_seg_size: usize,
|
||||
num_computes: usize,
|
||||
state: &StateSnapshot,
|
||||
backup_task: &mut Option<WalBackupTaskHandle>,
|
||||
) -> bool {
|
||||
let is_wal_backup_required =
|
||||
wal_backup::is_wal_backup_required(wal_seg_size, num_computes, state);
|
||||
|
||||
if conf.is_wal_backup_enabled() {
|
||||
wal_backup::update_task(conf, tli, is_wal_backup_required, state, backup_task).await;
|
||||
}
|
||||
|
||||
// update the state in Arc<Timeline>
|
||||
tli.wal_backup_active
|
||||
.store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
|
||||
is_wal_backup_required
|
||||
}
|
||||
|
||||
/// Update is_active flag and returns its value.
|
||||
fn update_is_active(
|
||||
is_wal_backup_required: bool,
|
||||
num_computes: usize,
|
||||
state: &StateSnapshot,
|
||||
tli_broker_active: &mut TimelineSetGuard,
|
||||
tli: &Arc<Timeline>,
|
||||
) -> bool {
|
||||
let is_active = is_wal_backup_required
|
||||
|| num_computes > 0
|
||||
|| state.remote_consistent_lsn < state.commit_lsn;
|
||||
|
||||
// update the broker timeline set
|
||||
if tli_broker_active.set(is_active) {
|
||||
// write log if state has changed
|
||||
info!(
|
||||
"timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
|
||||
is_active, state.remote_consistent_lsn, state.commit_lsn,
|
||||
);
|
||||
|
||||
MANAGER_ACTIVE_CHANGES.inc();
|
||||
}
|
||||
|
||||
// update the state in Arc<Timeline>
|
||||
tli.broker_active
|
||||
.store(is_active, std::sync::atomic::Ordering::Relaxed);
|
||||
is_active
|
||||
}
|
||||
|
||||
/// Save control file if needed. Returns Instant if we should persist the control file in the future.
|
||||
async fn update_control_file_save(
|
||||
state: &StateSnapshot,
|
||||
tli: &Arc<Timeline>,
|
||||
) -> Option<tokio::time::Instant> {
|
||||
if !state.inmem_flush_pending {
|
||||
return None;
|
||||
}
|
||||
|
||||
if state.cfile_last_persist_at.elapsed() > CF_SAVE_INTERVAL {
|
||||
let mut write_guard = tli.write_shared_state().await;
|
||||
// this can be done in the background because it blocks manager task, but flush() should
|
||||
// be fast enough not to be a problem now
|
||||
if let Err(e) = write_guard.sk.state.flush().await {
|
||||
warn!("failed to save control file: {:?}", e);
|
||||
}
|
||||
|
||||
None
|
||||
} else {
|
||||
// we should wait until next CF_SAVE_INTERVAL
|
||||
Some((state.cfile_last_persist_at + CF_SAVE_INTERVAL).into())
|
||||
}
|
||||
}
|
||||
|
||||
/// Spawns WAL removal task if needed.
|
||||
async fn update_wal_removal(
|
||||
conf: &SafeKeeperConf,
|
||||
walsenders: &Arc<WalSenders>,
|
||||
tli: &Arc<Timeline>,
|
||||
wal_seg_size: usize,
|
||||
state: &StateSnapshot,
|
||||
last_removed_segno: u64,
|
||||
wal_removal_task: &mut Option<JoinHandle<anyhow::Result<u64>>>,
|
||||
) {
|
||||
if wal_removal_task.is_some() {
|
||||
// WAL removal is already in progress
|
||||
return;
|
||||
}
|
||||
|
||||
// If enabled, we use LSN of the most lagging walsender as a WAL removal horizon.
|
||||
// This allows to get better read speed for pageservers that are lagging behind,
|
||||
// at the cost of keeping more WAL on disk.
|
||||
let replication_horizon_lsn = if conf.walsenders_keep_horizon {
|
||||
walsenders.laggard_lsn()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn);
|
||||
let removal_horizon_segno = removal_horizon_lsn
|
||||
.segment_number(wal_seg_size)
|
||||
.saturating_sub(1);
|
||||
|
||||
if removal_horizon_segno > last_removed_segno {
|
||||
// we need to remove WAL
|
||||
let remover = crate::wal_storage::Storage::remove_up_to(
|
||||
&tli.read_shared_state().await.sk.wal_store,
|
||||
removal_horizon_segno,
|
||||
);
|
||||
*wal_removal_task = Some(tokio::spawn(
|
||||
async move {
|
||||
remover.await?;
|
||||
Ok(removal_horizon_segno)
|
||||
}
|
||||
.instrument(info_span!("WAL removal", ttid=%tli.ttid)),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the state after WAL removal task finished.
|
||||
fn update_wal_removal_end(
|
||||
res: Result<anyhow::Result<u64>, JoinError>,
|
||||
tli: &Arc<Timeline>,
|
||||
last_removed_segno: &mut u64,
|
||||
) {
|
||||
let new_last_removed_segno = match res {
|
||||
Ok(Ok(segno)) => segno,
|
||||
Err(e) => {
|
||||
warn!("WAL removal task failed: {:?}", e);
|
||||
return;
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
warn!("WAL removal task failed: {:?}", e);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
*last_removed_segno = new_last_removed_segno;
|
||||
// update the state in Arc<Timeline>
|
||||
tli.last_removed_segno
|
||||
.store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
//! all from the disk on startup and keeping them in memory.
|
||||
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::timeline::{Timeline, TimelineError};
|
||||
use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
|
||||
use crate::timelines_set::TimelinesSet;
|
||||
use crate::SafeKeeperConf;
|
||||
use anyhow::{bail, Context, Result};
|
||||
@@ -127,7 +127,7 @@ impl GlobalTimelines {
|
||||
state.get_dependencies()
|
||||
};
|
||||
|
||||
let timelines_dir = conf.tenant_dir(&tenant_id);
|
||||
let timelines_dir = get_tenant_dir(&conf, &tenant_id);
|
||||
for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
|
||||
.with_context(|| format!("failed to list timelines dir {}", timelines_dir))?
|
||||
{
|
||||
@@ -348,11 +348,7 @@ impl GlobalTimelines {
|
||||
}
|
||||
Err(_) => {
|
||||
// Timeline is not memory, but it may still exist on disk in broken state.
|
||||
let dir_path = TIMELINES_STATE
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get_conf()
|
||||
.timeline_dir(ttid);
|
||||
let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid);
|
||||
let dir_existed = delete_dir(dir_path)?;
|
||||
|
||||
Ok(TimelineDeleteForceResult {
|
||||
@@ -401,13 +397,10 @@ impl GlobalTimelines {
|
||||
// Note that we could concurrently create new timelines while we were deleting them,
|
||||
// so the directory may be not empty. In this case timelines will have bad state
|
||||
// and timeline background jobs can panic.
|
||||
delete_dir(
|
||||
TIMELINES_STATE
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get_conf()
|
||||
.tenant_dir(tenant_id),
|
||||
)?;
|
||||
delete_dir(get_tenant_dir(
|
||||
TIMELINES_STATE.lock().unwrap().get_conf(),
|
||||
tenant_id,
|
||||
))?;
|
||||
|
||||
// FIXME: we temporarily disabled removing timelines from the map, see `delete_force`
|
||||
// let tlis_after_delete = Self::get_all_for_tenant(*tenant_id);
|
||||
|
||||
@@ -30,9 +30,9 @@ use tracing::*;
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
|
||||
use crate::timeline::{PeerInfo, Timeline};
|
||||
use crate::timeline::{FullAccessTimeline, PeerInfo, Timeline};
|
||||
use crate::timeline_manager::StateSnapshot;
|
||||
use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME};
|
||||
use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME};
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
|
||||
@@ -63,13 +63,13 @@ pub fn is_wal_backup_required(
|
||||
/// is running, kill it.
|
||||
pub async fn update_task(
|
||||
conf: &SafeKeeperConf,
|
||||
ttid: TenantTimelineId,
|
||||
tli: &Arc<Timeline>,
|
||||
need_backup: bool,
|
||||
state: &StateSnapshot,
|
||||
entry: &mut Option<WalBackupTaskHandle>,
|
||||
) {
|
||||
let (offloader, election_dbg_str) =
|
||||
determine_offloader(&state.peers, state.backup_lsn, ttid, conf);
|
||||
determine_offloader(&state.peers, state.backup_lsn, tli.ttid, conf);
|
||||
let elected_me = Some(conf.my_id) == offloader;
|
||||
|
||||
let should_task_run = need_backup && elected_me;
|
||||
@@ -80,15 +80,8 @@ pub async fn update_task(
|
||||
info!("elected for backup: {}", election_dbg_str);
|
||||
|
||||
let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
|
||||
let timeline_dir = conf.timeline_dir(&ttid);
|
||||
|
||||
let async_task = backup_task_main(
|
||||
ttid,
|
||||
timeline_dir,
|
||||
conf.workdir.clone(),
|
||||
conf.backup_parallel_jobs,
|
||||
shutdown_rx,
|
||||
);
|
||||
let async_task = backup_task_main(tli.clone(), conf.backup_parallel_jobs, shutdown_rx);
|
||||
|
||||
let handle = if conf.current_thread_runtime {
|
||||
tokio::spawn(async_task)
|
||||
@@ -198,39 +191,32 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) {
|
||||
}
|
||||
|
||||
struct WalBackupTask {
|
||||
timeline: Arc<Timeline>,
|
||||
timeline: FullAccessTimeline,
|
||||
timeline_dir: Utf8PathBuf,
|
||||
workspace_dir: Utf8PathBuf,
|
||||
wal_seg_size: usize,
|
||||
parallel_jobs: usize,
|
||||
commit_lsn_watch_rx: watch::Receiver<Lsn>,
|
||||
}
|
||||
|
||||
/// Offload single timeline.
|
||||
#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))]
|
||||
async fn backup_task_main(
|
||||
ttid: TenantTimelineId,
|
||||
timeline_dir: Utf8PathBuf,
|
||||
workspace_dir: Utf8PathBuf,
|
||||
parallel_jobs: usize,
|
||||
mut shutdown_rx: Receiver<()>,
|
||||
) {
|
||||
#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))]
|
||||
async fn backup_task_main(tli: Arc<Timeline>, parallel_jobs: usize, mut shutdown_rx: Receiver<()>) {
|
||||
let _guard = WAL_BACKUP_TASKS.guard();
|
||||
|
||||
let tli = match tli.full_access_guard().await {
|
||||
Ok(tli) => tli,
|
||||
Err(e) => {
|
||||
error!("backup error: {}", e);
|
||||
return;
|
||||
}
|
||||
};
|
||||
info!("started");
|
||||
let res = GlobalTimelines::get(ttid);
|
||||
if let Err(e) = res {
|
||||
error!("backup error: {}", e);
|
||||
return;
|
||||
}
|
||||
let tli = res.unwrap();
|
||||
|
||||
let mut wb = WalBackupTask {
|
||||
wal_seg_size: tli.get_wal_seg_size().await,
|
||||
commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
|
||||
timeline_dir: tli.get_timeline_dir(),
|
||||
timeline: tli,
|
||||
timeline_dir,
|
||||
workspace_dir,
|
||||
parallel_jobs,
|
||||
};
|
||||
|
||||
@@ -297,7 +283,6 @@ impl WalBackupTask {
|
||||
commit_lsn,
|
||||
self.wal_seg_size,
|
||||
&self.timeline_dir,
|
||||
&self.workspace_dir,
|
||||
self.parallel_jobs,
|
||||
)
|
||||
.await
|
||||
@@ -319,18 +304,18 @@ impl WalBackupTask {
|
||||
}
|
||||
|
||||
async fn backup_lsn_range(
|
||||
timeline: &Arc<Timeline>,
|
||||
timeline: &FullAccessTimeline,
|
||||
backup_lsn: &mut Lsn,
|
||||
end_lsn: Lsn,
|
||||
wal_seg_size: usize,
|
||||
timeline_dir: &Utf8Path,
|
||||
workspace_dir: &Utf8Path,
|
||||
parallel_jobs: usize,
|
||||
) -> Result<()> {
|
||||
if parallel_jobs < 1 {
|
||||
anyhow::bail!("parallel_jobs must be >= 1");
|
||||
}
|
||||
|
||||
let remote_timeline_path = remote_timeline_path(&timeline.ttid)?;
|
||||
let start_lsn = *backup_lsn;
|
||||
let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
|
||||
|
||||
@@ -343,7 +328,11 @@ async fn backup_lsn_range(
|
||||
loop {
|
||||
let added_task = match iter.next() {
|
||||
Some(s) => {
|
||||
uploads.push_back(backup_single_segment(s, timeline_dir, workspace_dir));
|
||||
uploads.push_back(backup_single_segment(
|
||||
s,
|
||||
timeline_dir,
|
||||
&remote_timeline_path,
|
||||
));
|
||||
true
|
||||
}
|
||||
None => false,
|
||||
@@ -381,18 +370,10 @@ async fn backup_lsn_range(
|
||||
async fn backup_single_segment(
|
||||
seg: &Segment,
|
||||
timeline_dir: &Utf8Path,
|
||||
workspace_dir: &Utf8Path,
|
||||
remote_timeline_path: &RemotePath,
|
||||
) -> Result<Segment> {
|
||||
let segment_file_path = seg.file_path(timeline_dir)?;
|
||||
let remote_segment_path = segment_file_path
|
||||
.strip_prefix(workspace_dir)
|
||||
.context("Failed to strip workspace dir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}",
|
||||
)
|
||||
})?;
|
||||
let remote_segment_path = seg.remote_path(remote_timeline_path);
|
||||
|
||||
let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await;
|
||||
if res.is_ok() {
|
||||
@@ -430,6 +411,10 @@ impl Segment {
|
||||
Ok(timeline_dir.join(self.object_name()))
|
||||
}
|
||||
|
||||
pub fn remote_path(self, remote_timeline_path: &RemotePath) -> RemotePath {
|
||||
remote_timeline_path.join(self.object_name())
|
||||
}
|
||||
|
||||
pub fn size(self) -> usize {
|
||||
(u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize
|
||||
}
|
||||
@@ -530,8 +515,7 @@ pub async fn read_object(
|
||||
/// when called.
|
||||
pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
|
||||
let storage = get_configured_remote_storage();
|
||||
let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string());
|
||||
let remote_path = RemotePath::new(&ttid_path)?;
|
||||
let remote_path = remote_timeline_path(ttid)?;
|
||||
|
||||
// see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
|
||||
// const Option unwrap is not stable, otherwise it would be const.
|
||||
@@ -613,15 +597,17 @@ pub async fn copy_s3_segments(
|
||||
.as_ref()
|
||||
.unwrap();
|
||||
|
||||
let relative_dst_path =
|
||||
Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string());
|
||||
|
||||
let remote_path = RemotePath::new(&relative_dst_path)?;
|
||||
let remote_dst_path = remote_timeline_path(dst_ttid)?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let files = storage
|
||||
.list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel)
|
||||
.list(
|
||||
Some(&remote_dst_path),
|
||||
ListingMode::NoDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?
|
||||
.keys;
|
||||
|
||||
@@ -635,9 +621,6 @@ pub async fn copy_s3_segments(
|
||||
uploaded_segments
|
||||
);
|
||||
|
||||
let relative_src_path =
|
||||
Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string());
|
||||
|
||||
for segno in from_segment..to_segment {
|
||||
if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 {
|
||||
info!("copied all segments from {} until {}", from_segment, segno);
|
||||
@@ -649,8 +632,8 @@ pub async fn copy_s3_segments(
|
||||
}
|
||||
debug!("copying segment {}", segment_name);
|
||||
|
||||
let from = RemotePath::new(&relative_src_path.join(&segment_name))?;
|
||||
let to = RemotePath::new(&relative_dst_path.join(&segment_name))?;
|
||||
let from = remote_timeline_path(src_ttid)?.join(&segment_name);
|
||||
let to = remote_dst_path.join(&segment_name);
|
||||
|
||||
storage.copy_object(&from, &to, &cancel).await?;
|
||||
}
|
||||
@@ -661,3 +644,8 @@ pub async fn copy_s3_segments(
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get S3 (remote_storage) prefix path used for timeline files.
|
||||
pub fn remote_timeline_path(ttid: &TenantTimelineId) -> Result<RemotePath> {
|
||||
RemotePath::new(&Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string()))
|
||||
}
|
||||
|
||||
@@ -18,22 +18,21 @@
|
||||
//! This way control file stores information about all potentially existing
|
||||
//! remote partial segments and can clean them up after uploading a newer version.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
|
||||
use rand::Rng;
|
||||
use remote_storage::RemotePath;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use tracing::{debug, error, info, instrument};
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
|
||||
safekeeper::Term,
|
||||
timeline::Timeline,
|
||||
wal_backup, SafeKeeperConf,
|
||||
timeline::FullAccessTimeline,
|
||||
wal_backup::{self, remote_timeline_path},
|
||||
SafeKeeperConf,
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
@@ -83,10 +82,10 @@ impl State {
|
||||
|
||||
struct PartialBackup {
|
||||
wal_seg_size: usize,
|
||||
tli: Arc<Timeline>,
|
||||
tli: FullAccessTimeline,
|
||||
conf: SafeKeeperConf,
|
||||
local_prefix: Utf8PathBuf,
|
||||
remote_prefix: Utf8PathBuf,
|
||||
remote_timeline_path: RemotePath,
|
||||
|
||||
state: State,
|
||||
}
|
||||
@@ -153,7 +152,7 @@ impl PartialBackup {
|
||||
let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size);
|
||||
|
||||
let local_path = self.local_prefix.join(self.local_segment_name(segno));
|
||||
let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?;
|
||||
let remote_path = self.remote_timeline_path.join(&prepared.name);
|
||||
|
||||
// Upload first `backup_bytes` bytes of the segment to the remote storage.
|
||||
wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
|
||||
@@ -253,7 +252,7 @@ impl PartialBackup {
|
||||
info!("deleting objects: {:?}", segments_to_delete);
|
||||
let mut objects_to_delete = vec![];
|
||||
for seg in segments_to_delete.iter() {
|
||||
let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?;
|
||||
let remote_path = self.remote_timeline_path.join(seg);
|
||||
objects_to_delete.push(remote_path);
|
||||
}
|
||||
|
||||
@@ -273,7 +272,7 @@ impl PartialBackup {
|
||||
}
|
||||
|
||||
#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) {
|
||||
debug!("started");
|
||||
let await_duration = conf.partial_backup_timeout;
|
||||
|
||||
@@ -289,11 +288,11 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
|
||||
let wal_seg_size = tli.get_wal_seg_size().await;
|
||||
|
||||
let local_prefix = tli.timeline_dir.clone();
|
||||
let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) {
|
||||
Ok(path) => path.to_owned(),
|
||||
let local_prefix = tli.get_timeline_dir();
|
||||
let remote_timeline_path = match remote_timeline_path(&tli.ttid) {
|
||||
Ok(path) => path,
|
||||
Err(e) => {
|
||||
error!("failed to strip workspace dir prefix: {:?}", e);
|
||||
error!("failed to create remote path: {:?}", e);
|
||||
return;
|
||||
}
|
||||
};
|
||||
@@ -304,12 +303,28 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
state: persistent_state.partial_backup,
|
||||
conf,
|
||||
local_prefix,
|
||||
remote_prefix,
|
||||
remote_timeline_path,
|
||||
};
|
||||
|
||||
debug!("state: {:?}", backup.state);
|
||||
|
||||
// The general idea is that each safekeeper keeps only one partial segment
|
||||
// both in remote storage and in local state. If this is not true, something
|
||||
// went wrong.
|
||||
const MAX_SIMULTANEOUS_SEGMENTS: usize = 10;
|
||||
|
||||
'outer: loop {
|
||||
if backup.state.segments.len() > MAX_SIMULTANEOUS_SEGMENTS {
|
||||
warn!(
|
||||
"too many segments in control_file state, running gc: {}",
|
||||
backup.state.segments.len()
|
||||
);
|
||||
|
||||
backup.gc().await.unwrap_or_else(|e| {
|
||||
error!("failed to run gc: {:#}", e);
|
||||
});
|
||||
}
|
||||
|
||||
// wait until we have something to upload
|
||||
let uploaded_segment = backup.state.uploaded_segment();
|
||||
if let Some(seg) = &uploaded_segment {
|
||||
|
||||
@@ -25,7 +25,7 @@ use utils::crashsafe::durable_rename;
|
||||
|
||||
use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
|
||||
use crate::state::TimelinePersistentState;
|
||||
use crate::wal_backup::read_object;
|
||||
use crate::wal_backup::{read_object, remote_timeline_path};
|
||||
use crate::SafeKeeperConf;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use postgres_ffi::XLogFileName;
|
||||
@@ -536,7 +536,7 @@ async fn remove_segments_from_disk(
|
||||
}
|
||||
|
||||
pub struct WalReader {
|
||||
workdir: Utf8PathBuf,
|
||||
remote_path: RemotePath,
|
||||
timeline_dir: Utf8PathBuf,
|
||||
wal_seg_size: usize,
|
||||
pos: Lsn,
|
||||
@@ -558,7 +558,7 @@ pub struct WalReader {
|
||||
|
||||
impl WalReader {
|
||||
pub fn new(
|
||||
workdir: Utf8PathBuf,
|
||||
ttid: &TenantTimelineId,
|
||||
timeline_dir: Utf8PathBuf,
|
||||
state: &TimelinePersistentState,
|
||||
start_pos: Lsn,
|
||||
@@ -586,7 +586,7 @@ impl WalReader {
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
workdir,
|
||||
remote_path: remote_timeline_path(ttid)?,
|
||||
timeline_dir,
|
||||
wal_seg_size: state.server.wal_seg_size as usize,
|
||||
pos: start_pos,
|
||||
@@ -684,7 +684,7 @@ impl WalReader {
|
||||
let xlogoff = self.pos.segment_offset(self.wal_seg_size);
|
||||
let segno = self.pos.segment_number(self.wal_seg_size);
|
||||
let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size);
|
||||
let wal_file_path = self.timeline_dir.join(wal_file_name);
|
||||
let wal_file_path = self.timeline_dir.join(&wal_file_name);
|
||||
|
||||
// Try to open local file, if we may have WAL locally
|
||||
if self.pos >= self.local_start_lsn {
|
||||
@@ -712,16 +712,7 @@ impl WalReader {
|
||||
|
||||
// Try to open remote file, if remote reads are enabled
|
||||
if self.enable_remote_read {
|
||||
let remote_wal_file_path = wal_file_path
|
||||
.strip_prefix(&self.workdir)
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {:?} for base {:?}",
|
||||
wal_file_path, self.workdir,
|
||||
)
|
||||
})?;
|
||||
let remote_wal_file_path = self.remote_path.join(&wal_file_name);
|
||||
return read_object(&remote_wal_file_path, xlogoff as u64).await;
|
||||
}
|
||||
|
||||
|
||||
@@ -72,6 +72,18 @@ class Lsn:
|
||||
def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn":
|
||||
return Lsn(self.lsn_int - (self.lsn_int % seg_sz))
|
||||
|
||||
def segno(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> int:
|
||||
return self.lsn_int // seg_sz
|
||||
|
||||
def segment_name(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> str:
|
||||
segno = self.segno(seg_sz)
|
||||
# The filename format is 00000001XXXXXXXX000000YY, where XXXXXXXXYY is segno in hex.
|
||||
# XXXXXXXX is the higher 8 hex digits of segno
|
||||
high_bits = segno >> 8
|
||||
# YY is the lower 2 hex digits of segno
|
||||
low_bits = segno & 0xFF
|
||||
return f"00000001{high_bits:08X}000000{low_bits:02X}"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Key:
|
||||
|
||||
@@ -973,6 +973,9 @@ class NeonEnvBuilder:
|
||||
for pageserver in self.env.pageservers:
|
||||
pageserver.assert_no_errors()
|
||||
|
||||
for safekeeper in self.env.safekeepers:
|
||||
safekeeper.assert_no_errors()
|
||||
|
||||
self.env.storage_controller.assert_no_errors()
|
||||
|
||||
try:
|
||||
@@ -3813,6 +3816,9 @@ class Safekeeper(LogUtils):
|
||||
self.running = False
|
||||
return self
|
||||
|
||||
def assert_no_errors(self):
|
||||
assert not self.log_contains("manager task finished prematurely")
|
||||
|
||||
def append_logical_message(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
@@ -3898,6 +3904,15 @@ class Safekeeper(LogUtils):
|
||||
"""
|
||||
cli = self.http_client()
|
||||
|
||||
target_segment_file = lsn.segment_name()
|
||||
|
||||
def are_segments_removed():
|
||||
segments = self.list_segments(tenant_id, timeline_id)
|
||||
log.info(
|
||||
f"waiting for all segments before {target_segment_file} to be removed from sk {self.id}, current segments: {segments}"
|
||||
)
|
||||
assert all(target_segment_file <= s for s in segments)
|
||||
|
||||
def are_lsns_advanced():
|
||||
stat = cli.timeline_status(tenant_id, timeline_id)
|
||||
log.info(
|
||||
@@ -3909,6 +3924,7 @@ class Safekeeper(LogUtils):
|
||||
# pageserver to this safekeeper
|
||||
wait_until(30, 1, are_lsns_advanced)
|
||||
cli.checkpoint(tenant_id, timeline_id)
|
||||
wait_until(30, 1, are_segments_removed)
|
||||
|
||||
def wait_until_paused(self, failpoint: str):
|
||||
msg = f"at failpoint {failpoint}"
|
||||
@@ -4323,7 +4339,7 @@ def check_restored_datadir_content(
|
||||
cmd = f"diff {f1}.hex {f2}.hex"
|
||||
subprocess.run([cmd], stdout=stdout_f, shell=True)
|
||||
|
||||
assert (mismatch, error) == ([], [])
|
||||
# assert (mismatch, error) == ([], [])
|
||||
|
||||
|
||||
def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -> Lsn:
|
||||
|
||||
@@ -66,7 +66,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
|
||||
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
|
||||
".*task iteration took longer than the configured period.*",
|
||||
# these can happen anytime we do compactions from background task and shutdown pageserver
|
||||
r".*ERROR.*ancestor timeline \S+ is being stopped",
|
||||
".*could not compact.*cancelled.*",
|
||||
# this is expected given our collaborative shutdown approach for the UploadQueue
|
||||
".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
|
||||
".*Compaction failed.*, retrying in .*: ShuttingDown",
|
||||
|
||||
@@ -19,7 +19,8 @@ class Walreceiver:
|
||||
|
||||
@dataclass
|
||||
class SafekeeperTimelineStatus:
|
||||
acceptor_epoch: int
|
||||
term: int
|
||||
last_log_term: int
|
||||
pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
|
||||
flush_lsn: Lsn
|
||||
commit_lsn: Lsn
|
||||
@@ -156,7 +157,8 @@ class SafekeeperHttpClient(requests.Session):
|
||||
resj = res.json()
|
||||
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
|
||||
return SafekeeperTimelineStatus(
|
||||
acceptor_epoch=resj["acceptor_state"]["epoch"],
|
||||
term=resj["acceptor_state"]["term"],
|
||||
last_log_term=resj["acceptor_state"]["epoch"],
|
||||
pg_version=resj["pg_info"]["pg_version"],
|
||||
flush_lsn=Lsn(resj["flush_lsn"]),
|
||||
commit_lsn=Lsn(resj["commit_lsn"]),
|
||||
|
||||
@@ -85,7 +85,13 @@ page_cache_size=10
|
||||
|
||||
vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum")
|
||||
vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count")
|
||||
vectored_average = vectored_sum.value / vectored_count.value
|
||||
if vectored_count.value > 0:
|
||||
assert vectored_sum.value > 0
|
||||
vectored_average = vectored_sum.value / vectored_count.value
|
||||
else:
|
||||
# special case: running local tests with default legacy configuration
|
||||
assert vectored_sum.value == 0
|
||||
vectored_average = 0
|
||||
|
||||
log.info(f"{non_vectored_average=} {vectored_average=}")
|
||||
|
||||
|
||||
@@ -402,7 +402,7 @@ def test_download_remote_layers_api(
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*download failed: downloading evicted layer file failed.*",
|
||||
f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed",
|
||||
f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed.*downloading evicted layer file failed",
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@@ -302,7 +302,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# gc should not try to even start on a timeline that doesn't exist
|
||||
with pytest.raises(
|
||||
expected_exception=PageserverApiException, match="gc target timeline does not exist"
|
||||
expected_exception=PageserverApiException, match="NotFound: Timeline not found"
|
||||
):
|
||||
bogus_timeline_id = TimelineId.generate()
|
||||
pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0)
|
||||
@@ -310,7 +310,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# the error will be printed to the log too
|
||||
".*gc target timeline does not exist.*",
|
||||
".*NotFound: Timeline not found.*",
|
||||
# Timelines get stopped during detach, ignore the gc calls that error, witnessing that
|
||||
".*InternalServerError\\(timeline is Stopping.*",
|
||||
]
|
||||
|
||||
@@ -17,6 +17,7 @@ import psycopg2
|
||||
import psycopg2.errors
|
||||
import psycopg2.extras
|
||||
import pytest
|
||||
import requests
|
||||
from fixtures.broker import NeonBroker
|
||||
from fixtures.common_types import Lsn, TenantId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
@@ -841,7 +842,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
|
||||
# fetch something sensible from status
|
||||
tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id)
|
||||
epoch = tli_status.acceptor_epoch
|
||||
term = tli_status.term
|
||||
timeline_start_lsn = tli_status.timeline_start_lsn
|
||||
|
||||
if auth_enabled:
|
||||
@@ -862,8 +863,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
endpoint.safe_psql("insert into t values(10)")
|
||||
|
||||
tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id)
|
||||
epoch_after_reboot = tli_status.acceptor_epoch
|
||||
assert epoch_after_reboot > epoch
|
||||
term_after_reboot = tli_status.term
|
||||
assert term_after_reboot > term
|
||||
|
||||
# and timeline_start_lsn stays the same
|
||||
assert tli_status.timeline_start_lsn == timeline_start_lsn
|
||||
@@ -1104,11 +1105,11 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline
|
||||
# First check that term / flush_lsn are the same: it is easier to
|
||||
# report/understand if WALs are different due to that.
|
||||
statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis]
|
||||
term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses]
|
||||
term_flush_lsns = [(s.last_log_term, s.flush_lsn) for s in statuses]
|
||||
for tfl, sk in zip(term_flush_lsns[1:], sks[1:]):
|
||||
assert (
|
||||
term_flush_lsns[0] == tfl
|
||||
), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}"
|
||||
), f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}"
|
||||
|
||||
# check that WALs are identic.
|
||||
segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks]
|
||||
@@ -1867,6 +1868,65 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder):
|
||||
assert digests[0] == digests[1], f"digest on src is {digests[0]} but on dst is {digests[1]}"
|
||||
|
||||
|
||||
# Test pull_timeline while concurrently changing term on the donor:
|
||||
# 1) Start pull_timeline, listing files to fetch.
|
||||
# 2) Change term on the donor
|
||||
# 3) Finish pull_timeline.
|
||||
#
|
||||
# Currently (until proper membership change procedure), we want to pull_timeline
|
||||
# to fetch the log up to <last_log_term, flush_lsn>. This is unsafe if term
|
||||
# changes during the procedure (unless timeline is locked all the time but we
|
||||
# don't want that): recepient might end up with mix of WAL from different
|
||||
# histories. Thus the schedule above is expected to fail. Later we'd allow
|
||||
# pull_timeline to only initialize timeline to any valid state (up to
|
||||
# commit_lsn), holding switch to fully new configuration until it recovers
|
||||
# enough, so it won't be affected by term change anymore.
|
||||
#
|
||||
# Expected to fail while term check is not implemented.
|
||||
@pytest.mark.xfail
|
||||
def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
|
||||
env = neon_env_builder.init_start()
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
(src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2])
|
||||
|
||||
log.info("use only first 2 safekeepers, 3rd will be seeded")
|
||||
ep = env.endpoints.create("main")
|
||||
ep.active_safekeepers = [1, 2]
|
||||
ep.start()
|
||||
ep.safe_psql("create table t(key int, value text)")
|
||||
ep.safe_psql("insert into t select generate_series(1, 1000), 'pear'")
|
||||
|
||||
dst_http = dst_sk.http_client()
|
||||
# run pull_timeline which will halt before downloading files
|
||||
dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "pause"))
|
||||
pt_handle = PropagatingThread(
|
||||
target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id)
|
||||
)
|
||||
pt_handle.start()
|
||||
dst_sk.wait_until_paused("sk-pull-timeline-after-list-pausable")
|
||||
|
||||
src_http = src_sk.http_client()
|
||||
term_before = src_http.timeline_status(tenant_id, timeline_id).term
|
||||
|
||||
# restart compute to bump term
|
||||
ep.stop()
|
||||
ep = env.endpoints.create("main")
|
||||
ep.active_safekeepers = [1, 2]
|
||||
ep.start()
|
||||
ep.safe_psql("insert into t select generate_series(1, 100), 'pear'")
|
||||
|
||||
term_after = src_http.timeline_status(tenant_id, timeline_id).term
|
||||
assert term_after > term_before, f"term_after={term_after}, term_before={term_before}"
|
||||
|
||||
dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "off"))
|
||||
with pytest.raises(requests.exceptions.HTTPError):
|
||||
pt_handle.join()
|
||||
|
||||
|
||||
# In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
|
||||
# when compute is active, but there are no writes to the timeline. In that case
|
||||
# pageserver should maintain a single connection to safekeeper and don't attempt
|
||||
|
||||
2
vendor/postgres-v14
vendored
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: 17e0f5ff4e...a9bfeec24d
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: c2c3d40534...8cc683b542
2
vendor/postgres-v16
vendored
2
vendor/postgres-v16
vendored
Submodule vendor/postgres-v16 updated: b228f20372...e2cccb954d
6
vendor/revisions.json
vendored
6
vendor/revisions.json
vendored
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"v16": ["16.3", "b228f20372ebcabfd7946647cb7adbd38bacb14a"],
|
||||
"v15": ["15.7", "c2c3d40534db97d83dd7e185d1971e707fa2f445"],
|
||||
"v14": ["14.12", "17e0f5ff4e1905691aa40e1e08f9b79b14c99652"]
|
||||
"v16": ["16.3", "e2cccb954d4aa96713f2ae4a72b2806300f199f7"],
|
||||
"v15": ["15.7", "8cc683b5428b9532f3897f3842fe44af90048617"],
|
||||
"v14": ["14.12", "a9bfeec24d08f36eaffcd3548284e4732ad57a5c"]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user