mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-30 11:30:37 +00:00
Merge remote-tracking branch 'origin/main' into layer_compression-heikki2
This commit is contained in:
@@ -15,3 +15,4 @@ console_mgmt_base_url = http://console-release.local
|
||||
bucket_name = zenith-storage-oregon
|
||||
bucket_region = us-west-2
|
||||
etcd_endpoints = etcd-release.local:2379
|
||||
safekeeper_enable_s3_offload = true
|
||||
|
||||
@@ -16,3 +16,4 @@ console_mgmt_base_url = http://console-staging.local
|
||||
bucket_name = zenith-staging-storage-us-east-1
|
||||
bucket_region = us-east-1
|
||||
etcd_endpoints = etcd-staging.local:2379
|
||||
safekeeper_enable_s3_offload = false
|
||||
|
||||
@@ -6,7 +6,7 @@ After=network.target auditd.service
|
||||
Type=simple
|
||||
User=safekeeper
|
||||
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }}
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }}
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Ordered map datastructure implemented in a Vec.
|
||||
/// Append only - can only add keys that are larger than the
|
||||
/// current max key.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct VecMap<K, V>(Vec<(K, V)>);
|
||||
|
||||
impl<K, V> Default for VecMap<K, V> {
|
||||
|
||||
@@ -23,6 +23,7 @@ distribution depends on the workload: the updates could be totally random, or
|
||||
there could be a long stream of updates to a single relation when data is bulk
|
||||
loaded, for example, or something in between.
|
||||
|
||||
```
|
||||
Cloud Storage Page Server Safekeeper
|
||||
L1 L0 Memory WAL
|
||||
|
||||
@@ -37,6 +38,7 @@ Cloud Storage Page Server Safekeeper
|
||||
+----+----+ +----+----+ | | |
|
||||
|EEEE| |EEEE|EEEE| +---+-----+
|
||||
+----+ +----+----+
|
||||
```
|
||||
|
||||
In this illustration, WAL is received as a stream from the Safekeeper, from the
|
||||
right. It is immediately captured by the page server and stored quickly in
|
||||
@@ -47,7 +49,7 @@ the same page and relation close to each other.
|
||||
From the page server memory, whenever enough WAL has been accumulated, it is flushed
|
||||
to disk into a new L0 layer file, and the memory is released.
|
||||
|
||||
When enough L0 files have been accumulated, they are merged together rand sliced
|
||||
When enough L0 files have been accumulated, they are merged together and sliced
|
||||
per key-space, producing a new set of files where each file contains a more
|
||||
narrow key range, but larger LSN range.
|
||||
|
||||
@@ -121,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and
|
||||
a range of LSNs (or a single LSN, in case of image layers). You can think of it
|
||||
as a rectangle in the two-dimensional key-LSN space. The layer files for each
|
||||
timeline are stored in the timeline's subdirectory under
|
||||
.zenith/tenants/<tenantid>/timelines.
|
||||
`.zenith/tenants/<tenantid>/timelines`.
|
||||
|
||||
There are two kind of layer files: images, and delta layers. An image file
|
||||
contains a snapshot of all keys at a particular LSN, whereas a delta file
|
||||
@@ -130,8 +132,11 @@ range of LSN.
|
||||
|
||||
image file:
|
||||
|
||||
```
|
||||
000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
|
||||
start key end key LSN
|
||||
```
|
||||
|
||||
|
||||
The first parts define the key range that the layer covers. See
|
||||
pgdatadir_mapping.rs for how the key space is used. The last part is the LSN.
|
||||
@@ -140,8 +145,10 @@ delta file:
|
||||
|
||||
Delta files are named similarly, but they cover a range of LSNs:
|
||||
|
||||
```
|
||||
000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
|
||||
start key end key start LSN end LSN
|
||||
```
|
||||
|
||||
A delta file contains all the key-values in the key-range that were updated in
|
||||
the LSN range. If a key has not been modified, there is no trace of it in the
|
||||
@@ -151,7 +158,9 @@ delta layer.
|
||||
A delta layer file can cover a part of the overall key space, as in the previous
|
||||
example, or the whole key range like this:
|
||||
|
||||
```
|
||||
000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000578C6B29-0000000057A50051
|
||||
```
|
||||
|
||||
A file that covers the whole key range is called a L0 file (Level 0), while a
|
||||
file that covers only part of the key range is called a L1 file. The "level" of
|
||||
@@ -168,7 +177,9 @@ version, and how branching and GC works is still valid.
|
||||
|
||||
The full path of a delta file looks like this:
|
||||
|
||||
```
|
||||
.zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
|
||||
```
|
||||
|
||||
For simplicity, the examples below use a simplified notation for the
|
||||
paths. The tenant ID is left out, the timeline ID is replaced with
|
||||
@@ -177,8 +188,10 @@ with a human-readable table name. The LSNs are also shorter. For
|
||||
example, a base image file at LSN 100 and a delta file between 100-200
|
||||
for 'orders' table on 'main' branch is represented like this:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
```
|
||||
|
||||
|
||||
# Creating layer files
|
||||
@@ -188,12 +201,14 @@ branch called 'main' and two tables, 'orders' and 'customers'. The end
|
||||
of WAL is currently at LSN 250. In this starting situation, you would
|
||||
have these files on disk:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
main/customers_100
|
||||
main/customers_100_200
|
||||
main/customers_200
|
||||
```
|
||||
|
||||
In addition to those files, the recent changes between LSN 200 and the
|
||||
end of WAL at 250 are kept in memory. If the page server crashes, the
|
||||
@@ -224,6 +239,7 @@ If the customers table is modified later, a new file is created for it
|
||||
at the next checkpoint. The new file will cover the "gap" from the
|
||||
last layer file, so the LSN ranges are always contiguous:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -236,6 +252,7 @@ last layer file, so the LSN ranges are always contiguous:
|
||||
main/customers_200
|
||||
main/customers_200_500
|
||||
main/customers_500
|
||||
```
|
||||
|
||||
## Reading page versions
|
||||
|
||||
@@ -259,15 +276,18 @@ involves replaying any WAL records applicable to the page between LSNs
|
||||
|
||||
Imagine that a child branch is created at LSN 250:
|
||||
|
||||
```
|
||||
@250
|
||||
----main--+-------------------------->
|
||||
\
|
||||
+---child-------------->
|
||||
```
|
||||
|
||||
|
||||
Then, the 'orders' table is updated differently on the 'main' and
|
||||
'child' branches. You now have this situation on disk:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -282,6 +302,7 @@ Then, the 'orders' table is updated differently on the 'main' and
|
||||
child/orders_300
|
||||
child/orders_300_400
|
||||
child/orders_400
|
||||
```
|
||||
|
||||
Because the 'customers' table hasn't been modified on the child
|
||||
branch, there is no file for it there. If you request a page for it on
|
||||
@@ -294,6 +315,7 @@ is linear, and the request's LSN identifies unambiguously which file
|
||||
you need to look at. For example, the history for the 'orders' table
|
||||
on the 'main' branch consists of these files:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -301,10 +323,12 @@ on the 'main' branch consists of these files:
|
||||
main/orders_300
|
||||
main/orders_300_400
|
||||
main/orders_400
|
||||
```
|
||||
|
||||
And from the 'child' branch's point of view, it consists of these
|
||||
files:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -313,6 +337,7 @@ files:
|
||||
child/orders_300
|
||||
child/orders_300_400
|
||||
child/orders_400
|
||||
```
|
||||
|
||||
The branch metadata includes the point where the child branch was
|
||||
created, LSN 250. If a page request comes with LSN 275, we read the
|
||||
@@ -345,6 +370,7 @@ Let's look at the single branch scenario again. Imagine that the end
|
||||
of the branch is LSN 525, so that the GC horizon is currently at
|
||||
525-150 = 375
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -357,11 +383,13 @@ of the branch is LSN 525, so that the GC horizon is currently at
|
||||
main/customers_100
|
||||
main/customers_100_200
|
||||
main/customers_200
|
||||
```
|
||||
|
||||
We can remove the following files because the end LSNs of those files are
|
||||
older than GC horizon 375, and there are more recent layer files for the
|
||||
table:
|
||||
|
||||
```
|
||||
main/orders_100 DELETE
|
||||
main/orders_100_200 DELETE
|
||||
main/orders_200 DELETE
|
||||
@@ -374,8 +402,9 @@ table:
|
||||
main/customers_100 DELETE
|
||||
main/customers_100_200 DELETE
|
||||
main/customers_200 KEEP, NO NEWER VERSION
|
||||
```
|
||||
|
||||
'main/customers_100_200' is old enough, but it cannot be
|
||||
'main/customers_200' is old enough, but it cannot be
|
||||
removed because there is no newer layer file for the table.
|
||||
|
||||
Things get slightly more complicated with multiple branches. All of
|
||||
@@ -384,6 +413,7 @@ retain older shapshot files that are still needed by child branches.
|
||||
For example, if child branch is created at LSN 150, and the 'customers'
|
||||
table is updated on the branch, you would have these files:
|
||||
|
||||
```
|
||||
main/orders_100 KEEP, NEEDED BY child BRANCH
|
||||
main/orders_100_200 KEEP, NEEDED BY child BRANCH
|
||||
main/orders_200 DELETE
|
||||
@@ -398,6 +428,7 @@ table is updated on the branch, you would have these files:
|
||||
main/customers_200 KEEP, NO NEWER VERSION
|
||||
child/customers_150_300 DELETE
|
||||
child/customers_300 KEEP, NO NEWER VERSION
|
||||
```
|
||||
|
||||
In this situation, 'main/orders_100' and 'main/orders_100_200' cannot
|
||||
be removed, even though they are older than the GC horizon, because
|
||||
@@ -407,6 +438,7 @@ and 'main/orders_200_300' can still be removed.
|
||||
If 'orders' is modified later on the 'child' branch, we will create a
|
||||
new base image and delta file for it on the child:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
|
||||
@@ -419,6 +451,7 @@ new base image and delta file for it on the child:
|
||||
child/customers_300
|
||||
child/orders_150_400
|
||||
child/orders_400
|
||||
```
|
||||
|
||||
After this, the 'main/orders_100' and 'main/orders_100_200' file could
|
||||
be removed. It is no longer needed by the child branch, because there
|
||||
@@ -434,6 +467,7 @@ Describe GC and checkpoint interval settings.
|
||||
In principle, each relation can be checkpointed separately, i.e. the
|
||||
LSN ranges of the files don't need to line up. So this would be legal:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -446,6 +480,7 @@ LSN ranges of the files don't need to line up. So this would be legal:
|
||||
main/customers_250
|
||||
main/customers_250_500
|
||||
main/customers_500
|
||||
```
|
||||
|
||||
However, the code currently always checkpoints all relations together.
|
||||
So that situation doesn't arise in practice.
|
||||
@@ -468,11 +503,13 @@ does that. It could be useful, however, as a transient state when
|
||||
garbage collecting around branch points, or explicit recovery
|
||||
points. For example, if we start with this:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
main/orders_200_300
|
||||
main/orders_300
|
||||
```
|
||||
|
||||
And there is a branch or explicit recovery point at LSN 150, we could
|
||||
replace 'main/orders_100_200' with 'main/orders_150' to keep a
|
||||
|
||||
@@ -115,6 +115,14 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("a prefix to always use when polling/pusing data in etcd from this safekeeper"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("enable-s3-offload")
|
||||
.long("enable-s3-offload")
|
||||
.takes_value(true)
|
||||
.default_value("true")
|
||||
.default_missing_value("true")
|
||||
.help("Enable/disable s3 offloading. When disabled, safekeeper removes WAL ignoring s3 WAL horizon."),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("dump-control-file") {
|
||||
@@ -172,6 +180,13 @@ fn main() -> Result<()> {
|
||||
conf.broker_etcd_prefix = prefix.to_string();
|
||||
}
|
||||
|
||||
// Seems like there is no better way to accept bool values explicitly in clap.
|
||||
conf.s3_offload_enabled = arg_matches
|
||||
.value_of("enable-s3-offload")
|
||||
.unwrap()
|
||||
.parse()
|
||||
.context("failed to parse bool enable-s3-offload bool")?;
|
||||
|
||||
start_safekeeper(conf, given_id, arg_matches.is_present("init"))
|
||||
}
|
||||
|
||||
|
||||
@@ -53,6 +53,7 @@ pub struct SafeKeeperConf {
|
||||
pub my_id: ZNodeId,
|
||||
pub broker_endpoints: Option<Vec<Url>>,
|
||||
pub broker_etcd_prefix: String,
|
||||
pub s3_offload_enabled: bool,
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
@@ -79,6 +80,7 @@ impl Default for SafeKeeperConf {
|
||||
my_id: ZNodeId(0),
|
||||
broker_endpoints: None,
|
||||
broker_etcd_prefix: defaults::DEFAULT_NEON_BROKER_PREFIX.to_string(),
|
||||
s3_offload_enabled: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ pub fn thread_main(conf: SafeKeeperConf) {
|
||||
let active_tlis = GlobalTimelines::get_active_timelines();
|
||||
for zttid in &active_tlis {
|
||||
if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) {
|
||||
if let Err(e) = tli.remove_old_wal() {
|
||||
if let Err(e) = tli.remove_old_wal(conf.s3_offload_enabled) {
|
||||
warn!(
|
||||
"failed to remove WAL for tenant {} timeline {}: {}",
|
||||
tli.zttid.tenant_id, tli.zttid.timeline_id, e
|
||||
|
||||
@@ -930,13 +930,18 @@ where
|
||||
/// offloading.
|
||||
/// While it is safe to use inmem values for determining horizon,
|
||||
/// we use persistent to make possible normal states less surprising.
|
||||
pub fn get_horizon_segno(&self) -> XLogSegNo {
|
||||
pub fn get_horizon_segno(&self, s3_offload_enabled: bool) -> XLogSegNo {
|
||||
let s3_offload_horizon = if s3_offload_enabled {
|
||||
self.state.s3_wal_lsn
|
||||
} else {
|
||||
Lsn(u64::MAX)
|
||||
};
|
||||
let horizon_lsn = min(
|
||||
min(
|
||||
self.state.remote_consistent_lsn,
|
||||
self.state.peer_horizon_lsn,
|
||||
),
|
||||
self.state.s3_wal_lsn,
|
||||
s3_offload_horizon,
|
||||
);
|
||||
horizon_lsn.segment_number(self.state.server.wal_seg_size as usize)
|
||||
}
|
||||
|
||||
@@ -479,7 +479,7 @@ impl Timeline {
|
||||
shared_state.sk.wal_store.flush_lsn()
|
||||
}
|
||||
|
||||
pub fn remove_old_wal(&self) -> Result<()> {
|
||||
pub fn remove_old_wal(&self, s3_offload_enabled: bool) -> Result<()> {
|
||||
let horizon_segno: XLogSegNo;
|
||||
let remover: Box<dyn Fn(u64) -> Result<(), anyhow::Error>>;
|
||||
{
|
||||
@@ -488,7 +488,7 @@ impl Timeline {
|
||||
if shared_state.sk.state.server.wal_seg_size == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
horizon_segno = shared_state.sk.get_horizon_segno();
|
||||
horizon_segno = shared_state.sk.get_horizon_segno(s3_offload_enabled);
|
||||
remover = shared_state.sk.wal_store.remove_up_to();
|
||||
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
|
||||
return Ok(());
|
||||
|
||||
Reference in New Issue
Block a user