Compare commits

...

5 Commits

Author SHA1 Message Date
anastasia
5e91d15f06 REVERT ME. Add unlink test and a bunch of debug printlns 2021-09-02 17:29:29 +03:00
anastasia
8c46178256 Attempt to fix list_rels & list_nonrels for dropped segments 2021-09-02 16:46:11 +03:00
anastasia
2a4d405f96 Rename put_unlink() to drop_relish() in Timeline trait.
Rename put_unlink() to drop_segment() in Layer trait.
2021-09-02 16:37:05 +03:00
anastasia
66174c0342 Improve comments for Layer trait. 2021-09-02 16:33:47 +03:00
anastasia
cee366ac17 Don't use term 'snapshot' to describe layers 2021-09-02 16:33:30 +03:00
13 changed files with 249 additions and 112 deletions

View File

@@ -26,7 +26,7 @@ A checkpoint record in the WAL marks a point in the WAL sequence at which it is
NOTE: This is an overloaded term.
Whenever enough WAL has been accumulated in memory, the page server []
writes out the changes in memory into new layer files[]. This process
writes out the changes from in-memory layers into new layer files[]. This process
is called "checkpointing". The page server only creates layer files for
relations that have been modified since the last checkpoint.
@@ -41,17 +41,28 @@ Stateless Postgres node that stores data in pageserver.
Each of the separate segmented file sets in which a relation is stored. The main fork is where the actual data resides. There also exist two secondary forks for metadata: the free space map and the visibility map.
Each PostgreSQL fork is considered a separate relish.
### Layer file
### Layer
Each layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
There are two kinds of layers, in-memory and on-disk layers. In-memory
layers are used to ingest incoming WAL, and provide fast access
to the recent page versions. On-disk layers are stored as files on disk, and
are immutable.
### Layer file (on-disk layer)
Layered repository on-disk format is based on immutable files. The
files are called "layer files". Each file corresponds to one 10 MB
files are called "layer files". Each file corresponds to one RELISH_SEG_SIZE
segment of a PostgreSQL relation fork. There are two kinds of layer
files: image files and delta files. An image file contains a
"snapshot" of the segment at a particular LSN, and a delta file
contains WAL records applicable to the segment, in a range of LSNs.
### Layer map
The layer map tracks what layers exist for all the relishes in a timeline.
### Layered repository
Zenith repository implementation that keeps data in layers.
### LSN
@@ -121,7 +132,7 @@ Each SLRU segment is considered a separate relish[].
### Tenant (Multitenancy)
Tenant represents a single customer, interacting with Zenith.
Wal redo[] activity, timelines[], snapshots[] are managed for each tenant independently.
Wal redo[] activity, timelines[], layers[] are managed for each tenant independently.
One pageserver[] can serve multiple tenants at once.
One safekeeper

View File

@@ -37,7 +37,7 @@ On the page server tenants introduce one level of indirection, so data directory
├── de182bc61fb11a5a6b390a8aed3a804a
└── ee6016ec31116c1b7c33dfdfca38891f
```
Wal redo activity, timelines, snapshots are managed for each tenant independently.
Wal redo activity and timelines are managed for each tenant independently.
For local environment used for example in tests there also new level of indirection for tenants. It touches `pgdatadirs` directory. Now it contains `tenants` subdirectory so the structure looks the following way:

View File

@@ -583,7 +583,7 @@ impl Timeline for LayeredTimeline {
}
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>> {
trace!("list_rels called at {}", lsn);
println!("list_rels called at {}", lsn);
// List all rels in this timeline, and all its ancestors.
let mut all_rels = HashSet::new();
@@ -675,14 +675,14 @@ impl Timeline for LayeredTimeline {
(relsize - 1) / RELISH_SEG_SIZE
};
// Unlink segments beyond the last remaining segment.
// Drop segments beyond the last remaining segment.
for remove_segno in (last_remain_seg + 1)..=old_last_seg {
let seg = SegmentTag {
rel,
segno: remove_segno,
};
let layer = self.get_layer_for_write(seg, lsn)?;
layer.put_unlink(lsn)?;
layer.drop_segment(lsn)?;
}
// Truncate the last remaining segment to the specified size
@@ -698,8 +698,8 @@ impl Timeline for LayeredTimeline {
Ok(())
}
fn put_unlink(&self, rel: RelishTag, lsn: Lsn) -> Result<()> {
trace!("put_unlink: {} at {}", rel, lsn);
fn drop_relish(&self, rel: RelishTag, lsn: Lsn) -> Result<()> {
trace!("drop_segment: {} at {}", rel, lsn);
if rel.is_blocky() {
let oldsize_opt = self.get_relish_size(rel, self.get_last_record_lsn())?;
@@ -710,25 +710,25 @@ impl Timeline for LayeredTimeline {
(oldsize - 1) / RELISH_SEG_SIZE
};
// Unlink all segments
// Drop all segments of the relish
for remove_segno in 0..=old_last_seg {
let seg = SegmentTag {
rel,
segno: remove_segno,
};
let layer = self.get_layer_for_write(seg, lsn)?;
layer.put_unlink(lsn)?;
layer.drop_segment(lsn)?;
}
} else {
warn!(
"put_unlink called on non-existent relish {} at {}",
"drop_segment called on non-existent relish {} at {}",
rel, lsn
);
}
} else {
let seg = SegmentTag::from_blknum(rel, 0);
let layer = self.get_layer_for_write(seg, lsn)?;
layer.put_unlink(lsn)?;
layer.drop_segment(lsn)?;
}
Ok(())
@@ -927,7 +927,6 @@ impl LayeredTimeline {
assert!(layer.get_start_lsn() <= lsn);
if layer.is_dropped() && layer.get_end_lsn() <= lsn {
// The segment was unlinked
return Ok(None);
}
@@ -988,7 +987,7 @@ impl LayeredTimeline {
if prev_layer.get_timeline_id() != self.timelineid {
// First modification on this timeline
start_lsn = self.ancestor_lsn;
trace!(
println!(
"creating file for write for {} at branch point {}/{}",
seg,
self.timelineid,
@@ -996,14 +995,14 @@ impl LayeredTimeline {
);
} else {
start_lsn = prev_layer.get_end_lsn();
trace!(
println!(
"creating file for write for {} after previous layer {}/{}",
seg,
self.timelineid,
start_lsn
);
}
trace!(
println!(
"prev layer is at {}/{} - {}",
prev_layer.get_timeline_id(),
prev_layer.get_start_lsn(),
@@ -1019,7 +1018,7 @@ impl LayeredTimeline {
)?;
} else {
// New relation.
trace!(
println!(
"creating layer for write for new rel {} at {}/{}",
seg,
self.timelineid,
@@ -1031,6 +1030,7 @@ impl LayeredTimeline {
}
let mut layers = self.layers.lock().unwrap();
println!("before layers.insert_open {:?}", layer.filename());
let layer_rc: Arc<InMemoryLayer> = Arc::new(layer);
layers.insert_open(Arc::clone(&layer_rc));
@@ -1091,7 +1091,7 @@ impl LayeredTimeline {
prev: prev_record_lsn,
} = self.last_record_lsn.load();
trace!(
println!(
"checkpointing timeline {} at {}",
self.timelineid,
last_record_lsn
@@ -1227,7 +1227,7 @@ impl LayeredTimeline {
//
// Determine for each file if it needs to be retained
// FIXME: also scan open in-memory layers. Normally we cannot remove the
// latest layer of any seg, but if it was unlinked it's possible
// latest layer of any seg, but if it was dropped it's possible
let mut layers = self.layers.lock().unwrap();
'outer: for l in layers.iter_historic_layers() {
let seg = l.get_seg_tag();

View File

@@ -6,13 +6,14 @@ which pages they apply to, and accumulates the incoming changes in
memory. Every now and then, the accumulated changes are written out to
new files.
The files are called "snapshot files". Each snapshot file corresponds
to one 10 MB slice of a PostgreSQL relation fork. The snapshot files
The files are called "layer files". Each layer file corresponds
to one RELISH_SEG_SIZE slice of a PostgreSQL relation fork or
non-rel file in a range of LSNs. The layer files
for each timeline are stored in the timeline's subdirectory under
.zenith/tenants/<tenantid>/timelines.
There are two kind of snapshot file: base images, and deltas. A base
image file contains a snapshot of a segment as it was at one LSN,
There are two kinds of layer files: base images, and deltas. A base
image file contains an image of a segment as it was at one LSN,
whereas a delta file contains modifications to a segment - mostly in
the form of WAL records - in a range of LSN
@@ -44,7 +45,7 @@ managed, except that the first part of file names is different.
Internally, the relations and non-relation files that are managed in
the versioned store are together called "relishes".
If a file has been dropped, the last snapshot file for it is created
If a file has been dropped, the last layer file for it is created
with the _DROPPED suffix, e.g.
rel_1663_13990_2609_0_10_000000000169C348_0000000001702000_DROPPED
@@ -67,7 +68,7 @@ for 'orders' table on 'main' branch is represented like this:
main/orders_100_200
# Creating snapshot files
# Creating layer files
Let's start with a simple example with a system that contains one
branch called 'main' and two tables, 'orders' and 'customers'. The end
@@ -86,10 +87,10 @@ end of WAL at 250 are kept in memory. If the page server crashes, the
latest records between 200-250 need to be re-read from the WAL.
Whenever enough WAL has been accumulated in memory, the page server
writes out the changes in memory into new snapshot files. This process
writes out the changes in memory into new layer files. This process
is called "checkpointing" (not to be confused with the PostgreSQL
checkpoints, that's a different thing). The page server only creates
snapshot files for relations that have been modified since the last
layer files for relations that have been modified since the last
checkpoint. For example, if the current end of WAL is at LSN 450, and
the last checkpoint happened at LSN 400 but there hasn't been any
recent changes to 'customers' table, you would have these files on
@@ -108,7 +109,7 @@ disk:
If the customers table is modified later, a new file is created for it
at the next checkpoint. The new file will cover the "gap" from the
last snapshot file, so the LSN ranges are always contiguous:
last layer file, so the LSN ranges are always contiguous:
main/orders_100
main/orders_100_200
@@ -130,13 +131,13 @@ page server needs to reconstruct the requested page, as it was at the
requested LSN. To do that, the page server first checks the recent
in-memory layer; if the requested page version is found there, it can
be returned immediately without looking at the files on
disk. Otherwise the page server needs to locate the snapshot file that
disk. Otherwise the page server needs to locate the layer file that
contains the requested page version.
For example, if a request comes in for table 'orders' at LSN 250, the
page server would load the 'main/orders_200_300' file into memory, and
reconstruct and return the requested page from it, as it was at
LSN 250. Because the snapshot file consists of a full image of the
LSN 250. Because the layer file consists of a full image of the
relation at the start LSN and the WAL, reconstructing the page
involves replaying any WAL records applicable to the page between LSNs
200-250, starting from the base image at LSN 200.
@@ -171,7 +172,7 @@ Then, the 'orders' table is updated differently on the 'main' and
Because the 'customers' table hasn't been modified on the child
branch, there is no file for it there. If you request a page for it on
the 'child' branch, the page server will not find any snapshot file
the 'child' branch, the page server will not find any layer file
for it in the 'child' directory, so it will recurse to look into the
parent 'main' branch instead.
@@ -217,7 +218,7 @@ branch at a historic LSN, is how we support PITR in Zenith.
# Garbage collection
In this scheme, we keep creating new snapshot files over time. We also
In this scheme, we keep creating new layer files over time. We also
need a mechanism to remove old files that are no longer needed,
because disk space isn't infinite.
@@ -245,7 +246,7 @@ of the branch is LSN 525, so that the GC horizon is currently at
main/customers_200
We can remove the following files because the end LSNs of those files are
older than GC horizon 375, and there are more recent snapshot files for the
older than GC horizon 375, and there are more recent layer files for the
table:
main/orders_100 DELETE
@@ -262,7 +263,7 @@ table:
main/customers_200 KEEP, NO NEWER VERSION
'main/customers_100_200' is old enough, but it cannot be
removed because there is no newer snapshot file for the table.
removed because there is no newer layer file for the table.
Things get slightly more complicated with multiple branches. All of
the above still holds, but in addition to recent files we must also
@@ -308,7 +309,7 @@ new base image and delta file for it on the child:
After this, the 'main/orders_100' and 'main/orders_100_200' file could
be removed. It is no longer needed by the child branch, because there
is a newer snapshot file there. TODO: This optimization hasn't been
is a newer layer file there. TODO: This optimization hasn't been
implemented! The GC algorithm will currently keep the file on the
'main' branch anyway, for as long as the child branch exists.
@@ -346,7 +347,7 @@ It would also be OK to have overlapping LSN ranges for the same relation:
main/orders_300_400
main/orders_400
The code that reads the snapshot files should cope with this, but this
The code that reads the layer files should cope with this, but this
situation doesn't arise either, because the checkpointing code never
does that. It could be useful, however, as a transient state when
garbage collecting around branch points, or explicit recovery
@@ -360,6 +361,6 @@ points. For example, if we start with this:
And there is a branch or explicit recovery point at LSN 150, we could
replace 'main/orders_100_200' with 'main/orders_150' to keep a
snapshot only at that exact point that's still needed, removing the
layer only at that exact point that's still needed, removing the
other page versions around it. But such compaction has not been
implemented yet.

View File

@@ -174,7 +174,7 @@ impl Layer for DeltaLayer {
{
// Open the file and lock the metadata in memory
// TODO: avoid opening the snapshot file for each read
// TODO: avoid opening the file for each read
let (_path, book) = self.open_book()?;
let page_version_reader = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
let inner = self.load()?;
@@ -251,6 +251,8 @@ impl Layer for DeltaLayer {
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
assert!(lsn >= self.start_lsn);
// Is the requested LSN after the rel was dropped?
if self.dropped && lsn >= self.end_lsn {
return Ok(false);

View File

@@ -2,8 +2,9 @@
//! It is stored in a file on disk.
//!
//! On disk, the image files are stored in timelines/<timelineid> directory.
//! Currently, there are no subdirectories, and each snapshot file is named like this:
//! Currently, there are no subdirectories, and each image layer file is named like this:
//!
//! Note that segno is
//! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<LSN>
//!
//! For example:
@@ -15,10 +16,10 @@
//! Only metadata is loaded into memory by the load function.
//! When images are needed, they are read directly from disk.
//!
//! For blocky segments, the images are stored in BLOCKY_IMAGES_CHAPTER.
//! For blocky relishes, the images are stored in BLOCKY_IMAGES_CHAPTER.
//! All the images are required to be BLOCK_SIZE, which allows for random access.
//!
//! For non-blocky segments, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
//!
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
@@ -156,7 +157,9 @@ impl Layer for ImageLayer {
}
/// Get size of the segment
fn get_seg_size(&self, _lsn: Lsn) -> Result<u32> {
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
assert!(lsn >= self.lsn);
let inner = self.load()?;
match inner.image_type {
ImageType::Blocky { num_blocks } => Ok(num_blocks),
@@ -165,7 +168,8 @@ impl Layer for ImageLayer {
}
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, _lsn: Lsn) -> Result<bool> {
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
assert!(lsn >= self.lsn);
Ok(true)
}

View File

@@ -198,6 +198,8 @@ impl Layer for InMemoryLayer {
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
assert!(lsn >= self.start_lsn);
let inner = self.inner.lock().unwrap();
// Is the requested LSN after the segment was dropped?
@@ -239,8 +241,8 @@ impl Layer for InMemoryLayer {
.unwrap_or_default();
println!(
"----- in-memory layer for {} {}-{} ----",
self.seg, self.start_lsn, end_str
"----- in-memory layer for tli {} seg {} {}-{} ----",
self.timelineid, self.seg, self.start_lsn, end_str
);
for (k, v) in inner.segsizes.iter() {
@@ -271,7 +273,7 @@ impl InMemoryLayer {
start_lsn: Lsn,
oldest_pending_lsn: Lsn,
) -> Result<InMemoryLayer> {
trace!(
println!(
"initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
seg,
timelineid,
@@ -325,7 +327,7 @@ impl InMemoryLayer {
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<()> {
assert!(self.seg.blknum_in_seg(blknum));
trace!(
println!(
"put_page_version blk {} of {} at {}/{}",
blknum,
self.seg.rel,
@@ -352,7 +354,7 @@ impl InMemoryLayer {
// which we've just acquired above
let oldsize = inner.get_seg_size(lsn);
if newsize > oldsize {
trace!(
println!(
"enlarging segment {} from {} to {} blocks at {}",
self.seg,
oldsize,
@@ -380,13 +382,41 @@ impl InMemoryLayer {
}
/// Remember that the segment was dropped at given LSN
pub fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()> {
pub fn drop_segment(&self, lsn: Lsn) -> anyhow::Result<()> {
let mut inner = self.inner.lock().unwrap();
assert!(inner.drop_lsn.is_none());
inner.drop_lsn = Some(lsn);
info!("dropped segment {} at {}", self.seg, lsn);
println!("dropped segment {} at {}", self.seg, lsn);
let end_str = inner
.drop_lsn
.as_ref()
.map(|drop_lsn| drop_lsn.to_string())
.unwrap_or_default();
{
let mut result = format!(
"----- inmemory layer tli {} for {} {}-{} ----\n",
self.timelineid, self.seg, self.start_lsn, end_str
);
for (k, v) in inner.segsizes.iter() {
result += &format!("{}: {}\n", k, v);
}
for (k, v) in inner.page_versions.iter() {
result += &format!(
"blk {} at {}: {}/{}\n",
k.0,
k.1,
v.page_image.is_some(),
v.record.is_some()
);
}
println!("{}", result);
}
Ok(())
}
@@ -405,7 +435,7 @@ impl InMemoryLayer {
) -> Result<InMemoryLayer> {
let seg = src.get_seg_tag();
trace!(
println!(
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
seg,
timelineid,
@@ -436,7 +466,7 @@ impl InMemoryLayer {
}
///
/// Write the this in-memory layer to disk, as a snapshot layer.
/// Write this in-memory layer to disk.
///
/// The cutoff point for the layer that's written to disk is 'end_lsn'.
///
@@ -455,7 +485,7 @@ impl InMemoryLayer {
// This is needed just to call materialize_page()
timeline: &LayeredTimeline,
) -> Result<(Vec<Arc<dyn Layer>>, Option<Arc<InMemoryLayer>>)> {
info!(
println!(
"freezing in memory layer for {} on timeline {} at {}",
self.seg, self.timelineid, cutoff_lsn
);
@@ -531,7 +561,7 @@ impl InMemoryLayer {
)?;
let delta_layer_rc: Arc<dyn Layer> = Arc::new(delta_layer);
frozen_layers.push(delta_layer_rc);
trace!(
println!(
"freeze: created delta layer {} {}-{}",
self.seg,
self.start_lsn,
@@ -547,7 +577,7 @@ impl InMemoryLayer {
let imgfile = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?;
let imgfile_rc: Arc<dyn Layer> = Arc::new(imgfile);
frozen_layers.push(Arc::clone(&imgfile_rc));
trace!("freeze: created image layer {} at {}", self.seg, end_lsn);
println!("freeze: created image layer {} at {}", self.seg, end_lsn);
// If there were any page versions newer than the cutoff, initialize a new in-memory
// layer to hold them
@@ -564,7 +594,7 @@ impl InMemoryLayer {
new_inner.page_versions.append(&mut after_page_versions);
new_inner.segsizes.append(&mut after_segsizes);
drop(new_inner);
trace!("freeze: created new in-mem layer {} {}-", self.seg, end_lsn);
println!("freeze: created new in-mem layer {} {}-", self.seg, end_lsn);
new_open_rc = Some(Arc::new(new_open))
}
@@ -576,13 +606,22 @@ impl InMemoryLayer {
/// debugging function to print out the contents of the layer
#[allow(unused)]
pub fn dump(&self) -> String {
let mut result = format!(
"----- inmemory layer for {} {}-> ----\n",
self.seg, self.start_lsn
);
let inner = self.inner.lock().unwrap();
let end_str = inner
.drop_lsn
.as_ref()
.map(|drop_lsn| drop_lsn.to_string())
.unwrap_or_default();
let mut result = format!(
"----- dump inmemory layer for tli {} seg {} {}-{} ----",
self.timelineid, self.seg, self.start_lsn, end_str
);
for (k, v) in inner.segsizes.iter() {
result += &format!("{}: {}\n", k, v);
}
@@ -596,6 +635,8 @@ impl InMemoryLayer {
);
}
println!("inner println {}", result);
result
}
}

View File

@@ -1,5 +1,5 @@
//!
//! The layer map tracks what layers exist for all the relations in a timeline.
//! The layer map tracks what layers exist for all the relishes in a timeline.
//!
//! When the timeline is first accessed, the server lists of all layer files
//! in the timelines/<timelineid> directory, and populates this map with
@@ -149,6 +149,7 @@ impl LayerMap {
if let Some(segentry) = self.segs.get_mut(&tag) {
if let Some(_old) = &segentry.open {
// FIXME: shouldn't exist, but check
println!("insert_open() // FIXME: shouldn't exist, but check");
}
segentry.open = Some(Arc::clone(&layer));
} else {
@@ -220,6 +221,8 @@ impl LayerMap {
pub fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>> {
let mut rels: HashSet<RelTag> = HashSet::new();
println!("layer list_rels()");
for (seg, segentry) in self.segs.iter() {
if let RelishTag::Relation(reltag) = seg.rel {
if (spcnode == 0 || reltag.spcnode == spcnode)
@@ -227,15 +230,33 @@ impl LayerMap {
{
// Add only if it exists at the requested LSN.
if let Some(open) = &segentry.open {
if open.get_end_lsn() > lsn {
println!("explore segentry.open {:?}", open.filename());
if open.get_end_lsn() > lsn && open.get_start_lsn() <= lsn {
rels.insert(reltag);
println!("found rel {} at lsn {} in open layer start_lsn {} end lsn {}",
reltag, lsn, open.get_start_lsn(), open.get_end_lsn());
println!("{}", open.dump());
}
} else if let Some((_k, _v)) = segentry
else
{
println!("found DROPPED rel {} at lsn {} in open layer start_lsn {} end lsn {}",
reltag, lsn, open.get_start_lsn(), open.get_end_lsn());
println!("{}", open.dump());
}
} else if let Some((l_start_lsn, layer)) = segentry
.historic
.range((Included(Lsn(0)), Included(lsn)))
.next_back()
{
rels.insert(reltag);
println!("explore historic segment {:?}", layer.filename());
if !layer.is_dropped() && l_start_lsn <= &lsn {
rels.insert(reltag);
println!("found rel {} in historic layer", reltag);
layer.dump()?;
}
}
}
}
@@ -253,15 +274,17 @@ impl LayerMap {
} else {
// Add only if it exists at the requested LSN.
if let Some(open) = &segentry.open {
if open.get_end_lsn() > lsn {
if open.get_end_lsn() > lsn && open.get_start_lsn() <= lsn {
rels.insert(seg.rel);
}
} else if let Some((_k, _v)) = segentry
} else if let Some((l_start_lsn, layer)) = segentry
.historic
.range((Included(Lsn(0)), Included(lsn)))
.next_back()
{
rels.insert(seg.rel);
if !layer.is_dropped() && l_start_lsn <= &lsn {
rels.insert(seg.rel);
}
}
}
}
@@ -271,15 +294,16 @@ impl LayerMap {
/// Is there a newer image layer for given segment?
///
/// This is used for garbage collection, to determine if an old layer can
/// be deleted. We ignore in-memory layers because they are not durable
/// on disk, and delta layers because they depend on an older layer.
/// be deleted.
pub fn newer_image_layer_exists(&self, seg: SegmentTag, lsn: Lsn) -> bool {
if let Some(segentry) = self.segs.get(&seg) {
// We only check on-disk layers, because
// in-memory layers are not durable
for (newer_lsn, layer) in segentry
.historic
.range((Included(lsn), Included(Lsn(u64::MAX))))
{
// Ignore delta layers.
// Ignore layers that depend on an older layer.
if layer.is_incremental() {
continue;
}

View File

@@ -97,24 +97,32 @@ pub enum PageReconstructResult {
}
///
/// A Layer holds all page versions for one segment of a relish, in a range of LSNs.
/// There are two kinds of layers, in-memory and snapshot layers. In-memory
/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access
/// to the recent page versions. Snaphot layers are stored on disk, and
/// to the recent page versions. On-disk layers are stored as files on disk, and
/// are immutable. This trait presents the common functionality of
/// in-memory and snapshot layers.
///
/// Each layer contains a full snapshot of the segment at the start
/// LSN. In addition to that, it contains WAL (or more page images)
/// needed to recontruct any page version up to the end LSN.
/// in-memory and on-disk layers.
///
pub trait Layer: Send + Sync {
// These functions identify the relish segment and the LSN range
// that this Layer holds.
/// Identify the timeline this relish belongs to
fn get_timeline_id(&self) -> ZTimelineId;
/// Identify the relish segment
fn get_seg_tag(&self) -> SegmentTag;
/// Inclusive start bound of the LSN range that this layer hold
fn get_start_lsn(&self) -> Lsn;
/// 'end_lsn' meaning depends on the layer kind:
/// - in-memory layer is either unbounded (end_lsn = MAX_LSN) or dropped (end_lsn = drop_lsn)
/// - image layer represents snapshot at one LSN, so end_lsn = lsn
/// - delta layer has end_lsn
///
/// TODO Is end_lsn always exclusive for all layer kinds?
fn get_end_lsn(&self) -> Lsn;
/// Is the segment represented by this layer dropped by PostgreSQL?
fn is_dropped(&self) -> bool;
/// Filename used to store this layer on disk. (Even in-memory layers

View File

@@ -24,7 +24,7 @@ pub fn init_logging(
if record.level().is_at_least(slog::Level::Info) {
return true;
}
false
true
});
let drain = std::sync::Mutex::new(drain).fuse();
let logger = slog::Logger::root(

View File

@@ -346,7 +346,7 @@ impl PageServerHandler {
pgb.write_message(&BeMessage::CopyOutResponse)?;
info!("sent CopyOut");
/* Send a tarball of the latest snapshot on the timeline */
/* Send a tarball of the latest layer on the timeline */
{
let mut writer = CopyDataSink { pgb };
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn);
@@ -582,18 +582,18 @@ impl postgres_backend::Handler for PageServerHandler {
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"snapshot_relfiles_total"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_relfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_relfiles_removed"),
RowDescriptor::int8_col(b"snapshot_relfiles_dropped"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_total"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_removed"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"layer_relfiles_total"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
RowDescriptor::int8_col(b"layer_relfiles_removed"),
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"elapsed"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[

View File

@@ -133,9 +133,8 @@ pub trait Timeline: Send + Sync {
/// Truncate relation
fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>;
/// Unlink relish.
/// This method is used for marking dropped relations and truncated SLRU segments
fn put_unlink(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
/// This method is used for marking dropped relations and truncated SLRU files
fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
/// Track end of the latest digested WAL record.
///
@@ -229,6 +228,8 @@ mod tests {
forknum: 0,
});
const TESTDB: u32 = 111;
/// Convenience function to create a page image with given string as the only content
#[allow(non_snake_case)]
fn TEST_IMG(s: &str) -> Bytes {
@@ -417,6 +418,53 @@ mod tests {
Ok(())
}
///
/// Test list_rels() function, with branches and unlinking
///
#[test]
fn test_list_rels_and_unlink() -> Result<()> {
let repo = get_test_repo("test_unlink")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0x00))?;
// Import initial dummy checkpoint record, otherwise the get_timeline() call
// after branching fails below
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_PAGE.clone())?;
// Create a relation on the timeline
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
// Check that list_rels() lists it after LSN 2, but no before it.
let reltag = match TESTREL_A {
RelishTag::Relation(reltag) => reltag,
_ => panic!("unexpected relish")
};
assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&reltag));
assert!(tline.list_rels(0, TESTDB, Lsn(0x20))?.contains(&reltag));
assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&reltag));
// Create a branch, check that the relation is visible there
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
let newtline = repo.get_timeline(newtimelineid)?;
assert!(newtline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&reltag));
// Unlink it on the branch
newtline.drop_relish(TESTREL_A, Lsn(0x40))?;
// Check that it's no longer listed on the branch after the point where it was unlinked
assert!(newtline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&reltag));
assert!(!newtline.list_rels(0, TESTDB, Lsn(0x40))?.contains(&reltag));
// Run checkpoint and garbage collection and check that it's still not visible
newtline.checkpoint()?;
repo.gc_iteration(Some(newtimelineid), 0, true)?;
assert!(!newtline.list_rels(0, TESTDB, Lsn(0x40))?.contains(&reltag));
Ok(())
}
///
/// Test branch creation
///

View File

@@ -45,7 +45,6 @@ pub fn import_timeline_from_postgres_datadir(
match direntry.file_name().to_str() {
None => continue,
// These special files appear in the snapshot, but are not needed by the page server
Some("pg_control") => {
import_nonrel_file(timeline, lsn, RelishTag::ControlFile, &direntry.path())?;
// Extract checkpoint record from pg_control and store is as separate object
@@ -93,7 +92,6 @@ pub fn import_timeline_from_postgres_datadir(
match direntry.file_name().to_str() {
None => continue,
// These special files appear in the snapshot, but are not needed by the page server
Some("PG_VERSION") => continue,
Some("pg_filenode.map") => import_nonrel_file(
timeline,
@@ -153,7 +151,7 @@ fn import_relfile(
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
if let Err(e) = p {
warn!("unrecognized file in snapshot: {:?} ({})", path, e);
warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
return Err(e.into());
}
let (relnode, forknum, segno) = p.unwrap();
@@ -397,15 +395,15 @@ pub fn save_decoded_record(
for tablespace_id in dropdb.tablespace_ids {
let rels = timeline.list_rels(tablespace_id, dropdb.db_id, lsn)?;
for rel in rels {
timeline.put_unlink(RelishTag::Relation(rel), lsn)?;
timeline.drop_relish(RelishTag::Relation(rel), lsn)?;
}
trace!(
"Unlink FileNodeMap {}, {} at lsn {}",
"Drop FileNodeMap {}, {} at lsn {}",
tablespace_id,
dropdb.db_id,
lsn
);
timeline.put_unlink(
timeline.drop_relish(
RelishTag::FileNodeMap {
spcnode: tablespace_id,
dbnode: dropdb.db_id,
@@ -448,12 +446,12 @@ pub fn save_decoded_record(
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
trace!(
"unlink twophaseFile for xid {} parsed_xact.xid {} here at {}",
"Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
decoded.xl_xid,
parsed_xact.xid,
lsn
);
timeline.put_unlink(
timeline.drop_relish(
RelishTag::TwoPhase {
xid: parsed_xact.xid,
},
@@ -733,7 +731,7 @@ fn save_xact_record(
dbnode: xnode.dbnode,
relnode: xnode.relnode,
};
timeline.put_unlink(RelishTag::Relation(rel), lsn)?;
timeline.drop_relish(RelishTag::Relation(rel), lsn)?;
}
}
Ok(())
@@ -775,7 +773,7 @@ fn save_clog_truncate_record(
return Ok(());
}
// Iterate via SLRU CLOG segments and unlink segments that we're ready to truncate
// Iterate via SLRU CLOG segments and drop segments that we're ready to truncate
// TODO This implementation is very inefficient -
// it scans all non-rels only to find Clog
//
@@ -790,8 +788,8 @@ fn save_clog_truncate_record(
if slru == SlruKind::Clog {
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
timeline.put_unlink(RelishTag::Slru { slru, segno }, lsn)?;
trace!("unlink CLOG segment {:>04X} at lsn {}", segno, lsn);
timeline.drop_relish(RelishTag::Slru { slru, segno }, lsn)?;
trace!("Drop CLOG segment {:>04X} at lsn {}", segno, lsn);
}
}
}
@@ -894,7 +892,7 @@ fn save_multixact_truncate_record(
// Delete all the segments except the last one. The last segment can still
// contain, possibly partially, valid data.
while segment != endsegment {
timeline.put_unlink(
timeline.drop_relish(
RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno: segment as u32,