Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-23 05:12:56 +00:00

Compare commits: parallel_w… → dropped_re… (5 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 5e91d15f06 | |
| | 8c46178256 | |
| | 2a4d405f96 | |
| | 66174c0342 | |
| | cee366ac17 | |
```diff
@@ -26,7 +26,7 @@ A checkpoint record in the WAL marks a point in the WAL sequence at which it is
 NOTE: This is an overloaded term.
 
 Whenever enough WAL has been accumulated in memory, the page server []
-writes out the changes in memory into new layer files[]. This process
+writes out the changes from in-memory layers into new layer files[]. This process
 is called "checkpointing". The page server only creates layer files for
 relations that have been modified since the last checkpoint.
```
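To make the checkpointing rule above concrete, here is a minimal Rust sketch of the trigger logic. The names (`CheckpointState`, `checkpoint_distance`) are illustrative stand-ins, not the actual pageserver types:

```rust
/// Illustrative sketch only: decide when enough WAL has accumulated
/// in memory to justify writing out new layer files ("checkpointing").
struct CheckpointState {
    accumulated_wal_bytes: u64, // WAL ingested since the last checkpoint
    checkpoint_distance: u64,   // configured threshold
}

impl CheckpointState {
    /// Account for one ingested WAL record; returns true when it is
    /// time to checkpoint.
    fn record_wal(&mut self, record_len: u64) -> bool {
        self.accumulated_wal_bytes += record_len;
        self.accumulated_wal_bytes >= self.checkpoint_distance
    }

    /// Reset after the in-memory layers have been written out. Only
    /// segments modified since the last checkpoint got new files.
    fn checkpoint_done(&mut self) {
        self.accumulated_wal_bytes = 0;
    }
}

fn main() {
    let mut st = CheckpointState { accumulated_wal_bytes: 0, checkpoint_distance: 1024 };
    assert!(!st.record_wal(512)); // below the threshold: keep buffering
    assert!(st.record_wal(512));  // threshold reached: checkpoint now
    st.checkpoint_done();
}
```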
```diff
@@ -41,17 +41,28 @@ Stateless Postgres node that stores data in pageserver.
 Each of the separate segmented file sets in which a relation is stored. The main fork is where the actual data resides. There also exist two secondary forks for metadata: the free space map and the visibility map.
 Each PostgreSQL fork is considered a separate relish.
 
-### Layer file
+### Layer
+
+Each layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
+There are two kinds of layers, in-memory and on-disk layers. In-memory
+layers are used to ingest incoming WAL, and provide fast access
+to the recent page versions. On-disk layers are stored as files on disk, and
+are immutable.
+
+### Layer file (on-disk layer)
 
 Layered repository on-disk format is based on immutable files. The
-files are called "layer files". Each file corresponds to one 10 MB
+files are called "layer files". Each file corresponds to one RELISH_SEG_SIZE
 segment of a PostgreSQL relation fork. There are two kinds of layer
 files: image files and delta files. An image file contains a
 "snapshot" of the segment at a particular LSN, and a delta file
 contains WAL records applicable to the segment, in a range of LSNs.
 
+### Layer map
+
+The layer map tracks what layers exist for all the relishes in a timeline.
+
 ### Layered repository
 
 Zenith repository implementation that keeps data in layers.
 
 ### LSN
```
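A sketch of the segment slicing these glossary entries describe: a relish is cut into RELISH_SEG_SIZE slices, and each layer covers one such slice. The constant's value below is an assumption for illustration (10 MB of 8 KB blocks), not the real definition:

```rust
/// Illustrative value only: 10 MB worth of 8 KB blocks per segment.
const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192; // = 1280 blocks

/// Which segment of the relish holds this block?
fn segno_for_block(blknum: u32) -> u32 {
    blknum / RELISH_SEG_SIZE
}

fn main() {
    assert_eq!(segno_for_block(0), 0);
    assert_eq!(segno_for_block(RELISH_SEG_SIZE - 1), 0); // last block of segment 0
    assert_eq!(segno_for_block(RELISH_SEG_SIZE), 1);     // first block of segment 1
}
```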
```diff
@@ -121,7 +132,7 @@ Each SLRU segment is considered a separate relish[].
 
 ### Tenant (Multitenancy)
 Tenant represents a single customer, interacting with Zenith.
-Wal redo[] activity, timelines[], snapshots[] are managed for each tenant independently.
+Wal redo[] activity, timelines[], layers[] are managed for each tenant independently.
 One pageserver[] can serve multiple tenants at once.
 One safekeeper
```
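A rough sketch of the multitenancy shape described above, using hypothetical stand-in types (`PageServer`, `TenantState`): one page server holds fully independent state per tenant.

```rust
use std::collections::HashMap;

/// Hypothetical stand-in: the per-tenant state a page server manages
/// independently (WAL redo, timelines, layers).
struct TenantState {
    timelines: Vec<String>, // timeline ids belonging to this tenant
}

/// Hypothetical stand-in: one page server serving many tenants.
struct PageServer {
    tenants: HashMap<String, TenantState>, // keyed by tenant id
}

impl PageServer {
    fn get_or_create_tenant(&mut self, tenant_id: &str) -> &mut TenantState {
        self.tenants
            .entry(tenant_id.to_string())
            .or_insert_with(|| TenantState { timelines: Vec::new() })
    }
}

fn main() {
    let mut ps = PageServer { tenants: HashMap::new() };
    // Tenant id taken from the directory listing below:
    ps.get_or_create_tenant("de182bc61fb11a5a6b390a8aed3a804a")
        .timelines
        .push("11223344556677881122334455667788".into());
}
```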
````diff
@@ -37,7 +37,7 @@ On the page server tenants introduce one level of indirection, so data directory
 ├── de182bc61fb11a5a6b390a8aed3a804a
 └── ee6016ec31116c1b7c33dfdfca38891f
 ```
-Wal redo activity, timelines, snapshots are managed for each tenant independently.
+Wal redo activity and timelines are managed for each tenant independently.
 
 For local environment used for example in tests there also new level of indirection for tenants. It touches `pgdatadirs` directory. Now it contains `tenants` subdirectory so the structure looks the following way:
````
```diff
@@ -583,7 +583,7 @@ impl Timeline for LayeredTimeline {
     }
 
     fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>> {
-        trace!("list_rels called at {}", lsn);
+        println!("list_rels called at {}", lsn);
 
         // List all rels in this timeline, and all its ancestors.
        let mut all_rels = HashSet::new();
```
```diff
@@ -675,14 +675,14 @@ impl Timeline for LayeredTimeline {
             (relsize - 1) / RELISH_SEG_SIZE
         };
 
-        // Unlink segments beyond the last remaining segment.
+        // Drop segments beyond the last remaining segment.
         for remove_segno in (last_remain_seg + 1)..=old_last_seg {
             let seg = SegmentTag {
                 rel,
                 segno: remove_segno,
             };
             let layer = self.get_layer_for_write(seg, lsn)?;
-            layer.put_unlink(lsn)?;
+            layer.drop_segment(lsn)?;
         }
 
         // Truncate the last remaining segment to the specified size
```
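The segment arithmetic this truncation code relies on, shown standalone; `SEG` is an illustrative stand-in for RELISH_SEG_SIZE:

```rust
/// Illustrative stand-in for RELISH_SEG_SIZE, in blocks.
const SEG: u32 = 1280;

/// For a non-empty relish of `relsize` blocks, the last occupied
/// segment is (relsize - 1) / SEG, exactly as in the diff above.
fn last_seg(relsize: u32) -> u32 {
    assert!(relsize > 0);
    (relsize - 1) / SEG
}

fn main() {
    // Truncating from 3000 blocks down to 1000 keeps segment 0 and
    // drops segments 1..=2, the range the loop above iterates over.
    let old_last_seg = last_seg(3000); // 2
    let last_remain_seg = last_seg(1000); // 0
    let dropped: Vec<u32> = ((last_remain_seg + 1)..=old_last_seg).collect();
    assert_eq!(dropped, vec![1, 2]);
}
```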
```diff
@@ -698,8 +698,8 @@ impl Timeline for LayeredTimeline {
         Ok(())
     }
 
-    fn put_unlink(&self, rel: RelishTag, lsn: Lsn) -> Result<()> {
-        trace!("put_unlink: {} at {}", rel, lsn);
+    fn drop_relish(&self, rel: RelishTag, lsn: Lsn) -> Result<()> {
+        trace!("drop_segment: {} at {}", rel, lsn);
 
         if rel.is_blocky() {
             let oldsize_opt = self.get_relish_size(rel, self.get_last_record_lsn())?;
```
```diff
@@ -710,25 +710,25 @@ impl Timeline for LayeredTimeline {
                 (oldsize - 1) / RELISH_SEG_SIZE
             };
 
-                // Unlink all segments
+                // Drop all segments of the relish
                 for remove_segno in 0..=old_last_seg {
                     let seg = SegmentTag {
                         rel,
                         segno: remove_segno,
                     };
                     let layer = self.get_layer_for_write(seg, lsn)?;
-                    layer.put_unlink(lsn)?;
+                    layer.drop_segment(lsn)?;
                 }
             } else {
                 warn!(
-                    "put_unlink called on non-existent relish {} at {}",
+                    "drop_segment called on non-existent relish {} at {}",
                     rel, lsn
                 );
             }
         } else {
             let seg = SegmentTag::from_blknum(rel, 0);
             let layer = self.get_layer_for_write(seg, lsn)?;
-            layer.put_unlink(lsn)?;
+            layer.drop_segment(lsn)?;
         }
 
         Ok(())
```
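For context, a sketch of the caller's side of the renamed method, with simplified stand-in types; the real `RelishTag`, `Lsn`, and `Timeline` live in the pageserver crates:

```rust
/// Simplified stand-ins for the real pageserver types.
#[derive(Debug)]
#[allow(dead_code)]
enum RelishTag {
    Relation { relnode: u32 },
    TwoPhase { xid: u32 },
}

struct Timeline;

impl Timeline {
    /// Record that the relish was dropped as of `lsn`; reads at or
    /// beyond that LSN then see it as gone (stand-in body only).
    fn drop_relish(&self, rel: RelishTag, lsn: u64) -> Result<(), String> {
        println!("drop_relish: {:?} at {:#x}", rel, lsn);
        Ok(())
    }
}

fn main() -> Result<(), String> {
    let tline = Timeline;
    // e.g. WAL ingestion decodes a DROP TABLE affecting relnode 16384:
    tline.drop_relish(RelishTag::Relation { relnode: 16384 }, 0x40)?;
    Ok(())
}
```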
```diff
@@ -927,7 +927,6 @@ impl LayeredTimeline {
         assert!(layer.get_start_lsn() <= lsn);
 
         if layer.is_dropped() && layer.get_end_lsn() <= lsn {
-            // The segment was unlinked
             return Ok(None);
         }
```
```diff
@@ -988,7 +987,7 @@ impl LayeredTimeline {
         if prev_layer.get_timeline_id() != self.timelineid {
             // First modification on this timeline
             start_lsn = self.ancestor_lsn;
-            trace!(
+            println!(
                 "creating file for write for {} at branch point {}/{}",
                 seg,
                 self.timelineid,
```
```diff
@@ -996,14 +995,14 @@ impl LayeredTimeline {
             );
         } else {
             start_lsn = prev_layer.get_end_lsn();
-            trace!(
+            println!(
                 "creating file for write for {} after previous layer {}/{}",
                 seg,
                 self.timelineid,
                 start_lsn
             );
         }
-        trace!(
+        println!(
             "prev layer is at {}/{} - {}",
             prev_layer.get_timeline_id(),
             prev_layer.get_start_lsn(),
```
```diff
@@ -1019,7 +1018,7 @@ impl LayeredTimeline {
         )?;
     } else {
         // New relation.
-        trace!(
+        println!(
             "creating layer for write for new rel {} at {}/{}",
             seg,
             self.timelineid,
```
```diff
@@ -1031,6 +1030,7 @@ impl LayeredTimeline {
         }
 
         let mut layers = self.layers.lock().unwrap();
+        println!("before layers.insert_open {:?}", layer.filename());
         let layer_rc: Arc<InMemoryLayer> = Arc::new(layer);
         layers.insert_open(Arc::clone(&layer_rc));
```
```diff
@@ -1091,7 +1091,7 @@ impl LayeredTimeline {
             prev: prev_record_lsn,
         } = self.last_record_lsn.load();
 
-        trace!(
+        println!(
             "checkpointing timeline {} at {}",
             self.timelineid,
             last_record_lsn
```
```diff
@@ -1227,7 +1227,7 @@ impl LayeredTimeline {
         //
         // Determine for each file if it needs to be retained
         // FIXME: also scan open in-memory layers. Normally we cannot remove the
-        // latest layer of any seg, but if it was unlinked it's possible
+        // latest layer of any seg, but if it was dropped it's possible
         let mut layers = self.layers.lock().unwrap();
         'outer: for l in layers.iter_historic_layers() {
             let seg = l.get_seg_tag();
```
```diff
@@ -6,13 +6,14 @@ which pages they apply to, and accumulates the incoming changes in
 memory. Every now and then, the accumulated changes are written out to
 new files.
 
-The files are called "snapshot files". Each snapshot file corresponds
-to one 10 MB slice of a PostgreSQL relation fork. The snapshot files
+The files are called "layer files". Each layer file corresponds
+to one RELISH_SEG_SIZE slice of a PostgreSQL relation fork or
+non-rel file in a range of LSNs. The layer files
 for each timeline are stored in the timeline's subdirectory under
 .zenith/tenants/<tenantid>/timelines.
 
-There are two kind of snapshot file: base images, and deltas. A base
-image file contains a snapshot of a segment as it was at one LSN,
+There are two kind of layer file: base images, and deltas. A base
+image file contains a layer of a segment as it was at one LSN,
 whereas a delta file contains modifications to a segment - mostly in
 the form of WAL records - in a range of LSN
```
```diff
@@ -44,7 +45,7 @@ managed, except that the first part of file names is different.
 Internally, the relations and non-relation files that are managed in
 the versioned store are together called "relishes".
 
-If a file has been dropped, the last snapshot file for it is created
+If a file has been dropped, the last layer file for it is created
 with the _DROPPED suffix, e.g.
 
 rel_1663_13990_2609_0_10_000000000169C348_0000000001702000_DROPPED
```
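A minimal sketch of spotting the _DROPPED suffix on such a file name; deliberately much simpler than the real filename-parsing module:

```rust
/// Does this layer file record that the relish was dropped at its end LSN?
fn is_dropped_layer(filename: &str) -> bool {
    filename.ends_with("_DROPPED")
}

fn main() {
    assert!(is_dropped_layer(
        "rel_1663_13990_2609_0_10_000000000169C348_0000000001702000_DROPPED"
    ));
    assert!(!is_dropped_layer("main/orders_100_200"));
}
```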
```diff
@@ -67,7 +68,7 @@ for 'orders' table on 'main' branch is represented like this:
 main/orders_100_200
 
 
-# Creating snapshot files
+# Creating layer files
 
 Let's start with a simple example with a system that contains one
 branch called 'main' and two tables, 'orders' and 'customers'. The end
```
```diff
@@ -86,10 +87,10 @@ end of WAL at 250 are kept in memory. If the page server crashes, the
 latest records between 200-250 need to be re-read from the WAL.
 
 Whenever enough WAL has been accumulated in memory, the page server
-writes out the changes in memory into new snapshot files. This process
+writes out the changes in memory into new layer files. This process
 is called "checkpointing" (not to be confused with the PostgreSQL
 checkpoints, that's a different thing). The page server only creates
-snapshot files for relations that have been modified since the last
+layer files for relations that have been modified since the last
 checkpoint. For example, if the current end of WAL is at LSN 450, and
 the last checkpoint happened at LSN 400 but there hasn't been any
 recent changes to 'customers' table, you would have these files on
```
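The rule in that paragraph as a standalone sketch; the function and parameter names are hypothetical:

```rust
/// At a checkpoint, a segment gets a new layer file only if it has
/// buffered changes newer than the end LSN of its latest file.
fn needs_new_layer_file(last_file_end_lsn: u64, newest_change_lsn: Option<u64>) -> bool {
    match newest_change_lsn {
        Some(lsn) => lsn > last_file_end_lsn,
        None => false, // nothing buffered for this segment at all
    }
}

fn main() {
    // 'customers' unchanged since LSN 400: no new file at the LSN-450 checkpoint.
    assert!(!needs_new_layer_file(400, None));
    // 'orders' changed at LSN 430: write a new file covering 400..450.
    assert!(needs_new_layer_file(400, Some(430)));
}
```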
```diff
@@ -108,7 +109,7 @@ disk:
 
 If the customers table is modified later, a new file is created for it
 at the next checkpoint. The new file will cover the "gap" from the
-last snapshot file, so the LSN ranges are always contiguous:
+last layer file, so the LSN ranges are always contiguous:
 
 main/orders_100
 main/orders_100_200
```
```diff
@@ -130,13 +131,13 @@ page server needs to reconstruct the requested page, as it was at the
 requested LSN. To do that, the page server first checks the recent
 in-memory layer; if the requested page version is found there, it can
 be returned immediatedly without looking at the files on
-disk. Otherwise the page server needs to locate the snapshot file that
+disk. Otherwise the page server needs to locate the layer file that
 contains the requested page version.
 
 For example, if a request comes in for table 'orders' at LSN 250, the
 page server would load the 'main/orders_200_300' file into memory, and
 reconstruct and return the requested page from it, as it was at
-LSN 250. Because the snapshot file consists of a full image of the
+LSN 250. Because the layer file consists of a full image of the
 relation at the start LSN and the WAL, reconstructing the page
 involves replaying any WAL records applicable to the page between LSNs
 200-250, starting from the base image at LSN 200.
```
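A sketch of that reconstruction: start from the base image at the layer's start LSN and apply, in order, every WAL record for the page up to the requested LSN. The "apply" step here is a stand-in for the real WAL-redo machinery:

```rust
/// Stand-in for a WAL record applicable to one page.
struct WalRecord {
    lsn: u64,
    payload: u8, // stand-in for the record body
}

/// Replay records over the base image, stopping at the requested LSN.
fn reconstruct_page(base_image: Vec<u8>, records: &[WalRecord], req_lsn: u64) -> Vec<u8> {
    let mut page = base_image;
    for rec in records.iter().filter(|r| r.lsn <= req_lsn) {
        // A real implementation hands the record to the WAL-redo
        // process to update the 8 KB page image; we just append.
        page.push(rec.payload);
    }
    page
}

fn main() {
    let base = vec![0u8]; // base image at LSN 200
    let recs = [WalRecord { lsn: 220, payload: 1 }, WalRecord { lsn: 260, payload: 2 }];
    // A request at LSN 250 replays only the record at LSN 220:
    assert_eq!(reconstruct_page(base, &recs, 250), vec![0, 1]);
}
```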
```diff
@@ -171,7 +172,7 @@ Then, the 'orders' table is updated differently on the 'main' and
 
 Because the 'customers' table hasn't been modified on the child
 branch, there is no file for it there. If you request a page for it on
-the 'child' branch, the page server will not find any snapshot file
+the 'child' branch, the page server will not find any layer file
 for it in the 'child' directory, so it will recurse to look into the
 parent 'main' branch instead.
```
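A sketch of that lookup, with simplified stand-in types: if the child timeline has no layer for the segment, the search falls through to the parent.

```rust
/// Simplified stand-in for a timeline with an optional ancestor branch.
struct Tline<'a> {
    layers: Vec<&'a str>,            // segments that have a local layer
    ancestor: Option<&'a Tline<'a>>, // parent branch, if any
}

fn find_layer<'a>(tline: &Tline<'a>, seg: &str) -> Option<&'a str> {
    if let Some(l) = tline.layers.iter().copied().find(|&l| l == seg) {
        return Some(l);
    }
    // Not found locally: recurse into the parent branch.
    tline.ancestor.and_then(|parent| find_layer(parent, seg))
}

fn main() {
    let main_branch = Tline { layers: vec!["customers", "orders"], ancestor: None };
    let child = Tline { layers: vec!["orders"], ancestor: Some(&main_branch) };
    // 'customers' was never modified on the child, so the lookup
    // falls through to 'main':
    assert_eq!(find_layer(&child, "customers"), Some("customers"));
}
```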
```diff
@@ -217,7 +218,7 @@ branch at a historic LSN, is how we support PITR in Zenith.
 
 # Garbage collection
 
-In this scheme, we keep creating new snapshot files over time. We also
+In this scheme, we keep creating new layer files over time. We also
 need a mechanism to remove old files that are no longer needed,
 because disk space isn't infinite.
```
```diff
@@ -245,7 +246,7 @@ of the branch is LSN 525, so that the GC horizon is currently at
 main/customers_200
 
 We can remove the following files because the end LSNs of those files are
-older than GC horizon 375, and there are more recent snapshot files for the
+older than GC horizon 375, and there are more recent layer files for the
 table:
 
 main/orders_100 DELETE
```
```diff
@@ -262,7 +263,7 @@ table:
 main/customers_200 KEEP, NO NEWER VERSION
 
 'main/customers_100_200' is old enough, but it cannot be
-removed because there is no newer snapshot file for the table.
+removed because there is no newer layer file for the table.
 
 Things get slightly more complicated with multiple branches. All of
 the above still holds, but in addition to recent files we must also
```
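The two-part rule above as a standalone sketch, with hypothetical names: a layer file is removable only if it ends before the GC horizon and a newer layer file exists for the same relation.

```rust
/// A file whose end LSN is older than the horizon is still kept when
/// it is the newest version we have of that relation.
fn can_remove(file_end_lsn: u64, gc_horizon: u64, newer_file_exists: bool) -> bool {
    file_end_lsn < gc_horizon && newer_file_exists
}

fn main() {
    // main/orders_100_200: older than horizon 375, newer files exist -> DELETE
    assert!(can_remove(200, 375, true));
    // main/customers_100_200: old enough, but no newer file -> KEEP
    assert!(!can_remove(200, 375, false));
}
```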
```diff
@@ -308,7 +309,7 @@ new base image and delta file for it on the child:
 
 After this, the 'main/orders_100' and 'main/orders_100_200' file could
 be removed. It is no longer needed by the child branch, because there
-is a newer snapshot file there. TODO: This optimization hasn't been
+is a newer layer file there. TODO: This optimization hasn't been
 implemented! The GC algorithm will currently keep the file on the
 'main' branch anyway, for as long as the child branch exists.
```
```diff
@@ -346,7 +347,7 @@ It would also be OK to have overlapping LSN ranges for the same relation:
 main/orders_300_400
 main/orders_400
 
-The code that reads the snapshot files should cope with this, but this
+The code that reads the layer files should cope with this, but this
 situation doesn't arise either, because the checkpointing code never
 does that. It could be useful, however, as a transient state when
 garbage collecting around branch points, or explicit recovery
```
```diff
@@ -360,6 +361,6 @@ points. For example, if we start with this:
 
 And there is a branch or explicit recovery point at LSN 150, we could
 replace 'main/orders_100_200' with 'main/orders_150' to keep a
-snapshot only at that exact point that's still needed, removing the
+layer only at that exact point that's still needed, removing the
 other page versions around it. But such compaction has not been
 implemented yet.
```
```diff
@@ -174,7 +174,7 @@ impl Layer for DeltaLayer {
 
     {
         // Open the file and lock the metadata in memory
-        // TODO: avoid opening the snapshot file for each read
+        // TODO: avoid opening the file for each read
         let (_path, book) = self.open_book()?;
         let page_version_reader = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
         let inner = self.load()?;
```
```diff
@@ -251,6 +251,8 @@ impl Layer for DeltaLayer {
 
     /// Does this segment exist at given LSN?
     fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
+        assert!(lsn >= self.start_lsn);
+
         // Is the requested LSN after the rel was dropped?
         if self.dropped && lsn >= self.end_lsn {
             return Ok(false);
```
```diff
@@ -2,8 +2,9 @@
 //! It is stored in a file on disk.
 //!
 //! On disk, the image files are stored in timelines/<timelineid> directory.
-//! Currently, there are no subdirectories, and each snapshot file is named like this:
+//! Currently, there are no subdirectories, and each image layer file is named like this:
 //!
+//! Note that segno is
 //! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<LSN>
 //!
 //! For example:
```
```diff
@@ -15,10 +16,10 @@
 //! Only metadata is loaded into memory by the load function.
 //! When images are needed, they are read directly from disk.
 //!
-//! For blocky segments, the images are stored in BLOCKY_IMAGES_CHAPTER.
+//! For blocky relishes, the images are stored in BLOCKY_IMAGES_CHAPTER.
 //! All the images are required to be BLOCK_SIZE, which allows for random access.
 //!
-//! For non-blocky segments, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
+//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
 //!
 use crate::layered_repository::filename::{ImageFileName, PathOrConf};
 use crate::layered_repository::storage_layer::{
```
```diff
@@ -156,7 +157,9 @@ impl Layer for ImageLayer {
     }
 
     /// Get size of the segment
-    fn get_seg_size(&self, _lsn: Lsn) -> Result<u32> {
+    fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
+        assert!(lsn >= self.lsn);
+
         let inner = self.load()?;
         match inner.image_type {
             ImageType::Blocky { num_blocks } => Ok(num_blocks),
```
```diff
@@ -165,7 +168,8 @@ impl Layer for ImageLayer {
     }
 
     /// Does this segment exist at given LSN?
-    fn get_seg_exists(&self, _lsn: Lsn) -> Result<bool> {
+    fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
+        assert!(lsn >= self.lsn);
         Ok(true)
     }
```
```diff
@@ -198,6 +198,8 @@ impl Layer for InMemoryLayer {
 
     /// Does this segment exist at given LSN?
     fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
+        assert!(lsn >= self.start_lsn);
+
         let inner = self.inner.lock().unwrap();
 
         // Is the requested LSN after the segment was dropped?
```
```diff
@@ -239,8 +241,8 @@ impl Layer for InMemoryLayer {
             .unwrap_or_default();
 
         println!(
-            "----- in-memory layer for {} {}-{} ----",
-            self.seg, self.start_lsn, end_str
+            "----- in-memory layer for tli {} seg {} {}-{} ----",
+            self.timelineid, self.seg, self.start_lsn, end_str
         );
 
         for (k, v) in inner.segsizes.iter() {
```
```diff
@@ -271,7 +273,7 @@ impl InMemoryLayer {
         start_lsn: Lsn,
         oldest_pending_lsn: Lsn,
     ) -> Result<InMemoryLayer> {
-        trace!(
+        println!(
             "initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
             seg,
             timelineid,
```
```diff
@@ -325,7 +327,7 @@ impl InMemoryLayer {
     pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<()> {
         assert!(self.seg.blknum_in_seg(blknum));
 
-        trace!(
+        println!(
             "put_page_version blk {} of {} at {}/{}",
             blknum,
             self.seg.rel,
```
```diff
@@ -352,7 +354,7 @@ impl InMemoryLayer {
         // which we've just acquired above
         let oldsize = inner.get_seg_size(lsn);
         if newsize > oldsize {
-            trace!(
+            println!(
                 "enlarging segment {} from {} to {} blocks at {}",
                 self.seg,
                 oldsize,
```
```diff
@@ -380,13 +382,41 @@ impl InMemoryLayer {
     }
 
     /// Remember that the segment was dropped at given LSN
-    pub fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()> {
+    pub fn drop_segment(&self, lsn: Lsn) -> anyhow::Result<()> {
         let mut inner = self.inner.lock().unwrap();
 
         assert!(inner.drop_lsn.is_none());
         inner.drop_lsn = Some(lsn);
 
-        info!("dropped segment {} at {}", self.seg, lsn);
+        println!("dropped segment {} at {}", self.seg, lsn);
+
+        let end_str = inner
+            .drop_lsn
+            .as_ref()
+            .map(|drop_lsn| drop_lsn.to_string())
+            .unwrap_or_default();
+
+        {
+            let mut result = format!(
+                "----- inmemory layer tli {} for {} {}-{} ----\n",
+                self.timelineid, self.seg, self.start_lsn, end_str
+            );
+
+            for (k, v) in inner.segsizes.iter() {
+                result += &format!("{}: {}\n", k, v);
+            }
+            for (k, v) in inner.page_versions.iter() {
+                result += &format!(
+                    "blk {} at {}: {}/{}\n",
+                    k.0,
+                    k.1,
+                    v.page_image.is_some(),
+                    v.record.is_some()
+                );
+            }
+
+            println!("{}", result);
+        }
 
         Ok(())
     }
```
```diff
@@ -405,7 +435,7 @@ impl InMemoryLayer {
     ) -> Result<InMemoryLayer> {
         let seg = src.get_seg_tag();
 
-        trace!(
+        println!(
             "initializing new InMemoryLayer for writing {} on timeline {} at {}",
             seg,
             timelineid,
```
```diff
@@ -436,7 +466,7 @@ impl InMemoryLayer {
     }
 
     ///
-    /// Write the this in-memory layer to disk, as a snapshot layer.
+    /// Write the this in-memory layer to disk.
     ///
     /// The cutoff point for the layer that's written to disk is 'end_lsn'.
     ///
```
```diff
@@ -455,7 +485,7 @@ impl InMemoryLayer {
         // This is needed just to call materialize_page()
         timeline: &LayeredTimeline,
     ) -> Result<(Vec<Arc<dyn Layer>>, Option<Arc<InMemoryLayer>>)> {
-        info!(
+        println!(
             "freezing in memory layer for {} on timeline {} at {}",
             self.seg, self.timelineid, cutoff_lsn
         );
```
```diff
@@ -531,7 +561,7 @@ impl InMemoryLayer {
         )?;
         let delta_layer_rc: Arc<dyn Layer> = Arc::new(delta_layer);
         frozen_layers.push(delta_layer_rc);
-        trace!(
+        println!(
             "freeze: created delta layer {} {}-{}",
             self.seg,
             self.start_lsn,
```
```diff
@@ -547,7 +577,7 @@ impl InMemoryLayer {
         let imgfile = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?;
         let imgfile_rc: Arc<dyn Layer> = Arc::new(imgfile);
         frozen_layers.push(Arc::clone(&imgfile_rc));
-        trace!("freeze: created image layer {} at {}", self.seg, end_lsn);
+        println!("freeze: created image layer {} at {}", self.seg, end_lsn);
 
         // If there were any page versions newer than the cutoff, initialize a new in-memory
         // layer to hold them
```
```diff
@@ -564,7 +594,7 @@ impl InMemoryLayer {
         new_inner.page_versions.append(&mut after_page_versions);
         new_inner.segsizes.append(&mut after_segsizes);
         drop(new_inner);
-        trace!("freeze: created new in-mem layer {} {}-", self.seg, end_lsn);
+        println!("freeze: created new in-mem layer {} {}-", self.seg, end_lsn);
 
         new_open_rc = Some(Arc::new(new_open))
     }
```
```diff
@@ -576,13 +606,22 @@ impl InMemoryLayer {
     /// debugging function to print out the contents of the layer
     #[allow(unused)]
     pub fn dump(&self) -> String {
-        let mut result = format!(
-            "----- inmemory layer for {} {}-> ----\n",
-            self.seg, self.start_lsn
-        );
-
         let inner = self.inner.lock().unwrap();
 
+        let end_str = inner
+            .drop_lsn
+            .as_ref()
+            .map(|drop_lsn| drop_lsn.to_string())
+            .unwrap_or_default();
+
+        let mut result = format!(
+            "----- dump inmemory layer for tli {} seg {} {}-{} ----",
+            self.timelineid, self.seg, self.start_lsn, end_str
+        );
+
         for (k, v) in inner.segsizes.iter() {
             result += &format!("{}: {}\n", k, v);
         }
```
```diff
@@ -596,6 +635,8 @@ impl InMemoryLayer {
             );
         }
 
+        println!("inner println {}", result);
+
         result
     }
 }
```
```diff
@@ -1,5 +1,5 @@
 //!
-//! The layer map tracks what layers exist for all the relations in a timeline.
+//! The layer map tracks what layers exist for all the relishes in a timeline.
 //!
 //! When the timeline is first accessed, the server lists of all layer files
 //! in the timelines/<timelineid> directory, and populates this map with
```
```diff
@@ -149,6 +149,7 @@ impl LayerMap {
         if let Some(segentry) = self.segs.get_mut(&tag) {
             if let Some(_old) = &segentry.open {
                 // FIXME: shouldn't exist, but check
+                println!("insert_open() // FIXME: shouldn't exist, but check");
             }
             segentry.open = Some(Arc::clone(&layer));
         } else {
```
```diff
@@ -220,6 +221,8 @@ impl LayerMap {
     pub fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>> {
         let mut rels: HashSet<RelTag> = HashSet::new();
 
+        println!("layer list_rels()");
+
         for (seg, segentry) in self.segs.iter() {
             if let RelishTag::Relation(reltag) = seg.rel {
                 if (spcnode == 0 || reltag.spcnode == spcnode)
```
```diff
@@ -227,15 +230,33 @@ impl LayerMap {
                 {
                     // Add only if it exists at the requested LSN.
                     if let Some(open) = &segentry.open {
-                        if open.get_end_lsn() > lsn {
+                        println!("explore segentry.open {:?}", open.filename());
+
+                        if open.get_end_lsn() > lsn && open.get_start_lsn() <= lsn {
                             rels.insert(reltag);
+                            println!("found rel {} at lsn {} in open layer start_lsn {} end lsn {}",
+                                reltag, lsn, open.get_start_lsn(), open.get_end_lsn());
+                            println!("{}", open.dump());
                         }
-                    } else if let Some((_k, _v)) = segentry
+                        else
+                        {
+                            println!("found DROPPED rel {} at lsn {} in open layer start_lsn {} end lsn {}",
+                                reltag, lsn, open.get_start_lsn(), open.get_end_lsn());
+                            println!("{}", open.dump());
+                        }
+                    } else if let Some((l_start_lsn, layer)) = segentry
                         .historic
                         .range((Included(Lsn(0)), Included(lsn)))
                         .next_back()
                     {
-                        rels.insert(reltag);
+                        println!("explore historic segment {:?}", layer.filename());
+
+                        if !layer.is_dropped() && l_start_lsn <= &lsn {
+                            rels.insert(reltag);
+                            println!("found rel {} in historic layer", reltag);
+                            layer.dump()?;
+                        }
                     }
                 }
             }
```
```diff
@@ -253,15 +274,17 @@ impl LayerMap {
             } else {
                 // Add only if it exists at the requested LSN.
                 if let Some(open) = &segentry.open {
-                    if open.get_end_lsn() > lsn {
+                    if open.get_end_lsn() > lsn && open.get_start_lsn() <= lsn {
                         rels.insert(seg.rel);
                     }
-                } else if let Some((_k, _v)) = segentry
+                } else if let Some((l_start_lsn, layer)) = segentry
                     .historic
                     .range((Included(Lsn(0)), Included(lsn)))
                     .next_back()
                 {
-                    rels.insert(seg.rel);
+                    if !layer.is_dropped() && l_start_lsn <= &lsn {
+                        rels.insert(seg.rel);
+                    }
                 }
             }
         }
```
```diff
@@ -271,15 +294,16 @@ impl LayerMap {
     /// Is there a newer image layer for given segment?
     ///
     /// This is used for garbage collection, to determine if an old layer can
-    /// be deleted. We ignore in-memory layers because they are not durable
-    /// on disk, and delta layers because they depend on an older layer.
+    /// be deleted.
     pub fn newer_image_layer_exists(&self, seg: SegmentTag, lsn: Lsn) -> bool {
         if let Some(segentry) = self.segs.get(&seg) {
+            // We only check on-disk layers, because
+            // in-memory layers are not durable
             for (newer_lsn, layer) in segentry
                 .historic
                 .range((Included(lsn), Included(Lsn(u64::MAX))))
             {
-                // Ignore delta layers.
+                // Ignore layers that depend on an older layer.
                 if layer.is_incremental() {
                     continue;
                 }
```
```diff
@@ -97,24 +97,32 @@ pub enum PageReconstructResult {
 }
 
 ///
-/// A Layer holds all page versions for one segment of a relish, in a range of LSNs.
-/// There are two kinds of layers, in-memory and snapshot layers. In-memory
+/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
+/// There are two kinds of layers, in-memory and on-disk layers. In-memory
 /// layers are used to ingest incoming WAL, and provide fast access
-/// to the recent page versions. Snaphot layers are stored on disk, and
+/// to the recent page versions. On-disk layers are stored as files on disk, and
 /// are immutable. This trait presents the common functionality of
-/// in-memory and snapshot layers.
-///
-/// Each layer contains a full snapshot of the segment at the start
-/// LSN. In addition to that, it contains WAL (or more page images)
-/// needed to recontruct any page version up to the end LSN.
+/// in-memory and on-disk layers.
 ///
 pub trait Layer: Send + Sync {
     // These functions identify the relish segment and the LSN range
     // that this Layer holds.
+
+    /// Identify the timeline this relish belongs to
     fn get_timeline_id(&self) -> ZTimelineId;
 
+    /// Identify the relish segment
     fn get_seg_tag(&self) -> SegmentTag;
 
+    /// Inclusive start bound of the LSN range that this layer hold
     fn get_start_lsn(&self) -> Lsn;
 
+    /// 'end_lsn' meaning depends on the layer kind:
+    /// - in-memory layer is either unbounded (end_lsn = MAX_LSN) or dropped (end_lsn = drop_lsn)
+    /// - image layer represents snapshot at one LSN, so end_lsn = lsn
+    /// - delta layer has end_lsn
+    ///
+    /// TODO Is end_lsn always exclusive for all layer kinds?
     fn get_end_lsn(&self) -> Lsn;
 
+    /// Is the segment represented by this layer dropped by PostgreSQL?
     fn is_dropped(&self) -> bool;
 
     /// Filename used to store this layer on disk. (Even in-memory layers
```
```diff
@@ -24,7 +24,7 @@ pub fn init_logging(
         if record.level().is_at_least(slog::Level::Info) {
             return true;
         }
-        false
+        true
     });
     let drain = std::sync::Mutex::new(drain).fuse();
     let logger = slog::Logger::root(
```
```diff
@@ -346,7 +346,7 @@ impl PageServerHandler {
         pgb.write_message(&BeMessage::CopyOutResponse)?;
         info!("sent CopyOut");
 
-        /* Send a tarball of the latest snapshot on the timeline */
+        /* Send a tarball of the latest layer on the timeline */
         {
             let mut writer = CopyDataSink { pgb };
             let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn);
```
```diff
@@ -582,18 +582,18 @@ impl postgres_backend::Handler for PageServerHandler {
             let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
 
             pgb.write_message_noflush(&BeMessage::RowDescription(&[
-                RowDescriptor::int8_col(b"snapshot_relfiles_total"),
-                RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_cutoff"),
-                RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_branches"),
-                RowDescriptor::int8_col(b"snapshot_relfiles_not_updated"),
-                RowDescriptor::int8_col(b"snapshot_relfiles_removed"),
-                RowDescriptor::int8_col(b"snapshot_relfiles_dropped"),
-                RowDescriptor::int8_col(b"snapshot_nonrelfiles_total"),
-                RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_cutoff"),
-                RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_branches"),
-                RowDescriptor::int8_col(b"snapshot_nonrelfiles_not_updated"),
-                RowDescriptor::int8_col(b"snapshot_nonrelfiles_removed"),
-                RowDescriptor::int8_col(b"snapshot_nonrelfiles_dropped"),
+                RowDescriptor::int8_col(b"layer_relfiles_total"),
+                RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
+                RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
+                RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
+                RowDescriptor::int8_col(b"layer_relfiles_removed"),
+                RowDescriptor::int8_col(b"layer_relfiles_dropped"),
+                RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
+                RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
+                RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
+                RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
+                RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
+                RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
                 RowDescriptor::int8_col(b"elapsed"),
             ]))?
             .write_message_noflush(&BeMessage::DataRow(&[
```
```diff
@@ -133,9 +133,8 @@ pub trait Timeline: Send + Sync {
     /// Truncate relation
     fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>;
 
-    /// Unlink relish.
-    /// This method is used for marking dropped relations and truncated SLRU segments
-    fn put_unlink(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
+    /// This method is used for marking dropped relations and truncated SLRU files
+    fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
 
     /// Track end of the latest digested WAL record.
     ///
```
```diff
@@ -229,6 +228,8 @@ mod tests {
         forknum: 0,
     });
 
+    const TESTDB: u32 = 111;
+
     /// Convenience function to create a page image with given string as the only content
     #[allow(non_snake_case)]
     fn TEST_IMG(s: &str) -> Bytes {
```
```diff
@@ -417,6 +418,53 @@ mod tests {
         Ok(())
     }
 
+    ///
+    /// Test list_rels() function, with branches and unlinking
+    ///
+    #[test]
+    fn test_list_rels_and_unlink() -> Result<()> {
+        let repo = get_test_repo("test_unlink")?;
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        let tline = repo.create_empty_timeline(timelineid, Lsn(0x00))?;
+
+        // Import initial dummy checkpoint record, otherwise the get_timeline() call
+        // after branching fails below
+        tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_PAGE.clone())?;
+
+        // Create a relation on the timeline
+        tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
+
+        // Check that list_rels() lists it after LSN 2, but not before it.
+        let reltag = match TESTREL_A {
+            RelishTag::Relation(reltag) => reltag,
+            _ => panic!("unexpected relish"),
+        };
+        assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&reltag));
+        assert!(tline.list_rels(0, TESTDB, Lsn(0x20))?.contains(&reltag));
+        assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&reltag));
+
+        // Create a branch, check that the relation is visible there
+        let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
+        repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
+        let newtline = repo.get_timeline(newtimelineid)?;
+
+        assert!(newtline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&reltag));
+
+        // Unlink it on the branch
+        newtline.drop_relish(TESTREL_A, Lsn(0x40))?;
+
+        // Check that it's no longer listed on the branch after the point where it was unlinked
+        assert!(newtline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&reltag));
+        assert!(!newtline.list_rels(0, TESTDB, Lsn(0x40))?.contains(&reltag));
+
+        // Run checkpoint and garbage collection and check that it's still not visible
+        newtline.checkpoint()?;
+        repo.gc_iteration(Some(newtimelineid), 0, true)?;
+        assert!(!newtline.list_rels(0, TESTDB, Lsn(0x40))?.contains(&reltag));
+
+        Ok(())
+    }
+
     ///
     /// Test branch creation
     ///
```
```diff
@@ -45,7 +45,6 @@ pub fn import_timeline_from_postgres_datadir(
         match direntry.file_name().to_str() {
             None => continue,
 
-            // These special files appear in the snapshot, but are not needed by the page server
             Some("pg_control") => {
                 import_nonrel_file(timeline, lsn, RelishTag::ControlFile, &direntry.path())?;
                 // Extract checkpoint record from pg_control and store is as separate object
@@ -93,7 +92,6 @@ pub fn import_timeline_from_postgres_datadir(
         match direntry.file_name().to_str() {
             None => continue,
 
-            // These special files appear in the snapshot, but are not needed by the page server
             Some("PG_VERSION") => continue,
             Some("pg_filenode.map") => import_nonrel_file(
                 timeline,
```
```diff
@@ -153,7 +151,7 @@ fn import_relfile(
 
     let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
     if let Err(e) = p {
-        warn!("unrecognized file in snapshot: {:?} ({})", path, e);
+        warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
         return Err(e.into());
     }
     let (relnode, forknum, segno) = p.unwrap();
```
```diff
@@ -397,15 +395,15 @@ pub fn save_decoded_record(
             for tablespace_id in dropdb.tablespace_ids {
                 let rels = timeline.list_rels(tablespace_id, dropdb.db_id, lsn)?;
                 for rel in rels {
-                    timeline.put_unlink(RelishTag::Relation(rel), lsn)?;
+                    timeline.drop_relish(RelishTag::Relation(rel), lsn)?;
                 }
                 trace!(
-                    "Unlink FileNodeMap {}, {} at lsn {}",
+                    "Drop FileNodeMap {}, {} at lsn {}",
                     tablespace_id,
                     dropdb.db_id,
                     lsn
                 );
-                timeline.put_unlink(
+                timeline.drop_relish(
                     RelishTag::FileNodeMap {
                         spcnode: tablespace_id,
                         dbnode: dropdb.db_id,
```
```diff
@@ -448,12 +446,12 @@ pub fn save_decoded_record(
             save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
             // Remove twophase file. see RemoveTwoPhaseFile() in postgres code
             trace!(
-                "unlink twophaseFile for xid {} parsed_xact.xid {} here at {}",
+                "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
                 decoded.xl_xid,
                 parsed_xact.xid,
                 lsn
             );
-            timeline.put_unlink(
+            timeline.drop_relish(
                 RelishTag::TwoPhase {
                     xid: parsed_xact.xid,
                 },
```
```diff
@@ -733,7 +731,7 @@ fn save_xact_record(
                 dbnode: xnode.dbnode,
                 relnode: xnode.relnode,
             };
-            timeline.put_unlink(RelishTag::Relation(rel), lsn)?;
+            timeline.drop_relish(RelishTag::Relation(rel), lsn)?;
         }
     }
     Ok(())
```
```diff
@@ -775,7 +773,7 @@ fn save_clog_truncate_record(
         return Ok(());
     }
 
-    // Iterate via SLRU CLOG segments and unlink segments that we're ready to truncate
+    // Iterate via SLRU CLOG segments and drop segments that we're ready to truncate
     // TODO This implementation is very inefficient -
     // it scans all non-rels only to find Clog
     //
```
```diff
@@ -790,8 +788,8 @@ fn save_clog_truncate_record(
         if slru == SlruKind::Clog {
             let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
             if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
-                timeline.put_unlink(RelishTag::Slru { slru, segno }, lsn)?;
-                trace!("unlink CLOG segment {:>04X} at lsn {}", segno, lsn);
+                timeline.drop_relish(RelishTag::Slru { slru, segno }, lsn)?;
+                trace!("Drop CLOG segment {:>04X} at lsn {}", segno, lsn);
             }
         }
     }
```
```diff
@@ -894,7 +892,7 @@ fn save_multixact_truncate_record(
     // Delete all the segments except the last one. The last segment can still
     // contain, possibly partially, valid data.
     while segment != endsegment {
-        timeline.put_unlink(
+        timeline.drop_relish(
             RelishTag::Slru {
                 slru: SlruKind::MultiXactMembers,
                 segno: segment as u32,
```