Compare commits

...

22 Commits

Author SHA1 Message Date
Konstantin Knizhnik
59ea3973a4 Set hint bits in pageserver 2021-09-10 18:27:34 +03:00
Konstantin Knizhnik
08bc808043 Create branch just to run tests 2021-09-07 15:12:39 +03:00
Konstantin Knizhnik
ba563ee93e Revert "Bump postgres version"
This reverts commit 511873aaed.
2021-09-07 15:12:39 +03:00
anastasia
194b33ac3b print diff for mismatching files in check_restored_datadir_content() 2021-09-07 15:12:39 +03:00
Konstantin Knizhnik
a190c0eb88 Transaction commit redo handler should set TRANSACTION_STATUS_COMMITTED status for subtransactions, not TRANSACTION_STATUS_SUB_COMMITTED
Closes #535
2021-09-07 15:12:39 +03:00
anastasia
2b5405ac6e Add test function to compare files in compute nodes to catch bugs in SLRU replay.
Compare files in existing compute node's pgdata with fresh basebackup at the same lsn. We expect that content is identical, except tmp files
Use it after some tests.
2021-09-07 15:12:39 +03:00
Arseny Sher
1d75c827a0 Adapt safekeepers to --sync-safekeepers walproposer mode.
1) Do epoch switch without record from new epoch, immediately after recovery --
--sync-safekeepers mode doesn't generate new records.
2) Fix commit_lsn advancement by taking into account wal we have locally --
   setting it further is incorrect.
3) Report it back to walproposer so he knows when sync is done.
4) Remove system id check as it is unknown in sync mode.

And make logging slightly better.

ref #439
2021-09-07 15:12:39 +03:00
Stas Kelvich
e1e43f13df Make use of postgres --sync-safekeepers in tests and CLI.
Change control plane code to call `postgres --sync-safekeepers` before
compute node start when safekeepers are enabled. Now `pg create` will
create an empty data directory with the proper config file. Subsequent
`pg start` will run `sync-safekeepers` and will call basebackup with
the resulting LSN. Also change few tests to accommodate this new behavior.
2021-09-07 15:12:39 +03:00
Konstantin Knizhnik
b2e0490d5e Add description of Zenith changes in Postgres core (#533)
* Add description of Zenith changes in Postgres core

* Update README.md
2021-09-07 15:12:39 +03:00
Kirill Bulatov
1d3c86e17a Check rusage return code 2021-09-07 15:12:39 +03:00
Konstantin Knizhnik
e8c22488b9 Set proper xl_prev in basebackup, when possible.
In passing, fix two minor issues with basebackup:
* check that we can't create branches with pre-initdb LSN's
* normalize branch LSN's that are pointing to the segment boundary

patch by @knizhnik
closes #506
2021-09-07 15:12:39 +03:00
anastasia
9c1dbe3783 Add LayerMap.dump() function for debugging.
Print timelineid in layer dumps
2021-09-07 15:12:39 +03:00
anastasia
1365f8c703 Rename put_unlink() to drop_relish() in Timeline trait.
Rename put_unlink() to drop_segment() in Layer trait.
2021-09-07 15:12:39 +03:00
anastasia
df4ce15456 Improve comments for Layer trait. 2021-09-07 15:12:39 +03:00
anastasia
9ed4db273d Don't use term 'snapshot' to describe layers 2021-09-07 15:12:39 +03:00
Heikki Linnakangas
21cf4a3e11 Include # of bytes written in pgbench benchmark result
Now that the page server collects this metric (since commit 212920e47e),
let's include it in the performance test results

The new metric looks like this:

    performance/test_perf_pgbench.py .         [100%]
    --------------- Benchmark results ----------------
    test_pgbench.init: 6.784 s
    test_pgbench.pageserver_writes: 466 MB    <---- THIS IS NEW
    test_pgbench.5000_xacts: 8.196 s
    test_pgbench.size: 163 MB

    =============== 1 passed in 21.00s ===============
2021-09-07 15:12:39 +03:00
Heikki Linnakangas
2c10224c9a Partial fix for issue with extending relation with a gap.
This should fix the sporadic regression test failures we've been seeing
lately with "no base img found" errors.

This fixes the common case, but one corner case is still not handled:
If a relation is extended across a segment boundary, leaving a gap block
in the segment preceding the segment containing the target block, the
preceding segment will not be padded with zeros correctly. This adds
a test case for that, but it's commented out.

See github issue https://github.com/zenithdb/zenith/issues/500
2021-09-07 15:12:39 +03:00
Patrick Insinger
c33faf98d1 zenith_utils - box BidiStream::Tls variant
Clippy warns that one variant is 40 bytes and the other is 568 bytes.
Box the larger variant to avoid this warning
2021-09-07 15:12:39 +03:00
Dmitry Rodionov
95453bc4af fix clippy warnings 2021-09-07 15:12:39 +03:00
Kirill Bulatov
3a37877edc Fix some typos 2021-09-07 15:12:39 +03:00
Heikki Linnakangas
2145ec5fe8 Fix infinite loop with forced repository checkpoint.
To fix, break out of the loop when you reach an in-memory layer that was
created after the checkpoint started. To do that, add a "generation"
counter into the layer map.

Fixes https://github.com/zenithdb/zenith/issues/494
2021-09-07 15:12:39 +03:00
Konstantin Knizhnik
49d14cbde7 Create branch just to run tests 2021-09-07 13:32:45 +03:00
53 changed files with 1331 additions and 612 deletions

View File

@@ -4,14 +4,17 @@ use std::net::SocketAddr;
use std::net::TcpStream;
use std::os::unix::fs::PermissionsExt;
use std::process::Command;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use std::{collections::BTreeMap, path::PathBuf};
use anyhow::{Context, Result};
use lazy_static::lazy_static;
use postgres_ffi::pg_constants;
use regex::Regex;
use zenith_utils::connstring::connection_host_port;
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::ZTenantId;
use zenith_utils::zid::ZTimelineId;
@@ -86,7 +89,6 @@ impl ComputeControlPlane {
&mut self,
tenantid: ZTenantId,
branch_name: &str,
config_only: bool,
) -> Result<Arc<PostgresNode>> {
let timeline_id = self
.pageserver
@@ -101,25 +103,15 @@ impl ComputeControlPlane {
is_test: false,
timelineid: timeline_id,
tenantid,
uses_wal_proposer: false,
});
node.init_from_page_server(self.env.auth_type, config_only)?;
node.create_pgdata()?;
node.setup_pg_conf(self.env.auth_type)?;
self.nodes
.insert((tenantid, node.name.clone()), Arc::clone(&node));
// Configure the node to stream WAL directly to the pageserver
node.append_conf(
"postgresql.conf",
format!(
concat!(
"synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
"zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
),
node.connstr(),
)
.as_str(),
)?;
Ok(node)
}
}
@@ -135,6 +127,7 @@ pub struct PostgresNode {
is_test: bool,
pub timelineid: ZTimelineId,
pub tenantid: ZTenantId,
uses_wal_proposer: bool,
}
impl PostgresNode {
@@ -219,6 +212,8 @@ impl PostgresNode {
.parse()
.with_context(|| err_msg)?;
let uses_wal_proposer = config.contains("wal_acceptors");
// ok now
Ok(PostgresNode {
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
@@ -228,15 +223,48 @@ impl PostgresNode {
is_test: false,
timelineid,
tenantid,
uses_wal_proposer,
})
}
fn sync_walkeepers(&self) -> Result<Lsn> {
let pg_path = self.env.pg_bin_dir().join("postgres");
let sync_output = Command::new(pg_path)
.arg("--sync-safekeepers")
.env_clear()
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("PGDATA", self.pgdata().to_str().unwrap())
.output()
.with_context(|| "sync-walkeepers failed")?;
if !sync_output.status.success() {
anyhow::bail!(
"sync-walkeepers failed: '{}'",
String::from_utf8_lossy(&sync_output.stderr)
);
}
let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
println!("Walkeepers synced on {}", lsn);
Ok(lsn)
}
/// Get basebackup from the pageserver as a tar archive and extract it
/// to the `self.pgdata()` directory.
pub fn do_basebackup(&self) -> Result<()> {
let pgdata = self.pgdata();
fn do_basebackup(&self, lsn: Option<Lsn>) -> Result<()> {
println!(
"Extracting base backup to create postgres instance: path={} port={}",
self.pgdata().display(),
self.address.port()
);
let sql = if let Some(lsn) = lsn {
format!("basebackup {} {} {}", self.tenantid, self.timelineid, lsn)
} else {
format!("basebackup {} {}", self.tenantid, self.timelineid)
};
let sql = format!("basebackup {} {}", self.tenantid, self.timelineid);
let mut client = self
.pageserver
.page_server_psql_client()
@@ -248,47 +276,32 @@ impl PostgresNode {
// Read the archive directly from the `CopyOutReader`
tar::Archive::new(copyreader)
.unpack(&pgdata)
.unpack(&self.pgdata())
.with_context(|| "extracting page backup failed")?;
Ok(())
}
/// Connect to a pageserver, get basebackup, and untar it to initialize a
/// new data directory
pub fn init_from_page_server(&self, auth_type: AuthType, config_only: bool) -> Result<()> {
let pgdata = self.pgdata();
println!(
"Extracting base backup to create postgres instance: path={} port={}",
pgdata.display(),
self.address.port()
);
// initialize data directory
if self.is_test {
fs::remove_dir_all(&pgdata).ok();
}
fs::create_dir_all(&pgdata)
.with_context(|| format!("could not create data directory {}", pgdata.display()))?;
fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).with_context(
|| {
fn create_pgdata(&self) -> Result<()> {
fs::create_dir_all(&self.pgdata()).with_context(|| {
format!(
"could not create data directory {}",
self.pgdata().display()
)
})?;
fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700))
.with_context(|| {
format!(
"could not set permissions in data directory {}",
pgdata.display()
self.pgdata().display()
)
},
)?;
})
}
if config_only {
//Just create an empty config file
File::create(self.pgdata().join("postgresql.conf").to_str().unwrap())?;
} else {
self.do_basebackup()?;
fs::create_dir_all(self.pgdata().join("pg_wal"))?;
fs::create_dir_all(self.pgdata().join("pg_wal").join("archive_status"))?;
}
// Connect to a page server, get base backup, and untar it to initialize a
// new data directory
fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
File::create(self.pgdata().join("postgresql.conf").to_str().unwrap())?;
// wal_log_hints is mandatory when running against pageserver (see gh issue#192)
// TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
@@ -342,6 +355,40 @@ impl PostgresNode {
.as_str(),
)?;
// Configure the node to stream WAL directly to the pageserver
self.append_conf(
"postgresql.conf",
format!(
concat!(
"synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
"zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
),
self.connstr(),
)
.as_str(),
)?;
Ok(())
}
fn load_basebackup(&self) -> Result<()> {
let lsn = if self.uses_wal_proposer {
// LSN WAL_SEGMENT_SIZE means that it is bootstrap and we need to download just
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
// procedure evolves quite actively right now, so let's think about it again
// when things would be more stable (TODO).
let lsn = self.sync_walkeepers()?;
if lsn == Lsn(pg_constants::WAL_SEGMENT_SIZE as u64) {
None
} else {
Some(lsn)
}
} else {
None
};
self.do_basebackup(lsn)?;
Ok(())
}
@@ -408,38 +455,22 @@ impl PostgresNode {
}
// 1. We always start compute node from scratch, so
// if old dir exists, preserve config files and drop the directory
// XXX Now we only use 'postgresql.conf'.
// If we will need 'pg_hba.conf', support it here too
// if old dir exists, preserve 'postgresql.conf' and drop the directory
let postgresql_conf_path = self.pgdata().join("postgresql.conf");
let postgresql_conf = fs::read(postgresql_conf_path.clone()).with_context(|| {
let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| {
format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
)
})?;
println!(
"Destroying postgres data directory '{}'",
self.pgdata().to_str().unwrap()
);
fs::remove_dir_all(&self.pgdata())?;
self.create_pgdata()?;
// 2. Create new node
self.init_from_page_server(self.env.auth_type, false)?;
// 2. Bring back config files
fs::write(&postgresql_conf_path, postgresql_conf)?;
// 3. Bring back config files
if let Ok(mut file) = OpenOptions::new()
.append(false)
.write(true)
.open(&postgresql_conf_path)
{
file.write_all(&postgresql_conf)?;
file.sync_all()?;
}
// 3. Load basebackup
self.load_basebackup()?;
// 4. Finally start the compute node postgres
println!("Starting postgres node at '{}'", self.connstr());

View File

@@ -74,7 +74,10 @@ impl PageServerNode {
args.extend(&["--auth-type", "ZenithJWT"]);
}
create_tenant.map(|tenantid| args.extend(&["--create-tenant", tenantid]));
if let Some(tenantid) = create_tenant {
args.extend(&["--create-tenant", tenantid])
}
let status = cmd
.args(args)
.env_clear()

View File

@@ -11,3 +11,4 @@
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [walkeeper/README](/walkeeper/README.md) — WAL service overview.
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core

202
docs/core_changes.md Normal file
View File

@@ -0,0 +1,202 @@
1. Add t_cid to XLOG record
- Why?
The cmin/cmax on a heap page is a real bummer. I don't see any other way to fix that than bite the bullet and modify the WAL-logging routine to include the cmin/cmax.
To recap, the problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares about it anymore.
- Alternatives?
I don't know
2. Add PD_WAL_LOGGED.
- Why?
Postgres sometimes writes data to the page before it is wal-logged. If such a page is swapped out, we will lose this change. The problem is currently solved by setting the PD_WAL_LOGGED bit in the page header. When a page without this bit set is written to the SMGR, it is forced to be written to the WAL as an FPI using the log_newpage_copy() function.
There was a wrong assumption that it can happen only during construction of some exotic indexes (like gist). It is not true. The same situation can happen with COPY, VACUUM and when record hint bits are set.
- Discussion:
https://discord.com/channels/869525774699462656/882681420986851359
- Alternatives:
Do not store this flag in the page header, but associate this bit with the shared buffer. Logically it is more correct, but in practice we would get no advantages: neither in space nor in CPU overhead.
3. XLogReadBufferForRedo not always loads and pins requested buffer. So we need to add extra checks that buffer is really pinned. Also do not use BufferGetBlockNumber for buffer returned by XLogReadBufferForRedo.
- Why?
XLogReadBufferForRedo is not pinning pages which are not requested by wal-redo. It is specific only for wal-redo Postgres.
- Alternatives?
No
4. Eliminate reporting of some warnings related with hint bits, for example
"page is not marked all-visible but visibility map bit is set in relation".
- Why?
Hint bits may not be WAL logged.
- Alternative?
Always wal log any page changes.
5. Maintain last written LSN.
- Why?
When the compute node requests a page from the page server, we need to specify an LSN. Ideally it should be the LSN
of the WAL record performing the last update of this page. But we do not know it, because we do not have the page.
We could use the current WAL flush position, but in this case there is a high probability that the page server
would be blocked until this piece of WAL is delivered.
As a better approximation we can keep the max LSN of written pages. It would be better to take into account LSNs only of evicted pages,
but the SMGR API doesn't provide such knowledge.
- Alternatives?
Maintain map of LSNs of evicted pages.
6. Launching Postgres without WAL.
- Why?
According to the Zenith architecture, the compute node is stateless. So when we are launching a
compute node, we need to provide some dummy PG_DATADIR. Relation pages
can be requested on demand from the page server. But Postgres still needs some non-relational data:
control and configuration files, SLRUs, ...
It is currently implemented using basebackup (do not mix with pg_basebackup) which is created
by pageserver. It includes in this tarball config/control files, SLRUs and required directories.
Since the pageserver does not have the original (non-scattered) WAL segments, it includes in
this tarball a dummy WAL segment which contains only a SHUTDOWN_CHECKPOINT record at the beginning of the segment,
whose redo field points to the end of WAL. This allows loading the checkpoint record in a more or less
standard way with minimal changes of Postgres, but then some special handling is needed,
including restoring previous record position from zenith.signal file.
Also we have to correctly initialize header of last WAL page (pointed by checkpoint.redo)
to pass checks performed by XLogReader.
- Alternatives?
We may not include fake WAL segment in tarball at all and modify xlog.c to load checkpoint record
in special way. But it may only increase number of changes in xlog.c
7. Add redo_read_buffer_filter callback to XLogReadBufferForRedoExtended
- Why?
We need a way in wal-redo Postgres to ignore pages which are not requested by pageserver.
So wal-redo Postgres reconstructs only requested page and for all other returns BLK_DONE
which means that recovery for them is not needed.
- Alternatives?
No
8. Enforce WAL logging of sequence updates.
- Why?
Due to performance reasons Postgres doesn't want to log each fetching of a value from a sequence,
so we pre-log a few fetches in advance. In the event of a crash we can lose
(skip over) as many values as we pre-logged.
But this doesn't work with Zenith, because a page with sequence values can be evicted from the buffer cache
and we will get a gap in sequence values even without a crash.
- Alternatives:
Do not try to preserve sequential order but avoid performance penalty.
9. Treat unlogged tables as normal (permanent) tables.
- Why?
Unlogged tables are not transient, so they have to survive node restart (unlike temporary tables).
But as far as compute node is stateless, we need to persist their data to storage node.
And it can only be done through the WAL.
- Alternatives?
* Store unlogged tables locally (violates requirement of stateless compute nodes).
* Prohibit unlogged tables at all.
10. Support start Postgres in wal-redo mode
- Why?
To be able to apply WAL record and reconstruct pages at page server.
- Alternatives?
* Rewrite redo handlers in Rust
* Do not reconstruct pages at page server at all and do it at compute node.
11. WAL proposer
- Why?
WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes.
It is currently implemented as patch to standard WAL sender.
- Alternatives?
Can be moved to extension if some extra callbacks will be added to wal sender code.
12. Secure Computing BPF API wrapper.
- Why?
Pageserver delegates complex WAL decoding duties to Postgres,
which means that the latter might fall victim to carefully designed
malicious WAL records and start doing harmful things to the system.
To prevent this, it has been decided to limit possible interactions
with the outside world using the Secure Computing BPF mode.
- Alternatives:
* Rewrite redo handlers in Rust.
* Add more checks to guarantee correctness of WAL records.
* Move seccomp.c to extension
* Many other discussed approaches to neutralize incorrect WAL records vulnerabilities.
13. Callbacks for replica feedbacks
- Why?
Allowing the walproposer to interact with walsender code.
- Alternatives
Copy walsender code to walproposer.
14. Support multiple SMGR implementations.
- Why?
Postgres provides abstract API for storage manager but it has only one implementation
and provides no way to replace it with custom storage manager.
- Alternatives?
None.
15. Calculate database size as sum of all database relations.
- Why?
Postgres calculates the database size by traversing the data directory,
but since the Zenith compute node is stateless we cannot do that.
- Alternatives?
Send this request directly to pageserver and calculate real (physical) size
of Zenith representation of database/timeline, rather than sum logical size of all relations.
-----------------------------------------------
Not currently committed but proposed:
1. Disable ring buffer buffer manager strategies
- Why?
Postgres tries to avoid cache flushing by bulk operations (copy, seqscan, vacuum, ...).
Even if there is free space in the buffer cache, pages may be evicted.
Negative effect of it can be somehow compensated by file system cache, but in case of Zenith
cost of requesting page from page server is much higher.
- Alternatives?
Instead of just prohibiting ring buffer we may try to implement more flexible eviction policy,
for example copy evicted page from ring buffer to some other buffer if there is free space
in buffer cache.
2. Disable marking page as dirty when hint bits are set.
- Why?
Postgres has to modify the page twice: first when some tuple is updated and second when
hint bits are set. WAL logging of hint bit updates requires an FPI, which significantly increases the size of the WAL.
- Alternatives?
Add special WAL record for setting page hints.
3. Prefetching
- Why?
As far as pages in Zenith are loaded on demand, to reduce node startup time
and also speed up some massive queries we need some mechanism for bulk loading to
reduce page request round-trip overhead.
Currently Postgres is supporting prefetching only for bitmap scan.
In Zenith we also use prefetch for sequential and index scan. For sequential scan we prefetch
some number of following pages. For index scan we prefetch pages of heap relation addressed by TIDs.
4. Prewarming.
- Why?
Short downtime (or, in other words, fast compute node restart time) is one of the key features of Zenith.
But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow.
We can capture the state of the compute node buffer cache and send a bulk request for these pages at startup.

View File

@@ -26,7 +26,7 @@ A checkpoint record in the WAL marks a point in the WAL sequence at which it is
NOTE: This is an overloaded term.
Whenever enough WAL has been accumulated in memory, the page server []
writes out the changes in memory into new layer files[]. This process
writes out the changes from in-memory layers into new layer files[]. This process
is called "checkpointing". The page server only creates layer files for
relations that have been modified since the last checkpoint.
@@ -41,17 +41,28 @@ Stateless Postgres node that stores data in pageserver.
Each of the separate segmented file sets in which a relation is stored. The main fork is where the actual data resides. There also exist two secondary forks for metadata: the free space map and the visibility map.
Each PostgreSQL fork is considered a separate relish.
### Layer file
### Layer
Each layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
There are two kinds of layers, in-memory and on-disk layers. In-memory
layers are used to ingest incoming WAL, and provide fast access
to the recent page versions. On-disk layers are stored as files on disk, and
are immutable.
### Layer file (on-disk layer)
Layered repository on-disk format is based on immutable files. The
files are called "layer files". Each file corresponds to one 10 MB
files are called "layer files". Each file corresponds to one RELISH_SEG_SIZE
segment of a PostgreSQL relation fork. There are two kinds of layer
files: image files and delta files. An image file contains a
"snapshot" of the segment at a particular LSN, and a delta file
contains WAL records applicable to the segment, in a range of LSNs.
### Layer map
The layer map tracks what layers exist for all the relishes in a timeline.
### Layered repository
Zenith repository implementation that keeps data in layers.
### LSN
@@ -121,7 +132,7 @@ Each SLRU segment is considered a separate relish[].
### Tenant (Multitenancy)
Tenant represents a single customer, interacting with Zenith.
Wal redo[] activity, timelines[], snapshots[] are managed for each tenant independently.
Wal redo[] activity, timelines[], layers[] are managed for each tenant independently.
One pageserver[] can serve multiple tenants at once.
One safekeeper

View File

@@ -37,7 +37,7 @@ On the page server tenants introduce one level of indirection, so data directory
├── de182bc61fb11a5a6b390a8aed3a804a
└── ee6016ec31116c1b7c33dfdfca38891f
```
Wal redo activity, timelines, snapshots are managed for each tenant independently.
Wal redo activity and timelines are managed for each tenant independently.
For the local environment used for example in tests, there is also a new level of indirection for tenants. It touches the `pgdatadirs` directory. Now it contains a `tenants` subdirectory, so the structure looks the following way:

View File

@@ -47,28 +47,45 @@ impl<'a> Basebackup<'a> {
timeline: &'a Arc<dyn Timeline>,
req_lsn: Option<Lsn>,
) -> Basebackup<'a> {
// current_prev may be zero if we are at the start of timeline branched from old lsn
let RecordLsn {
last: lsn,
prev: prev_record_lsn,
} = if let Some(lsn) = req_lsn {
// FIXME: that wouldn't work since we don't know prev for old LSN's.
// Probably it is better to avoid using prev in compute node start
// at all and acept the fact that first WAL record in the timeline would
// have zero as prev. https://github.com/zenithdb/zenith/issues/506
RecordLsn {
last: lsn,
prev: lsn,
last: current_last,
prev: current_prev,
} = timeline.get_last_record_rlsn();
// Compute postgres doesn't have any previous WAL files, but the first record that this
// postgres is going to write need to have LSN of previous record (xl_prev). So we are
// writing prev_lsn to "zenith.signal" file so that postgres can read it during the start.
// In some cases we don't know prev_lsn (branch or basebackup @old_lsn) so pass Lsn(0)
// instead and embrace the wrong xl_prev in this situations.
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
if req_lsn > current_last {
// FIXME: now wait_lsn() is inside of list_nonrels() so we don't have a way
// to get it from there. It is better to wait just here.
(Lsn(0), req_lsn)
} else if req_lsn < current_last {
// we don't know prev already. We don't currently use basebackup@old_lsn
// but may use it for read only replicas in future
(Lsn(0), req_lsn)
} else {
// we are exactly at req_lsn and know prev
(current_prev, req_lsn)
}
} else {
// Atomically get last and prev LSN's
timeline.get_last_record_rlsn()
// None in req_lsn means that we are branching from the latest LSN
(current_prev, current_last)
};
info!(
"taking basebackup lsn={}, prev_lsn={}",
backup_prev, backup_lsn
);
Basebackup {
ar: Builder::new(write),
timeline,
lsn,
prev_record_lsn,
lsn: backup_lsn,
prev_record_lsn: backup_prev,
}
}
@@ -84,10 +101,10 @@ impl<'a> Basebackup<'a> {
for filepath in pg_constants::PGDATA_SPECIAL_FILES.iter() {
if *filepath == "pg_hba.conf" {
let data = pg_constants::PG_HBA.as_bytes();
let header = new_tar_header(&filepath, data.len() as u64)?;
self.ar.append(&header, &data[..])?;
let header = new_tar_header(filepath, data.len() as u64)?;
self.ar.append(&header, data)?;
} else {
let header = new_tar_header(&filepath, 0)?;
let header = new_tar_header(filepath, 0)?;
self.ar.append(&header, &mut io::empty())?;
}
}
@@ -166,14 +183,12 @@ impl<'a> Basebackup<'a> {
self.lsn,
)?;
let path = if spcnode == pg_constants::GLOBALTABLESPACE_OID {
let dst_path = "PG_VERSION";
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
self.ar.append(&header, &version_bytes[..])?;
let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?;
self.ar.append(&header, version_bytes)?;
let dst_path = format!("global/PG_VERSION");
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
self.ar.append(&header, &version_bytes[..])?;
let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?;
self.ar.append(&header, version_bytes)?;
String::from("global/pg_filenode.map") // filenode map for global tablespace
} else {
@@ -188,7 +203,7 @@ impl<'a> Basebackup<'a> {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
self.ar.append(&header, &version_bytes[..])?;
self.ar.append(&header, version_bytes)?;
format!("base/{}/pg_filenode.map", dbnode)
};
@@ -238,7 +253,7 @@ impl<'a> Basebackup<'a> {
XLOG_SIZE_OF_XLOG_LONG_PHD as u32,
pg_constants::WAL_SEGMENT_SIZE,
);
checkpoint.redo = self.lsn.0 + self.lsn.calc_padding(8u32);
checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0;
//reset some fields we don't want to preserve
//TODO Check this.
@@ -251,9 +266,14 @@ impl<'a> Basebackup<'a> {
pg_control.state = pg_constants::DB_SHUTDOWNED;
// add zenith.signal file
let xl_prev = if self.prev_record_lsn == Lsn(0) {
0xBAD0 // magic value to indicate that we don't know prev_lsn
} else {
self.prev_record_lsn.0
};
self.ar.append(
&new_tar_header("zenith.signal", 8)?,
&self.prev_record_lsn.0.to_le_bytes()[..],
&xl_prev.to_le_bytes()[..],
)?;
//send pg_control

View File

@@ -113,7 +113,7 @@ impl CfgFileParams {
.auth_type
.as_ref()
.map_or(Ok(AuthType::Trust), |auth_type| {
AuthType::from_str(&auth_type)
AuthType::from_str(auth_type)
})?;
if !pg_distrib_dir.join("bin/postgres").exists() {
@@ -273,7 +273,7 @@ fn main() -> Result<()> {
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
// Initialize logger
let (_scope_guard, log_file) = logger::init_logging(&conf, "pageserver.log")?;
let (_scope_guard, log_file) = logger::init_logging(conf, "pageserver.log")?;
let _log_guard = slog_stdlog::init()?;
// Note: this `info!(...)` macro comes from `log` crate
@@ -284,7 +284,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
if conf.daemonize {
info!("daemonizing...");
// There should'n be any logging to stdin/stdout. Redirect it to the main log so
// There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
// that we will see any accidental manual fprintf's or backtraces.
let stdout = log_file.try_clone().unwrap();
let stderr = log_file;

View File

@@ -43,7 +43,7 @@ pub struct PointInTime {
pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
// Initialize logger
let (_scope_guard, _log_file) = logger::init_logging(&conf, "pageserver.log")?;
let (_scope_guard, _log_file) = logger::init_logging(conf, "pageserver.log")?;
let _log_guard = slog_stdlog::init()?;
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
@@ -264,15 +264,22 @@ pub(crate) fn create_branch(
}
let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?;
let timeline = repo.get_timeline(startpoint.timelineid)?;
if startpoint.lsn == Lsn(0) {
// Find end of WAL on the old timeline
let end_of_wal = repo
.get_timeline(startpoint.timelineid)?
.get_last_record_lsn();
let end_of_wal = timeline.get_last_record_lsn();
info!("branching at end of WAL: {}", end_of_wal);
startpoint.lsn = end_of_wal;
}
startpoint.lsn = startpoint.lsn.align();
if timeline.get_start_lsn() > startpoint.lsn {
anyhow::bail!(
"invalid startpoint {} for the branch {}: less than timeline start {}",
startpoint.lsn,
branchname,
timeline.get_start_lsn()
);
}
// create a new timeline directory for it
let newtli = create_timeline(conf, Some(startpoint), tenantid)?;
@@ -284,7 +291,7 @@ pub(crate) fn create_branch(
// FIXME: there's a race condition, if you create a branch with the same
// name concurrently.
let data = newtli.to_string();
fs::write(conf.branch_path(&branchname, tenantid), data)?;
fs::write(conf.branch_path(branchname, tenantid), data)?;
Ok(BranchInfo {
name: branchname.to_string(),
@@ -333,21 +340,21 @@ fn parse_point_in_time(
// Check if it's a tag
if lsn.is_none() {
let tagpath = conf.tag_path(name, &tenantid);
let tagpath = conf.tag_path(name, tenantid);
if tagpath.exists() {
let pointstr = fs::read_to_string(tagpath)?;
return parse_point_in_time(conf, &pointstr, &tenantid);
return parse_point_in_time(conf, &pointstr, tenantid);
}
}
// Check if it's a branch
// Check if it's branch @ LSN
let branchpath = conf.branch_path(name, &tenantid);
let branchpath = conf.branch_path(name, tenantid);
if branchpath.exists() {
let pointstr = fs::read_to_string(branchpath)?;
let mut result = parse_point_in_time(conf, &pointstr, &tenantid)?;
let mut result = parse_point_in_time(conf, &pointstr, tenantid)?;
result.lsn = lsn.unwrap_or(Lsn(0));
return Ok(result);
@@ -356,7 +363,7 @@ fn parse_point_in_time(
// Check if it's a timelineid
// Check if it's timelineid @ LSN
if let Ok(timelineid) = ZTimelineId::from_str(name) {
let tlipath = conf.timeline_path(&timelineid, &tenantid);
let tlipath = conf.timeline_path(&timelineid, tenantid);
if tlipath.exists() {
return Ok(PointInTime {
timelineid,

View File

@@ -71,7 +71,7 @@ static TIMEOUT: Duration = Duration::from_secs(60);
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
// would be more appropriate. But a low value forces the code to be exercised more,
// which is good for now to trigger bugs.
static OLDEST_INMEM_DISTANCE: u64 = 16 * 1024 * 1024;
static OLDEST_INMEM_DISTANCE: i128 = 16 * 1024 * 1024;
// Metrics collected on operations on the storage repository.
lazy_static! {
@@ -150,12 +150,24 @@ impl Repository for LayeredRepository {
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()> {
let src_timeline = self.get_timeline(src)?;
let RecordLsn {
last: src_last,
prev: src_prev,
} = src_timeline.get_last_record_rlsn();
// Use src_prev from the source timeline only if we branched at the last record.
let dst_prev = if src_last == start_lsn {
Some(src_prev)
} else {
None
};
// Create the metadata file, noting the ancestor of the new timeline.
// There is initially no data in it, but all the read-calls know to look
// into the ancestor.
let metadata = TimelineMetadata {
disk_consistent_lsn: start_lsn,
prev_record_lsn: Some(src_timeline.get_prev_record_lsn()), // FIXME not atomic with start_lsn
prev_record_lsn: dst_prev,
ancestor_timeline: Some(src),
ancestor_lsn: start_lsn,
};
@@ -246,8 +258,8 @@ impl LayeredRepository {
tenantid: ZTenantId,
) -> LayeredRepository {
LayeredRepository {
tenantid: tenantid,
conf: conf,
tenantid,
conf,
timelines: Mutex::new(HashMap::new()),
walredo_mgr,
}
@@ -675,14 +687,14 @@ impl Timeline for LayeredTimeline {
(relsize - 1) / RELISH_SEG_SIZE
};
// Unlink segments beyond the last remaining segment.
// Drop segments beyond the last remaining segment.
for remove_segno in (last_remain_seg + 1)..=old_last_seg {
let seg = SegmentTag {
rel,
segno: remove_segno,
};
let layer = self.get_layer_for_write(seg, lsn)?;
layer.put_unlink(lsn)?;
layer.drop_segment(lsn)?;
}
// Truncate the last remaining segment to the specified size
@@ -698,8 +710,8 @@ impl Timeline for LayeredTimeline {
Ok(())
}
fn put_unlink(&self, rel: RelishTag, lsn: Lsn) -> Result<()> {
trace!("put_unlink: {} at {}", rel, lsn);
fn drop_relish(&self, rel: RelishTag, lsn: Lsn) -> Result<()> {
trace!("drop_segment: {} at {}", rel, lsn);
if rel.is_blocky() {
let oldsize_opt = self.get_relish_size(rel, self.get_last_record_lsn())?;
@@ -710,25 +722,25 @@ impl Timeline for LayeredTimeline {
(oldsize - 1) / RELISH_SEG_SIZE
};
// Unlink all segments
// Drop all segments of the relish
for remove_segno in 0..=old_last_seg {
let seg = SegmentTag {
rel,
segno: remove_segno,
};
let layer = self.get_layer_for_write(seg, lsn)?;
layer.put_unlink(lsn)?;
layer.drop_segment(lsn)?;
}
} else {
warn!(
"put_unlink called on non-existent relish {} at {}",
"drop_segment called on non-existent relish {} at {}",
rel, lsn
);
}
} else {
let seg = SegmentTag::from_blknum(rel, 0);
let layer = self.get_layer_for_write(seg, lsn)?;
layer.put_unlink(lsn)?;
layer.drop_segment(lsn)?;
}
Ok(())
@@ -782,6 +794,14 @@ impl Timeline for LayeredTimeline {
fn get_last_record_rlsn(&self) -> RecordLsn {
self.last_record_lsn.load()
}
fn get_start_lsn(&self) -> Lsn {
if let Some(ancestor) = self.ancestor_timeline.as_ref() {
ancestor.get_start_lsn()
} else {
self.ancestor_lsn
}
}
}
impl LayeredTimeline {
@@ -902,7 +922,7 @@ impl LayeredTimeline {
while lsn < timeline.ancestor_lsn {
trace!("going into ancestor {} ", timeline.ancestor_lsn);
timeline = &timeline.ancestor_timeline.as_ref().unwrap();
timeline = timeline.ancestor_timeline.as_ref().unwrap();
}
// Now we have the right starting timeline for our search.
@@ -927,7 +947,6 @@ impl LayeredTimeline {
assert!(layer.get_start_lsn() <= lsn);
if layer.is_dropped() && layer.get_end_lsn() <= lsn {
// The segment was unlinked
return Ok(None);
}
@@ -937,7 +956,7 @@ impl LayeredTimeline {
// If not, check if there's a layer on the ancestor timeline
if let Some(ancestor) = &timeline.ancestor_timeline {
lsn = timeline.ancestor_lsn;
timeline = &ancestor.as_ref();
timeline = ancestor.as_ref();
trace!("recursing into ancestor at {}/{}", timeline.timelineid, lsn);
continue;
}
@@ -1052,7 +1071,7 @@ impl LayeredTimeline {
// FIXME: we can deadlock if we call wait_lsn() from WAL receiver. And we actually
// it a lot from there. Only deadlock that I caught was while trying to add wait_lsn()
// in list_rels(). But it makes sense to make all functions in timeline non-waiting;
// assert that arg_lsn <= current_record_lsn; call wait_lsn explicetly where it is
// assert that arg_lsn <= current_record_lsn; call wait_lsn explicitly where it is
// needed (page_service and basebackup); uncomment this check:
// assert_ne!(thread::current().name(), Some("WAL receiver thread"));
@@ -1074,6 +1093,18 @@ impl LayeredTimeline {
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
/// know anything about them here in the repository.
fn checkpoint_internal(&self, force: bool) -> Result<()> {
// Grab lock on the layer map.
//
// TODO: We hold it locked throughout the checkpoint operation. That's bad,
// the checkpointing could take many seconds, and any incoming get_page_at_lsn()
// requests will block.
let mut layers = self.layers.lock().unwrap();
// Bump the generation number in the layer map, so that we can distinguish
// entries inserted after the checkpoint started
let current_generation = layers.increment_generation();
// Read 'last_record_lsn'. That becomes the cutoff LSN for frozen layers.
let RecordLsn {
last: last_record_lsn,
prev: prev_record_lsn,
@@ -1085,13 +1116,6 @@ impl LayeredTimeline {
last_record_lsn
);
// Grab lock on the layer map.
//
// TODO: We hold it locked throughout the checkpoint operation. That's bad,
// the checkpointing could take many seconds, and any incoming get_page_at_lsn()
// requests will block.
let mut layers = self.layers.lock().unwrap();
// Take the in-memory layer with the oldest WAL record. If it's older
// than the threshold, write it out to disk as a new image and delta file.
// Repeat until all remaining in-memory layers are within the threshold.
@@ -1102,14 +1126,26 @@ impl LayeredTimeline {
// check, though. We should also aim at flushing layers that consume
// a lot of memory and/or aren't receiving much updates anymore.
let mut disk_consistent_lsn = last_record_lsn;
while let Some(oldest_layer) = layers.peek_oldest_open() {
// Does this layer need freezing?
while let Some((oldest_layer, oldest_generation)) = layers.peek_oldest_open() {
let oldest_pending_lsn = oldest_layer.get_oldest_pending_lsn();
let distance = last_record_lsn.0 - oldest_pending_lsn.0;
if !force && distance < OLDEST_INMEM_DISTANCE {
// Does this layer need freezing?
//
// Write out all in-memory layers that contain WAL older than OLDEST_INMEM_DISTANCE.
// Or if 'force' is true, write out all of them. If we reach a layer with the same
// generation number, we know that we have cycled through all layers that were open
// when we started. We don't want to process layers inserted after we started, to
// avoid getting into an infinite loop trying to process again entries that we
// inserted ourselves.
let distance = last_record_lsn.widening_sub(oldest_pending_lsn);
if distance < 0
|| (!force && distance < OLDEST_INMEM_DISTANCE)
|| oldest_generation == current_generation
{
info!(
"the oldest layer is now {} which is {} bytes behind last_record_lsn",
oldest_layer.get_seg_tag(),
oldest_layer.filename().display(),
distance
);
disk_consistent_lsn = oldest_pending_lsn;
@@ -1117,7 +1153,7 @@ impl LayeredTimeline {
}
// freeze it
let (new_historics, new_open) = oldest_layer.freeze(last_record_lsn, &self)?;
let (new_historics, new_open) = oldest_layer.freeze(last_record_lsn, self)?;
// replace this layer with the new layers that 'freeze' returned
layers.pop_oldest_open();
@@ -1159,7 +1195,7 @@ impl LayeredTimeline {
let ancestor_timelineid = self.ancestor_timeline.as_ref().map(|x| x.timelineid);
let metadata = TimelineMetadata {
disk_consistent_lsn: disk_consistent_lsn,
disk_consistent_lsn,
prev_record_lsn: ondisk_prev_record_lsn,
ancestor_timeline: ancestor_timelineid,
ancestor_lsn: self.ancestor_lsn,
@@ -1210,7 +1246,7 @@ impl LayeredTimeline {
//
// Determine for each file if it needs to be retained
// FIXME: also scan open in-memory layers. Normally we cannot remove the
// latest layer of any seg, but if it was unlinked it's possible
// latest layer of any seg, but if it was dropped it's possible
let mut layers = self.layers.lock().unwrap();
'outer: for l in layers.iter_historic_layers() {
let seg = l.get_seg_tag();
@@ -1287,18 +1323,14 @@ impl LayeredTimeline {
doomed_layer.delete()?;
layers.remove_historic(&*doomed_layer);
if doomed_layer.is_dropped() {
if doomed_layer.get_seg_tag().rel.is_relation() {
result.ondisk_relfiles_dropped += 1;
} else {
result.ondisk_nonrelfiles_dropped += 1;
}
} else {
if doomed_layer.get_seg_tag().rel.is_relation() {
result.ondisk_relfiles_removed += 1;
} else {
result.ondisk_nonrelfiles_removed += 1;
}
match (
doomed_layer.is_dropped(),
doomed_layer.get_seg_tag().rel.is_relation(),
) {
(true, true) => result.ondisk_relfiles_dropped += 1,
(true, false) => result.ondisk_nonrelfiles_dropped += 1,
(false, true) => result.ondisk_relfiles_removed += 1,
(false, false) => result.ondisk_nonrelfiles_removed += 1,
}
}
@@ -1414,6 +1446,7 @@ impl LayeredTimeline {
trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", data.records.len(), blknum, rel, self.timelineid, request_lsn);
}
let img = self.walredo_mgr.request_redo(
self,
rel,
blknum,
request_lsn,

View File

@@ -6,13 +6,14 @@ which pages they apply to, and accumulates the incoming changes in
memory. Every now and then, the accumulated changes are written out to
new files.
The files are called "snapshot files". Each snapshot file corresponds
to one 10 MB slice of a PostgreSQL relation fork. The snapshot files
The files are called "layer files". Each layer file corresponds
to one RELISH_SEG_SIZE slice of a PostgreSQL relation fork or
non-rel file in a range of LSNs. The layer files
for each timeline are stored in the timeline's subdirectory under
.zenith/tenants/<tenantid>/timelines.
There are two kind of snapshot file: base images, and deltas. A base
image file contains a snapshot of a segment as it was at one LSN,
There are two kinds of layer file: base images, and deltas. A base
image file contains an image of a segment as it was at one LSN,
whereas a delta file contains modifications to a segment - mostly in
the form of WAL records - in a range of LSN
@@ -44,7 +45,7 @@ managed, except that the first part of file names is different.
Internally, the relations and non-relation files that are managed in
the versioned store are together called "relishes".
If a file has been dropped, the last snapshot file for it is created
If a file has been dropped, the last layer file for it is created
with the _DROPPED suffix, e.g.
rel_1663_13990_2609_0_10_000000000169C348_0000000001702000_DROPPED
@@ -67,7 +68,7 @@ for 'orders' table on 'main' branch is represented like this:
main/orders_100_200
# Creating snapshot files
# Creating layer files
Let's start with a simple example with a system that contains one
branch called 'main' and two tables, 'orders' and 'customers'. The end
@@ -86,10 +87,10 @@ end of WAL at 250 are kept in memory. If the page server crashes, the
latest records between 200-250 need to be re-read from the WAL.
Whenever enough WAL has been accumulated in memory, the page server
writes out the changes in memory into new snapshot files. This process
writes out the changes in memory into new layer files. This process
is called "checkpointing" (not to be confused with the PostgreSQL
checkpoints, that's a different thing). The page server only creates
snapshot files for relations that have been modified since the last
layer files for relations that have been modified since the last
checkpoint. For example, if the current end of WAL is at LSN 450, and
the last checkpoint happened at LSN 400 but there hasn't been any
recent changes to 'customers' table, you would have these files on
@@ -108,7 +109,7 @@ disk:
If the customers table is modified later, a new file is created for it
at the next checkpoint. The new file will cover the "gap" from the
last snapshot file, so the LSN ranges are always contiguous:
last layer file, so the LSN ranges are always contiguous:
main/orders_100
main/orders_100_200
@@ -130,13 +131,13 @@ page server needs to reconstruct the requested page, as it was at the
requested LSN. To do that, the page server first checks the recent
in-memory layer; if the requested page version is found there, it can
be returned immediately without looking at the files on
disk. Otherwise the page server needs to locate the snapshot file that
disk. Otherwise the page server needs to locate the layer file that
contains the requested page version.
For example, if a request comes in for table 'orders' at LSN 250, the
page server would load the 'main/orders_200_300' file into memory, and
reconstruct and return the requested page from it, as it was at
LSN 250. Because the snapshot file consists of a full image of the
LSN 250. Because the layer file consists of a full image of the
relation at the start LSN and the WAL, reconstructing the page
involves replaying any WAL records applicable to the page between LSNs
200-250, starting from the base image at LSN 200.
@@ -171,7 +172,7 @@ Then, the 'orders' table is updated differently on the 'main' and
Because the 'customers' table hasn't been modified on the child
branch, there is no file for it there. If you request a page for it on
the 'child' branch, the page server will not find any snapshot file
the 'child' branch, the page server will not find any layer file
for it in the 'child' directory, so it will recurse to look into the
parent 'main' branch instead.
@@ -217,7 +218,7 @@ branch at a historic LSN, is how we support PITR in Zenith.
# Garbage collection
In this scheme, we keep creating new snapshot files over time. We also
In this scheme, we keep creating new layer files over time. We also
need a mechanism to remove old files that are no longer needed,
because disk space isn't infinite.
@@ -245,7 +246,7 @@ of the branch is LSN 525, so that the GC horizon is currently at
main/customers_200
We can remove the following files because the end LSNs of those files are
older than GC horizon 375, and there are more recent snapshot files for the
older than GC horizon 375, and there are more recent layer files for the
table:
main/orders_100 DELETE
@@ -262,7 +263,7 @@ table:
main/customers_200 KEEP, NO NEWER VERSION
'main/customers_100_200' is old enough, but it cannot be
removed because there is no newer snapshot file for the table.
removed because there is no newer layer file for the table.
Things get slightly more complicated with multiple branches. All of
the above still holds, but in addition to recent files we must also
@@ -308,7 +309,7 @@ new base image and delta file for it on the child:
After this, the 'main/orders_100' and 'main/orders_100_200' file could
be removed. It is no longer needed by the child branch, because there
is a newer snapshot file there. TODO: This optimization hasn't been
is a newer layer file there. TODO: This optimization hasn't been
implemented! The GC algorithm will currently keep the file on the
'main' branch anyway, for as long as the child branch exists.
@@ -346,7 +347,7 @@ It would also be OK to have overlapping LSN ranges for the same relation:
main/orders_300_400
main/orders_400
The code that reads the snapshot files should cope with this, but this
The code that reads the layer files should cope with this, but this
situation doesn't arise either, because the checkpointing code never
does that. It could be useful, however, as a transient state when
garbage collecting around branch points, or explicit recovery
@@ -360,6 +361,6 @@ points. For example, if we start with this:
And there is a branch or explicit recovery point at LSN 150, we could
replace 'main/orders_100_200' with 'main/orders_150' to keep a
snapshot only at that exact point that's still needed, removing the
layer only at that exact point that's still needed, removing the
other page versions around it. But such compaction has not been
implemented yet.

View File

@@ -130,23 +130,23 @@ pub struct DeltaLayerInner {
impl Layer for DeltaLayer {
fn get_timeline_id(&self) -> ZTimelineId {
return self.timelineid;
self.timelineid
}
fn get_seg_tag(&self) -> SegmentTag {
return self.seg;
self.seg
}
fn is_dropped(&self) -> bool {
return self.dropped;
self.dropped
}
fn get_start_lsn(&self) -> Lsn {
return self.start_lsn;
self.start_lsn
}
fn get_end_lsn(&self) -> Lsn {
return self.end_lsn;
self.end_lsn
}
fn filename(&self) -> PathBuf {
@@ -174,7 +174,7 @@ impl Layer for DeltaLayer {
{
// Open the file and lock the metadata in memory
// TODO: avoid opening the snapshot file for each read
// TODO: avoid opening the file for each read
let (_path, book) = self.open_book()?;
let page_version_reader = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
let inner = self.load()?;
@@ -285,8 +285,8 @@ impl Layer for DeltaLayer {
/// debugging function to print out the contents of the layer
fn dump(&self) -> Result<()> {
println!(
"----- delta layer for {} {}-{} ----",
self.seg, self.start_lsn, self.end_lsn
"----- delta layer for tli {} seg {} {}-{} ----",
self.timelineid, self.seg, self.start_lsn, self.end_lsn
);
println!("--- relsizes ---");
@@ -358,6 +358,7 @@ impl DeltaLayer {
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
/// expedient.
#[allow(clippy::too_many_arguments)]
pub fn create(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
@@ -372,16 +373,16 @@ impl DeltaLayer {
) -> Result<DeltaLayer> {
let delta_layer = DeltaLayer {
path_or_conf: PathOrConf::Conf(conf),
timelineid: timelineid,
tenantid: tenantid,
seg: seg,
start_lsn: start_lsn,
timelineid,
tenantid,
seg,
start_lsn,
end_lsn,
dropped,
inner: Mutex::new(DeltaLayerInner {
loaded: true,
page_version_metas: BTreeMap::new(),
relsizes: relsizes,
relsizes,
}),
predecessor,
};

View File

@@ -111,8 +111,10 @@ impl DeltaFileName {
dropped,
})
}
}
fn to_string(&self) -> String {
impl fmt::Display for DeltaFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let basename = match self.seg.rel {
RelishTag::Relation(reltag) => format!(
"rel_{}_{}_{}_{}",
@@ -134,11 +136,12 @@ impl DeltaFileName {
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
}
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
RelishTag::Checkpoint => format!("pg_control_checkpoint"),
RelishTag::ControlFile => format!("pg_control"),
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
RelishTag::ControlFile => "pg_control".to_string(),
};
format!(
write!(
f,
"{}_{}_{:016X}_{:016X}{}",
basename,
self.seg.segno,
@@ -149,12 +152,6 @@ impl DeltaFileName {
}
}
impl fmt::Display for DeltaFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.to_string())
}
}
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct ImageFileName {
pub seg: SegmentTag,
@@ -233,8 +230,10 @@ impl ImageFileName {
Some(ImageFileName { seg, lsn })
}
}
fn to_string(&self) -> String {
impl fmt::Display for ImageFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let basename = match self.seg.rel {
RelishTag::Relation(reltag) => format!(
"rel_{}_{}_{}_{}",
@@ -256,11 +255,12 @@ impl ImageFileName {
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
}
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
RelishTag::Checkpoint => format!("pg_control_checkpoint"),
RelishTag::ControlFile => format!("pg_control"),
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
RelishTag::ControlFile => "pg_control".to_string(),
};
format!(
write!(
f,
"{}_{}_{:016X}",
basename,
self.seg.segno,
@@ -269,12 +269,6 @@ impl ImageFileName {
}
}
impl fmt::Display for ImageFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.to_string())
}
}
/// Scan timeline directory and create ImageFileName and DeltaFilename
/// structs representing all files on disk
///
@@ -302,7 +296,7 @@ pub fn list_files(
warn!("unrecognized filename in timeline dir: {}", fname);
}
}
return Ok((imgfiles, deltafiles));
Ok((imgfiles, deltafiles))
}
/// Helper enum to hold a PageServerConf, or a path

View File

@@ -2,8 +2,9 @@
//! It is stored in a file on disk.
//!
//! On disk, the image files are stored in timelines/<timelineid> directory.
//! Currently, there are no subdirectories, and each snapshot file is named like this:
//! Currently, there are no subdirectories, and each image layer file is named like this:
//!
//! Note that segno is
//! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<LSN>
//!
//! For example:
@@ -15,10 +16,10 @@
//! Only metadata is loaded into memory by the load function.
//! When images are needed, they are read directly from disk.
//!
//! For blocky segments, the images are stored in BLOCKY_IMAGES_CHAPTER.
//! For blocky relishes, the images are stored in BLOCKY_IMAGES_CHAPTER.
//! All the images are required to be BLOCK_SIZE, which allows for random access.
//!
//! For non-blocky segments, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
//!
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
@@ -97,23 +98,23 @@ impl Layer for ImageLayer {
}
fn get_timeline_id(&self) -> ZTimelineId {
return self.timelineid;
self.timelineid
}
fn get_seg_tag(&self) -> SegmentTag {
return self.seg;
self.seg
}
fn is_dropped(&self) -> bool {
return false;
false
}
fn get_start_lsn(&self) -> Lsn {
return self.lsn;
self.lsn
}
fn get_end_lsn(&self) -> Lsn {
return self.lsn;
self.lsn
}
/// Look up given page in the file
@@ -192,7 +193,10 @@ impl Layer for ImageLayer {
/// debugging function to print out the contents of the layer
fn dump(&self) -> Result<()> {
println!("----- image layer for {} at {} ----", self.seg, self.lsn);
println!(
"----- image layer for tli {} seg {} at {} ----",
self.timelineid, self.seg, self.lsn
);
let inner = self.load()?;
@@ -255,10 +259,10 @@ impl ImageLayer {
let layer = ImageLayer {
path_or_conf: PathOrConf::Conf(conf),
timelineid: timelineid,
tenantid: tenantid,
seg: seg,
lsn: lsn,
timelineid,
tenantid,
seg,
lsn,
inner: Mutex::new(ImageLayerInner {
loaded: true,
image_type: image_type.clone(),

View File

@@ -7,6 +7,7 @@ use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE,
};
use crate::layered_repository::LayeredTimeline;
use crate::layered_repository::ZERO_PAGE;
use crate::layered_repository::{DeltaLayer, ImageLayer};
use crate::repository::WALRecord;
use crate::PageServerConf;
@@ -14,6 +15,7 @@ use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, Result};
use bytes::Bytes;
use log::*;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::ops::Bound::Included;
use std::path::PathBuf;
@@ -93,8 +95,8 @@ impl Layer for InMemoryLayer {
let delta_filename = DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: end_lsn,
dropped: dropped,
end_lsn,
dropped,
}
.to_string();
@@ -102,15 +104,15 @@ impl Layer for InMemoryLayer {
}
fn get_timeline_id(&self) -> ZTimelineId {
return self.timelineid;
self.timelineid
}
fn get_seg_tag(&self) -> SegmentTag {
return self.seg;
self.seg
}
fn get_start_lsn(&self) -> Lsn {
return self.start_lsn;
self.start_lsn
}
fn get_end_lsn(&self) -> Lsn {
@@ -239,21 +241,32 @@ impl Layer for InMemoryLayer {
.unwrap_or_default();
println!(
"----- in-memory layer for {} {}-{} ----",
self.seg, self.start_lsn, end_str
"----- in-memory layer for tli {} seg {} {}-{} ----",
self.timelineid, self.seg, self.start_lsn, end_str
);
for (k, v) in inner.segsizes.iter() {
println!("{}: {}", k, v);
println!("segsizes {}: {}", k, v);
}
for (k, v) in inner.page_versions.iter() {
println!(
"blk {} at {}: {}/{}\n",
k.0,
k.1,
v.page_image.is_some(),
v.record.is_some()
);
}
//for (k, v) in inner.page_versions.iter() {
// println!("blk {} at {}: {}/{}", k.0, k.1, v.page_image.is_some(), v.record.is_some());
//}
Ok(())
}
}
// Type alias to simplify InMemoryLayer::freeze signature
//
type SuccessorLayers = (Vec<Arc<dyn Layer>>, Option<Arc<InMemoryLayer>>);
impl InMemoryLayer {
/// Return the oldest page version that's stored in this layer
pub fn get_oldest_pending_lsn(&self) -> Lsn {
@@ -359,6 +372,36 @@ impl InMemoryLayer {
newsize,
lsn
);
// If we are extending the relation by more than one page, initialize the "gap"
// with zeros
//
// XXX: What if the caller initializes the gap with subsequent call with same LSN?
// I don't think that can happen currently, but that is highly dependent on how
// PostgreSQL writes its WAL records and there's no guarantee of it. If it does
// happen, we would hit the "page version already exists" warning above on the
// subsequent call to initialize the gap page.
let gapstart = self.seg.segno * RELISH_SEG_SIZE + oldsize;
for gapblknum in gapstart..blknum {
let zeropv = PageVersion {
page_image: Some(ZERO_PAGE.clone()),
record: None,
};
println!(
"filling gap blk {} with zeros for write of {}",
gapblknum, blknum
);
let old = inner.page_versions.insert((gapblknum, lsn), zeropv);
// We already had an entry for this LSN. That's odd..
if old.is_some() {
warn!(
"Page version of rel {} blk {} at {} already exists",
self.seg.rel, blknum, lsn
);
}
}
inner.segsizes.insert(lsn, newsize);
}
}
@@ -380,7 +423,7 @@ impl InMemoryLayer {
}
/// Remember that the segment was dropped at given LSN
pub fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()> {
pub fn drop_segment(&self, lsn: Lsn) -> anyhow::Result<()> {
let mut inner = self.inner.lock().unwrap();
assert!(inner.drop_lsn.is_none());
@@ -429,14 +472,14 @@ impl InMemoryLayer {
inner: Mutex::new(InMemoryLayerInner {
drop_lsn: None,
page_versions: BTreeMap::new(),
segsizes: segsizes,
segsizes,
}),
predecessor: Some(src),
})
}
///
/// Write the this in-memory layer to disk, as a snapshot layer.
/// Write this in-memory layer to disk.
///
/// The cutoff point for the layer that's written to disk is 'end_lsn'.
///
@@ -454,7 +497,7 @@ impl InMemoryLayer {
cutoff_lsn: Lsn,
// This is needed just to call materialize_page()
timeline: &LayeredTimeline,
) -> Result<(Vec<Arc<dyn Layer>>, Option<Arc<InMemoryLayer>>)> {
) -> Result<SuccessorLayers> {
info!(
"freezing in memory layer for {} on timeline {} at {}",
self.seg, self.timelineid, cutoff_lsn
@@ -494,13 +537,17 @@ impl InMemoryLayer {
before_page_versions = BTreeMap::new();
after_page_versions = BTreeMap::new();
for ((blknum, lsn), pv) in inner.page_versions.iter() {
if *lsn == end_lsn {
// Page versions at the cutoff LSN will be stored in the
// materialized image layer.
} else if *lsn > end_lsn {
after_page_versions.insert((*blknum, *lsn), pv.clone());
} else {
before_page_versions.insert((*blknum, *lsn), pv.clone());
match lsn.cmp(&end_lsn) {
Ordering::Less => {
before_page_versions.insert((*blknum, *lsn), pv.clone());
}
Ordering::Equal => {
// Page versions at the cutoff LSN will be stored in the
// materialized image layer.
}
Ordering::Greater => {
after_page_versions.insert((*blknum, *lsn), pv.clone());
}
}
}
} else {
@@ -572,30 +619,4 @@ impl InMemoryLayer {
Ok((frozen_layers, new_open_rc))
}
/// debugging function to print out the contents of the layer
#[allow(unused)]
pub fn dump(&self) -> String {
let mut result = format!(
"----- inmemory layer for {} {}-> ----\n",
self.seg, self.start_lsn
);
let inner = self.inner.lock().unwrap();
for (k, v) in inner.segsizes.iter() {
result += &format!("{}: {}\n", k, v);
}
for (k, v) in inner.page_versions.iter() {
result += &format!(
"blk {} at {}: {}/{}\n",
k.0,
k.1,
v.page_image.is_some(),
v.record.is_some()
);
}
result
}
}

View File

@@ -1,5 +1,5 @@
//!
//! The layer map tracks what layers exist for all the relations in a timeline.
//! The layer map tracks what layers exist for all the relishes in a timeline.
//!
//! When the timeline is first accessed, the server lists of all layer files
//! in the timelines/<timelineid> directory, and populates this map with
@@ -43,6 +43,10 @@ pub struct LayerMap {
/// This allows easy access to the in-memory layer that contains the
/// oldest WAL record.
open_segs: BinaryHeap<OpenSegEntry>,
/// Generation number, used to distinguish newly inserted entries in the
/// binary heap from older entries during checkpoint.
current_generation: u64,
}
///
@@ -59,9 +63,13 @@ struct SegEntry {
/// Entry held LayerMap.open_segs, with boilerplate comparison
/// routines to implement a min-heap ordered by 'oldest_pending_lsn'
///
/// Each entry also carries a generation number. It can be used to distinguish
/// entries with the same 'oldest_pending_lsn'.
struct OpenSegEntry {
pub oldest_pending_lsn: Lsn,
pub layer: Arc<InMemoryLayer>,
pub generation: u64,
}
impl Ord for OpenSegEntry {
fn cmp(&self, other: &Self) -> Ordering {
@@ -73,10 +81,13 @@ impl Ord for OpenSegEntry {
impl PartialOrd for OpenSegEntry {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
// to get that.
other
.oldest_pending_lsn
.partial_cmp(&self.oldest_pending_lsn)
// to get that. Entries with identical oldest_pending_lsn are ordered by generation
Some(
other
.oldest_pending_lsn
.cmp(&self.oldest_pending_lsn)
.then_with(|| other.generation.cmp(&self.generation)),
)
}
}
impl PartialEq for OpenSegEntry {
@@ -98,7 +109,7 @@ impl LayerMap {
if let Some(open) = &segentry.open {
if open.get_start_lsn() <= lsn {
let x: Arc<dyn Layer> = Arc::clone(&open) as _;
let x: Arc<dyn Layer> = Arc::clone(open) as _;
return Some(x);
}
}
@@ -108,7 +119,7 @@ impl LayerMap {
.range((Included(Lsn(0)), Included(lsn)))
.next_back()
{
let x: Arc<dyn Layer> = Arc::clone(&v) as _;
let x: Arc<dyn Layer> = Arc::clone(v) as _;
Some(x)
} else {
None
@@ -121,12 +132,7 @@ impl LayerMap {
///
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
let segentry = self.segs.get(tag)?;
if let Some(open) = &segentry.open {
Some(Arc::clone(open))
} else {
None
}
segentry.open.as_ref().map(Arc::clone)
}
///
@@ -150,7 +156,8 @@ impl LayerMap {
let opensegentry = OpenSegEntry {
oldest_pending_lsn: layer.get_oldest_pending_lsn(),
layer: layer,
layer,
generation: self.current_generation,
};
self.open_segs.push(opensegentry);
@@ -259,15 +266,16 @@ impl LayerMap {
/// Is there a newer image layer for given segment?
///
/// This is used for garbage collection, to determine if an old layer can
/// be deleted. We ignore in-memory layers because they are not durable
/// on disk, and delta layers because they depend on an older layer.
/// be deleted.
pub fn newer_image_layer_exists(&self, seg: SegmentTag, lsn: Lsn) -> bool {
if let Some(segentry) = self.segs.get(&seg) {
// We only check on-disk layers, because
// in-memory layers are not durable
for (newer_lsn, layer) in segentry
.historic
.range((Included(lsn), Included(Lsn(u64::MAX))))
{
// Ignore delta layers.
// Ignore layers that depend on an older layer.
if layer.is_incremental() {
continue;
}
@@ -290,21 +298,46 @@ impl LayerMap {
false
}
/// Return the oldest in-memory layer.
pub fn peek_oldest_open(&self) -> Option<Arc<InMemoryLayer>> {
/// Return the oldest in-memory layer, along with its generation number.
pub fn peek_oldest_open(&self) -> Option<(Arc<InMemoryLayer>, u64)> {
if let Some(opensegentry) = self.open_segs.peek() {
Some(Arc::clone(&opensegentry.layer))
Some((Arc::clone(&opensegentry.layer), opensegentry.generation))
} else {
None
}
}
/// Increment the generation number used to stamp open in-memory layers. Layers
/// added with `insert_open` after this call will be associated with the new
/// generation. Returns the new generation number.
pub fn increment_generation(&mut self) -> u64 {
self.current_generation += 1;
self.current_generation
}
pub fn iter_historic_layers(&self) -> HistoricLayerIter {
HistoricLayerIter {
segiter: self.segs.iter(),
iter: None,
}
}
/// debugging function to print out the contents of the layer map
#[allow(unused)]
pub fn dump(&self) -> Result<()> {
println!("Begin dump LayerMap");
for (seg, segentry) in self.segs.iter() {
if let Some(open) = &segentry.open {
open.dump()?;
}
for (_, layer) in segentry.historic.iter() {
layer.dump()?;
}
}
println!("End dump LayerMap");
Ok(())
}
}
impl Default for LayerMap {
@@ -312,6 +345,7 @@ impl Default for LayerMap {
LayerMap {
segs: HashMap::new(),
open_segs: BinaryHeap::new(),
current_generation: 0,
}
}
}

View File

@@ -97,24 +97,32 @@ pub enum PageReconstructResult {
}
///
/// A Layer holds all page versions for one segment of a relish, in a range of LSNs.
/// There are two kinds of layers, in-memory and snapshot layers. In-memory
/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access
/// to the recent page versions. Snaphot layers are stored on disk, and
/// to the recent page versions. On-disk layers are stored as files on disk, and
/// are immutable. This trait presents the common functionality of
/// in-memory and snapshot layers.
///
/// Each layer contains a full snapshot of the segment at the start
/// LSN. In addition to that, it contains WAL (or more page images)
/// needed to recontruct any page version up to the end LSN.
/// in-memory and on-disk layers.
///
pub trait Layer: Send + Sync {
// These functions identify the relish segment and the LSN range
// that this Layer holds.
/// Identify the timeline this relish belongs to
fn get_timeline_id(&self) -> ZTimelineId;
/// Identify the relish segment
fn get_seg_tag(&self) -> SegmentTag;
/// Inclusive start bound of the LSN range that this layer hold
fn get_start_lsn(&self) -> Lsn;
/// 'end_lsn' meaning depends on the layer kind:
/// - in-memory layer is either unbounded (end_lsn = MAX_LSN) or dropped (end_lsn = drop_lsn)
/// - image layer represents snapshot at one LSN, so end_lsn = lsn
/// - delta layer has end_lsn
///
/// TODO Is end_lsn always exclusive for all layer kinds?
fn get_end_lsn(&self) -> Lsn;
/// Is the segment represented by this layer dropped by PostgreSQL?
fn is_dropped(&self) -> bool;
/// Filename used to store this layer on disk. (Even in-memory layers

View File

@@ -346,7 +346,7 @@ impl PageServerHandler {
pgb.write_message(&BeMessage::CopyOutResponse)?;
info!("sent CopyOut");
/* Send a tarball of the latest snapshot on the timeline */
/* Send a tarball of the latest layer on the timeline */
{
let mut writer = CopyDataSink { pgb };
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn);
@@ -372,7 +372,7 @@ impl PageServerHandler {
.claims
.as_ref()
.expect("claims presence already checked");
Ok(auth::check_permission(claims, tenantid)?)
auth::check_permission(claims, tenantid)
}
}
@@ -389,7 +389,7 @@ impl postgres_backend::Handler for PageServerHandler {
.as_ref()
.as_ref()
.unwrap()
.decode(&str::from_utf8(jwt_response)?)?;
.decode(str::from_utf8(jwt_response)?)?;
if matches!(data.claims.scope, Scope::Tenant) {
ensure!(
@@ -425,7 +425,7 @@ impl postgres_backend::Handler for PageServerHandler {
self.handle_controlfile(pgb)?;
} else if query_string.starts_with("pagestream ") {
let (_, params_raw) = query_string.split_at("pagestream ".len());
let params = params_raw.split(" ").collect::<Vec<_>>();
let params = params_raw.split(' ').collect::<Vec<_>>();
ensure!(
params.len() == 2,
"invalid param number for pagestream command"
@@ -484,7 +484,7 @@ impl postgres_backend::Handler for PageServerHandler {
.get_timeline(timelineid)
.context(format!("error fetching timeline {}", timelineid))?;
walreceiver::launch_wal_receiver(&self.conf, timelineid, &connstr, tenantid.to_owned());
walreceiver::launch_wal_receiver(self.conf, timelineid, &connstr, tenantid.to_owned());
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("branch_create ") {
@@ -492,10 +492,10 @@ impl postgres_backend::Handler for PageServerHandler {
// branch_create <tenantid> <branchname> <startpoint>
// TODO lazy static
// TOOD: escaping, to allow branch names with spaces
// TODO: escaping, to allow branch names with spaces
let re = Regex::new(r"^branch_create ([[:xdigit:]]+) (\S+) ([^\r\n\s;]+)[\r\n\s;]*;?$")
.unwrap();
let caps = re.captures(&query_string).ok_or_else(err)?;
let caps = re.captures(query_string).ok_or_else(err)?;
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
let branchname = caps.get(2).ok_or_else(err)?.as_str().to_owned();
@@ -504,7 +504,7 @@ impl postgres_backend::Handler for PageServerHandler {
self.check_permission(Some(tenantid))?;
let branch =
branches::create_branch(&self.conf, &branchname, &startpoint_str, &tenantid)?;
branches::create_branch(self.conf, &branchname, &startpoint_str, &tenantid)?;
let branch = serde_json::to_vec(&branch)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
@@ -519,14 +519,14 @@ impl postgres_backend::Handler for PageServerHandler {
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
let branches = crate::branches::get_branches(&self.conf, &tenantid)?;
let branches = crate::branches::get_branches(self.conf, &tenantid)?;
let branches_buf = serde_json::to_vec(&branches)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::DataRow(&[Some(&branches_buf)]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("tenant_list") {
let tenants = crate::branches::get_tenants(&self.conf)?;
let tenants = crate::branches::get_tenants(self.conf)?;
let tenants_buf = serde_json::to_vec(&tenants)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
@@ -537,13 +537,13 @@ impl postgres_backend::Handler for PageServerHandler {
// tenant_create <tenantid>
let re = Regex::new(r"^tenant_create ([[:xdigit:]]+)$").unwrap();
let caps = re.captures(&query_string).ok_or_else(err)?;
let caps = re.captures(query_string).ok_or_else(err)?;
self.check_permission(None)?;
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
tenant_mgr::create_repository_for_tenant(&self.conf, tenantid)?;
tenant_mgr::create_repository_for_tenant(self.conf, tenantid)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
@@ -582,54 +582,54 @@ impl postgres_backend::Handler for PageServerHandler {
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"snapshot_relfiles_total"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_relfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_relfiles_removed"),
RowDescriptor::int8_col(b"snapshot_relfiles_dropped"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_total"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_removed"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"layer_relfiles_total"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
RowDescriptor::int8_col(b"layer_relfiles_removed"),
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"elapsed"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(&result.ondisk_relfiles_total.to_string().as_bytes()),
Some(result.ondisk_relfiles_total.to_string().as_bytes()),
Some(
&result
result
.ondisk_relfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
&result
result
.ondisk_relfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(&result.ondisk_relfiles_not_updated.to_string().as_bytes()),
Some(&result.ondisk_relfiles_removed.to_string().as_bytes()),
Some(&result.ondisk_relfiles_dropped.to_string().as_bytes()),
Some(&result.ondisk_nonrelfiles_total.to_string().as_bytes()),
Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
Some(
&result
result
.ondisk_nonrelfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
&result
result
.ondisk_nonrelfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(&result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
Some(&result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
Some(&result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
Some(&result.elapsed.as_millis().to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
Some(result.elapsed.as_millis().to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {

View File

@@ -125,11 +125,7 @@ impl RelishTag {
// convenience function to check if this relish is a normal relation.
pub const fn is_relation(&self) -> bool {
if let RelishTag::Relation(_) = self {
true
} else {
false
}
matches!(self, RelishTag::Relation(_))
}
}

View File

@@ -113,7 +113,7 @@ pub trait Timeline: Send + Sync {
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>>;
/// Get a list of non-relational objects
fn list_nonrels<'a>(&'a self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
//------------------------------------------------------------------------------
// Public PUT functions, to update the repository with new page versions.
@@ -133,9 +133,8 @@ pub trait Timeline: Send + Sync {
/// Truncate relation
fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>;
/// Unlink relish.
/// This method is used for marking dropped relations and truncated SLRU segments
fn put_unlink(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
/// This method is used for marking dropped relations and truncated SLRU files
fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
/// Track end of the latest digested WAL record.
///
@@ -147,6 +146,7 @@ pub trait Timeline: Send + Sync {
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
fn get_last_record_lsn(&self) -> Lsn;
fn get_prev_record_lsn(&self) -> Lsn;
fn get_start_lsn(&self) -> Lsn;
///
/// Flush to disk all data that was written with the put_* functions
@@ -201,6 +201,7 @@ impl WALRecord {
///
/// Tests that should work the same with any Repository/Timeline implementation.
///
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {
use super::*;
@@ -357,6 +358,37 @@ mod tests {
TEST_IMG("foo blk 2 at 5")
);
// Truncate to zero length
tline.put_truncation(TESTREL_A, Lsn(0x60), 0)?;
tline.advance_last_record_lsn(Lsn(0x60));
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 0);
// Extend from 0 to 2 blocks, leaving a gap
tline.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
tline.advance_last_record_lsn(Lsn(0x70));
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2);
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE);
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x70))?,
TEST_IMG("foo blk 1")
);
// Extend a lot more, leaving a big gap that spans across segments
// FIXME: This is currently broken, see https://github.com/zenithdb/zenith/issues/500
/*
tline.put_page_image(TESTREL_A, 1500, Lsn(0x80), TEST_IMG("foo blk 1500"))?;
tline.advance_last_record_lsn(Lsn(0x80));
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x80))?.unwrap(), 1501);
for blk in 2..1500 {
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, blk, Lsn(0x80))?,
ZERO_PAGE);
}
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, 1500, Lsn(0x80))?,
TEST_IMG("foo blk 1500"));
*/
Ok(())
}
@@ -475,6 +507,7 @@ mod tests {
impl WalRedoManager for TestRedoManager {
fn request_redo(
&self,
timeline: &dyn Timeline,
rel: RelishTag,
blknum: u32,
lsn: Lsn,

View File

@@ -29,7 +29,7 @@ use zenith_utils::lsn::Lsn;
const MAX_MBR_BLKNO: u32 =
pg_constants::MAX_MULTIXACT_ID / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
const ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
///
/// Import all relation data pages from local disk into the repository.
@@ -45,7 +45,6 @@ pub fn import_timeline_from_postgres_datadir(
match direntry.file_name().to_str() {
None => continue,
// These special files appear in the snapshot, but are not needed by the page server
Some("pg_control") => {
import_nonrel_file(timeline, lsn, RelishTag::ControlFile, &direntry.path())?;
// Extract checkpoint record from pg_control and store is as separate object
@@ -93,7 +92,6 @@ pub fn import_timeline_from_postgres_datadir(
match direntry.file_name().to_str() {
None => continue,
// These special files appear in the snapshot, but are not needed by the page server
Some("PG_VERSION") => continue,
Some("pg_filenode.map") => import_nonrel_file(
timeline,
@@ -130,7 +128,7 @@ pub fn import_timeline_from_postgres_datadir(
}
for entry in fs::read_dir(path.join("pg_twophase"))? {
let entry = entry?;
let xid = u32::from_str_radix(&entry.path().to_str().unwrap(), 16)?;
let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
import_nonrel_file(timeline, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
}
// TODO: Scan pg_tblspc
@@ -153,7 +151,7 @@ fn import_relfile(
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
if let Err(e) = p {
warn!("unrecognized file in snapshot: {:?} ({})", path, e);
warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
return Err(e.into());
}
let (relnode, forknum, segno) = p.unwrap();
@@ -397,15 +395,15 @@ pub fn save_decoded_record(
for tablespace_id in dropdb.tablespace_ids {
let rels = timeline.list_rels(tablespace_id, dropdb.db_id, lsn)?;
for rel in rels {
timeline.put_unlink(RelishTag::Relation(rel), lsn)?;
timeline.drop_relish(RelishTag::Relation(rel), lsn)?;
}
trace!(
"Unlink FileNodeMap {}, {} at lsn {}",
"Drop FileNodeMap {}, {} at lsn {}",
tablespace_id,
dropdb.db_id,
lsn
);
timeline.put_unlink(
timeline.drop_relish(
RelishTag::FileNodeMap {
spcnode: tablespace_id,
dbnode: dropdb.db_id,
@@ -429,7 +427,7 @@ pub fn save_decoded_record(
},
rpageno,
lsn,
ZERO_PAGE,
ZERO_PAGE.clone(),
)?;
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
@@ -448,12 +446,12 @@ pub fn save_decoded_record(
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
trace!(
"unlink twophaseFile for xid {} parsed_xact.xid {} here at {}",
"Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
decoded.xl_xid,
parsed_xact.xid,
lsn
);
timeline.put_unlink(
timeline.drop_relish(
RelishTag::TwoPhase {
xid: parsed_xact.xid,
},
@@ -486,7 +484,7 @@ pub fn save_decoded_record(
},
rpageno,
lsn,
ZERO_PAGE,
ZERO_PAGE.clone(),
)?;
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
let pageno = buf.get_u32_le();
@@ -499,7 +497,7 @@ pub fn save_decoded_record(
},
rpageno,
lsn,
ZERO_PAGE,
ZERO_PAGE.clone(),
)?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
@@ -597,19 +595,16 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
// TODO This implementation is very inefficient -
// it scans all non-rels only to find FileNodeMaps
for tag in timeline.list_nonrels(req_lsn)? {
match tag {
RelishTag::FileNodeMap { spcnode, dbnode } => {
if spcnode == src_tablespace_id && dbnode == src_db_id {
let img = timeline.get_page_at_lsn_nowait(tag, 0, req_lsn)?;
let new_tag = RelishTag::FileNodeMap {
spcnode: tablespace_id,
dbnode: db_id,
};
timeline.put_page_image(new_tag, 0, lsn, img)?;
break;
}
if let RelishTag::FileNodeMap { spcnode, dbnode } = tag {
if spcnode == src_tablespace_id && dbnode == src_db_id {
let img = timeline.get_page_at_lsn_nowait(tag, 0, req_lsn)?;
let new_tag = RelishTag::FileNodeMap {
spcnode: tablespace_id,
dbnode: db_id,
};
timeline.put_page_image(new_tag, 0, lsn, img)?;
break;
}
_ => {} // do nothing
}
}
info!(
@@ -733,7 +728,7 @@ fn save_xact_record(
dbnode: xnode.dbnode,
relnode: xnode.relnode,
};
timeline.put_unlink(RelishTag::Relation(rel), lsn)?;
timeline.drop_relish(RelishTag::Relation(rel), lsn)?;
}
}
Ok(())
@@ -775,7 +770,7 @@ fn save_clog_truncate_record(
return Ok(());
}
// Iterate via SLRU CLOG segments and unlink segments that we're ready to truncate
// Iterate via SLRU CLOG segments and drop segments that we're ready to truncate
// TODO This implementation is very inefficient -
// it scans all non-rels only to find Clog
//
@@ -785,17 +780,14 @@ fn save_clog_truncate_record(
// instead.
let req_lsn = min(timeline.get_last_record_lsn(), lsn);
for obj in timeline.list_nonrels(req_lsn)? {
match obj {
RelishTag::Slru { slru, segno } => {
if slru == SlruKind::Clog {
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
timeline.put_unlink(RelishTag::Slru { slru, segno }, lsn)?;
trace!("unlink CLOG segment {:>04X} at lsn {}", segno, lsn);
}
if let RelishTag::Slru { slru, segno } = obj {
if slru == SlruKind::Clog {
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
timeline.drop_relish(RelishTag::Slru { slru, segno }, lsn)?;
trace!("Drop CLOG segment {:>04X} at lsn {}", segno, lsn);
}
}
_ => {}
}
}
@@ -894,7 +886,7 @@ fn save_multixact_truncate_record(
// Delete all the segments except the last one. The last segment can still
// contain, possibly partially, valid data.
while segment != endsegment {
timeline.put_unlink(
timeline.drop_relish(
RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno: segment as u32,

View File

@@ -197,6 +197,7 @@ impl WalStreamDecoder {
}
#[allow(dead_code)]
#[derive(Default)]
pub struct DecodedBkpBlock {
/* Is this block ref in use? */
//in_use: bool,
@@ -229,25 +230,7 @@ pub struct DecodedBkpBlock {
impl DecodedBkpBlock {
pub fn new() -> DecodedBkpBlock {
DecodedBkpBlock {
rnode_spcnode: 0,
rnode_dbnode: 0,
rnode_relnode: 0,
forknum: 0,
blkno: 0,
flags: 0,
has_image: false,
apply_image: false,
will_init: false,
hole_offset: 0,
hole_length: 0,
bimg_len: 0,
bimg_info: 0,
has_data: false,
data_len: 0,
}
Default::default()
}
}

View File

@@ -164,7 +164,7 @@ fn walreceiver_main(
// There might be some padding after the last full record, skip it.
startpoint += startpoint.calc_padding(8u32);
debug!(
info!(
"last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
last_rec_lsn, startpoint, timelineid, end_of_wal
);
@@ -457,7 +457,7 @@ fn write_wal_file(
{
Ok(mut file) => {
for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
file.write_all(&ZERO_BLOCK)?;
file.write_all(ZERO_BLOCK)?;
}
wal_file = file;
}

View File

@@ -43,7 +43,7 @@ use zenith_utils::lsn::Lsn;
use zenith_utils::zid::ZTenantId;
use crate::relish::*;
use crate::repository::WALRecord;
use crate::repository::{Timeline, WALRecord};
use crate::waldecoder::XlMultiXactCreate;
use crate::waldecoder::XlXactParsedRecord;
use crate::PageServerConf;
@@ -79,6 +79,7 @@ pub trait WalRedoManager: Send + Sync {
/// the reords.
fn request_redo(
&self,
timeline: &dyn Timeline,
rel: RelishTag,
blknum: u32,
lsn: Lsn,
@@ -96,6 +97,7 @@ pub struct DummyRedoManager {}
impl crate::walredo::WalRedoManager for DummyRedoManager {
fn request_redo(
&self,
_timeline: &dyn Timeline,
_rel: RelishTag,
_blknum: u32,
_lsn: Lsn,
@@ -176,6 +178,7 @@ impl WalRedoManager for PostgresRedoManager {
///
fn request_redo(
&self,
timeline: &dyn Timeline,
rel: RelishTag,
blknum: u32,
lsn: Lsn,
@@ -209,13 +212,20 @@ impl WalRedoManager for PostgresRedoManager {
let process = (*process_guard).as_ref().unwrap();
self.runtime
.block_on(self.handle_apply_request(&process, &request))
.block_on(self.handle_apply_request(process, &request))
};
end_time = Instant::now();
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
if let Ok(page) = result {
let mut buf = BytesMut::new();
buf.extend_from_slice(&page);
self.set_hint_bits(timeline, &mut buf, lsn, &request.records);
return Ok(buf.freeze());
}
result
}
}
@@ -242,6 +252,117 @@ impl PostgresRedoManager {
}
}
fn xid_status(&self, timeline: &dyn Timeline, xid: u32, lsn: Lsn) -> u8 {
let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
if let Ok(clog_page) = timeline.get_page_at_lsn_nowait(
RelishTag::Slru {
slru: SlruKind::Clog,
segno,
},
rpageno,
lsn,
) {
postgres_ffi::nonrelfile_utils::transaction_id_get_status(xid, &clog_page[..])
} else {
pg_constants::TRANSACTION_STATUS_IN_PROGRESS
}
}
fn set_hint_bits(
&self,
timeline: &dyn Timeline,
page: &mut BytesMut,
lsn: Lsn,
records: &Vec<WALRecord>,
) {
let mut flags = LittleEndian::read_u16(
&page[pg_constants::PD_FLAGS_OFFSET..pg_constants::PD_FLAGS_OFFSET + 2],
);
if (flags & (pg_constants::PD_HEAP_RELATION | pg_constants::PD_NONHEAP_RELATION)) == 0 {
// If type of relation was not determined yet,
// then do it now
for r in records {
let xl_rmid = r.rec[pg_constants::XL_RMID_OFFS];
if xl_rmid == pg_constants::RM_HEAP_ID || xl_rmid == pg_constants::RM_HEAP2_ID {
flags |= pg_constants::PD_HEAP_RELATION;
break;
}
}
if (flags & pg_constants::PD_HEAP_RELATION) == 0 {
flags |= pg_constants::PD_NONHEAP_RELATION;
}
LittleEndian::write_u16(
&mut page[pg_constants::PD_FLAGS_OFFSET..pg_constants::PD_FLAGS_OFFSET + 2],
flags,
);
}
if (flags & pg_constants::PD_HEAP_RELATION) != 0 {
// Set hint bits for heap relation page
let pd_lower = LittleEndian::read_u16(
&page[pg_constants::PD_LOWER_OFFSET..pg_constants::PD_LOWER_OFFSET + 2],
) as usize;
let mut tid_offs = pg_constants::SIZE_OF_PAGE_HEADER_DATA;
while tid_offs < pd_lower {
let tid = LittleEndian::read_u32(&page[tid_offs..tid_offs + 4]);
let lp_off = (tid & 0x7FFF) as usize;
if ((tid >> 15) & 3) == pg_constants::LP_NORMAL {
// normal item pointer
let t_xmin = LittleEndian::read_u32(
&page[lp_off + pg_constants::T_XMIN_OFFS
..lp_off + pg_constants::T_XMIN_OFFS + 4],
);
let t_xmax = LittleEndian::read_u32(
&page[lp_off + pg_constants::T_XMAX_OFFS
..lp_off + pg_constants::T_XMAX_OFFS + 4],
);
let mut t_infomask = LittleEndian::read_u16(
&page[lp_off + pg_constants::T_INFOMASK_OFFS
..lp_off + pg_constants::T_INFOMASK_OFFS + 2],
);
if (t_infomask
& (pg_constants::HEAP_XMIN_COMMITTED | pg_constants::HEAP_XMIN_INVALID))
== 0
&& t_xmin != 0
{
let status = self.xid_status(timeline, t_xmin, lsn);
if status == pg_constants::TRANSACTION_STATUS_COMMITTED {
t_infomask |= pg_constants::HEAP_XMIN_COMMITTED;
} else if status == pg_constants::TRANSACTION_STATUS_ABORTED {
t_infomask |= pg_constants::HEAP_XMIN_INVALID;
}
LittleEndian::write_u16(
&mut page[lp_off + pg_constants::T_INFOMASK_OFFS
..lp_off + pg_constants::T_INFOMASK_OFFS + 2],
t_infomask,
);
}
if (t_infomask
& (pg_constants::HEAP_XMAX_COMMITTED
| pg_constants::HEAP_XMAX_INVALID
| pg_constants::HEAP_XMAX_IS_MULTI))
== 0
&& t_xmax != 0
{
let status = self.xid_status(timeline, t_xmax, lsn);
if status == pg_constants::TRANSACTION_STATUS_COMMITTED {
t_infomask |= pg_constants::HEAP_XMAX_COMMITTED;
} else if status == pg_constants::TRANSACTION_STATUS_ABORTED {
t_infomask |= pg_constants::HEAP_XMAX_INVALID;
}
LittleEndian::write_u16(
&mut page[lp_off + pg_constants::T_INFOMASK_OFFS
..lp_off + pg_constants::T_INFOMASK_OFFS + 2],
t_infomask,
);
}
}
tid_offs += 4;
}
}
}
///
/// Process one request for WAL redo.
///
@@ -324,7 +445,7 @@ impl PostgresRedoManager {
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_SUB_COMMITTED,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
}
@@ -453,7 +574,7 @@ impl PostgresRedoProcess {
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
// just create one with constant name. That fails if you try to launch more than
// one WAL redo manager concurrently.
let datadir = conf.tenant_path(&tenantid).join("wal-redo-datadir");
let datadir = conf.tenant_path(tenantid).join("wal-redo-datadir");
// Create empty data directory for wal-redo postgres, deleting old one first.
if datadir.exists() {

View File

@@ -11,7 +11,7 @@
//! data directory is compatible with a postgres binary. That includes
//! a version number, configuration options that can be set at
//! compilation time like the block size, and the platform's alignment
//! and endianess information. (The PostgreSQL on-disk file format is
//! and endianness information. (The PostgreSQL on-disk file format is
//! not portable across platforms.)
//!
//! The control file is stored in the PostgreSQL data directory, as

View File

@@ -46,6 +46,7 @@ pub const SIZE_OF_PAGE_HEADER: u16 = 24;
pub const BITS_PER_HEAPBLOCK: u16 = 2;
pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;
pub const TRANSACTION_STATUS_IN_PROGRESS: u8 = 0x00;
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
@@ -189,11 +190,36 @@ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
pub const XLP_LONG_HEADER: u16 = 0x0002;
pub const PG_MAJORVERSION: &'static str = "14";
pub const PG_MAJORVERSION: &str = "14";
// Zenith specific page flags used to distinguish heap and non-heap relations
pub const PD_HEAP_RELATION: u16 = 0x10;
pub const PD_NONHEAP_RELATION: u16 = 0x20;
// bufpage.h
pub const PD_FLAGS_OFFSET: usize = 10; // PageHeaderData.pd_flags
pub const PD_LOWER_OFFSET: usize = 12; // PageHeaderData.pd_lower
// itemid.h
pub const LP_NORMAL: u32 = 1;
// htup_details.h
pub const T_XMIN_OFFS: usize = 0;
pub const T_XMAX_OFFS: usize = 4;
pub const T_INFOMASK_OFFS: usize = 4 * 3 + 2 * 3 + 2;
pub const HEAP_XMIN_COMMITTED: u16 = 0x0100; /* t_xmin committed */
pub const HEAP_XMIN_INVALID: u16 = 0x0200; /* t_xmin invalid/aborted */
pub const HEAP_XMAX_COMMITTED: u16 = 0x0400; /* t_xmax committed */
pub const HEAP_XMAX_INVALID: u16 = 0x0800; /* t_xmax invalid/aborted */
pub const HEAP_XMAX_IS_MULTI: u16 = 0x1000; /* t_xmax is a MultiXactId */
pub const SIZE_OF_PAGE_HEADER_DATA: usize = 24;
// xlogrecord.h
pub const XL_RMID_OFFS: usize = 17;
// List of subdirectories inside pgdata.
// Copied from src/bin/initdb/initdb.c
pub const PGDATA_SUBDIRS: [&'static str; 22] = [
pub const PGDATA_SUBDIRS: [&str; 22] = [
"global",
"pg_wal/archive_status",
"pg_commit_ts",
@@ -218,11 +244,11 @@ pub const PGDATA_SUBDIRS: [&'static str; 22] = [
"pg_logical/mappings",
];
pub const PGDATA_SPECIAL_FILES: [&'static str; 4] = [
"pg_hba.conf",
"pg_ident.conf",
"postgresql.conf",
"postgresql.auto.conf",
];
// Don't include postgresql.conf as it is inconvenient on node start:
// we need postgresql.conf before basebackup to synchronize safekeepers
// so no point in overwriting it during backup restore. Rest of the files
// here are not needed before backup so it is okay to edit them after.
pub const PGDATA_SPECIAL_FILES: [&str; 3] =
["pg_hba.conf", "pg_ident.conf", "postgresql.auto.conf"];
pub static PG_HBA: &'static str = include_str!("../samples/pg_hba.conf");
pub static PG_HBA: &str = include_str!("../samples/pg_hba.conf");

View File

@@ -26,6 +26,7 @@ use std::fs::{self, File};
use std::io::prelude::*;
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use zenith_utils::lsn::Lsn;
pub const XLOG_FNAME_LEN: usize = 24;
pub const XLOG_BLCKSZ: usize = 8192;
@@ -37,6 +38,7 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::<XLogPageHeaderData>();
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::<XLogLongPageHeaderData>();
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
#[allow(clippy::identity_op)]
pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
pub type XLogRecPtr = u64;
@@ -88,6 +90,21 @@ pub fn IsPartialXLogFileName(fname: &str) -> bool {
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
}
/// If LSN points to the beginning of the page, then shift it to first record,
/// otherwise align on 8-bytes boundary (required for WAL records)
pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn {
if lsn.0 % XLOG_BLCKSZ as u64 == 0 {
let hdr_size = if lsn.0 % seg_sz as u64 == 0 {
XLOG_SIZE_OF_XLOG_LONG_PHD
} else {
XLOG_SIZE_OF_XLOG_SHORT_PHD
};
lsn + hdr_size as u64
} else {
lsn.align()
}
}
pub fn get_current_timestamp() -> TimestampTz {
const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
@@ -173,12 +190,11 @@ fn find_end_of_wal_segment(
let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS;
wal_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]);
crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]);
crc = !crc;
} else {
crc ^= 0xFFFFFFFFu32;
crc = crc32c_append(crc, &buf[page_offs..page_offs + n]);
crc = !crc;
}
crc = !crc;
rec_offs += n;
offs += n;
contlen -= n;
@@ -416,7 +432,6 @@ mod tests {
use super::*;
use regex::Regex;
use std::{env, process::Command, str::FromStr};
use zenith_utils::lsn::Lsn;
// Run find_end_of_wal against file in test_wal dir
// Ensure that it finds last record correctly
@@ -465,7 +480,7 @@ mod tests {
let waldump_output = std::str::from_utf8(&waldump_output.stderr).unwrap();
println!("waldump_output = '{}'", &waldump_output);
let re = Regex::new(r"invalid record length at (.+):").unwrap();
let caps = re.captures(&waldump_output).unwrap();
let caps = re.captures(waldump_output).unwrap();
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
// 5. Rename file to partial to actually find last valid lsn

View File

@@ -56,7 +56,7 @@ impl CPlaneApi {
md5::compute([stored_hash.as_bytes(), salt].concat())
);
let received_hash = std::str::from_utf8(&md5_response)?;
let received_hash = std::str::from_utf8(md5_response)?;
println!(
"auth: {} rh={} sh={} ssh={} {:?}",

View File

@@ -143,10 +143,10 @@ fn main() -> anyhow::Result<()> {
// for each connection.
thread::Builder::new()
.name("Proxy thread".into())
.spawn(move || proxy::thread_main(&state, pageserver_listener))?,
.spawn(move || proxy::thread_main(state, pageserver_listener))?,
thread::Builder::new()
.name("Mgmt thread".into())
.spawn(move || mgmt::thread_main(&state, mgmt_listener))?,
.spawn(move || mgmt::thread_main(state, mgmt_listener))?,
];
for t in threads.into_iter() {

View File

@@ -1,3 +1,4 @@
import subprocess
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
@@ -74,3 +75,18 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
# All the rows are visible on the main branch
main_cur.execute('SELECT count(*) FROM foo')
assert main_cur.fetchone() == (200100, )
# Check bad lsn's for branching
# branch at segment boundary
zenith_cli.run(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
pg = postgres.create_start("test_branch_segment_boundary")
cur = pg.connect().cursor()
cur.execute('SELECT 1')
assert cur.fetchone() == (1, )
# branch at pre-initdb lsn
try:
zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
except subprocess.CalledProcessError:
print("Branch creation with pre-initdb LSN failed (as expected)")

View File

@@ -3,7 +3,7 @@ import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
pytest_plugins = ("fixtures.zenith_fixtures")

View File

@@ -2,7 +2,7 @@ import os
import pathlib
from contextlib import closing
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -69,6 +69,8 @@ def test_dropdb(
with conn.cursor() as cur:
cur.execute('DROP DATABASE foodb')
cur.execute('CHECKPOINT')
cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn_after_drop = cur.fetchone()[0]
@@ -94,3 +96,6 @@ def test_dropdb(
print(dbpath)
assert os.path.isdir(dbpath) == False
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, pg, lsn_after_drop, postgres)

View File

@@ -1,4 +1,4 @@
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -63,3 +63,6 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_b
# Check that we restored pg_controlfile correctly
assert next_multixact_id_new == next_multixact_id
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, pg, lsn, postgres)

View File

@@ -6,19 +6,19 @@ pytest_plugins = ("fixtures.zenith_fixtures")
def print_gc_result(row):
print("GC duration {elapsed} ms".format_map(row));
print(" REL total: {snapshot_relfiles_total}, needed_by_cutoff {snapshot_relfiles_needed_by_cutoff}, needed_by_branches: {snapshot_relfiles_needed_by_branches}, not_updated: {snapshot_relfiles_not_updated}, removed: {snapshot_relfiles_removed}, dropped: {snapshot_relfiles_dropped}".format_map(row))
print(" NONREL total: {snapshot_nonrelfiles_total}, needed_by_cutoff {snapshot_nonrelfiles_needed_by_cutoff}, needed_by_branches: {snapshot_nonrelfiles_needed_by_branches}, not_updated: {snapshot_nonrelfiles_not_updated}, removed: {snapshot_nonrelfiles_removed}, dropped: {snapshot_nonrelfiles_dropped}".format_map(row))
print(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row))
print(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row))
#
# Test Garbage Collection of old snapshot files
# Test Garbage Collection of old layer files
#
# This test is pretty tightly coupled with the current implementation of layered
# storage, in layered_repository.rs.
#
def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_snapfiles_gc", "empty"])
pg = postgres.create_start('test_snapfiles_gc')
def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_layerfiles_gc", "empty"])
pg = postgres.create_start('test_layerfiles_gc')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
@@ -55,8 +55,8 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
row = pscur.fetchone()
print_gc_result(row);
# remember the number of files
snapshot_relfiles_remain = row['snapshot_relfiles_total'] - row['snapshot_relfiles_removed']
assert snapshot_relfiles_remain > 0
layer_relfiles_remain = row['layer_relfiles_total'] - row['layer_relfiles_removed']
assert layer_relfiles_remain > 0
# Insert a row.
print("Inserting one row and running GC")
@@ -64,12 +64,12 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
assert row['layer_relfiles_removed'] == 1
assert row['layer_relfiles_dropped'] == 0
# Insert two more rows and run GC.
# This should create a new snapshot file with the new contents, and
# This should create a new layer file with the new contents, and
# remove the old one.
print("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
@@ -78,11 +78,11 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
assert row['layer_relfiles_removed'] == 1
assert row['layer_relfiles_dropped'] == 0
# Do it again. Should again create a new snapshot file and remove old one.
# Do it again. Should again create a new layer file and remove old one.
print("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
@@ -90,18 +90,18 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
assert row['layer_relfiles_removed'] == 1
assert row['layer_relfiles_dropped'] == 0
# Run GC again, with no changes in the database. Should not remove anything.
print("Run GC again, with nothing to do")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain
assert row['snapshot_relfiles_removed'] == 0
assert row['snapshot_relfiles_dropped'] == 0
assert row['layer_relfiles_total'] == layer_relfiles_remain
assert row['layer_relfiles_removed'] == 0
assert row['layer_relfiles_dropped'] == 0
#
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
@@ -114,11 +114,11 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
print_gc_result(row);
# Each relation fork is counted separately, hence 3.
assert row['snapshot_relfiles_dropped'] == 3
assert row['layer_relfiles_dropped'] == 3
# The catalog updates also create new snapshot files of the catalogs, which
# The catalog updates also create new layer files of the catalogs, which
# are counted as 'removed'
assert row['snapshot_relfiles_removed'] > 0
assert row['layer_relfiles_removed'] > 0
# TODO: perhaps we should count catalog and user relations separately,
# to make this kind of testing more robust

View File

@@ -59,9 +59,8 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
# Create a branch with the transaction in prepared state
zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase"])
# Create compute node, but don't start.
# We want to observe pgdata before postgres starts
pg2 = postgres.create(
# Start compute on the new branch
pg2 = postgres.create_start(
'test_twophase_prepared',
config_lines=['max_prepared_transactions=5'],
)
@@ -71,7 +70,6 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
print(twophase_files2)
assert twophase_files2.sort() == twophase_files.sort()
pg2 = pg2.start()
conn2 = pg2.connect()
cur2 = conn2.cursor()

View File

@@ -1,7 +1,7 @@
import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -49,3 +49,10 @@ def test_pg_regress(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
# checkpoint one more time to ensure that the lsn we get is the latest one
pg.safe_psql('CHECKPOINT')
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, pg, lsn, postgres)

View File

@@ -1,7 +1,7 @@
import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import PostgresFactory
from fixtures.zenith_fixtures import PostgresFactory, check_restored_datadir_content
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -50,3 +50,10 @@ def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_outp
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
# checkpoint one more time to ensure that the lsn we get is the latest one
pg.safe_psql('CHECKPOINT')
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, pg, lsn, postgres)

View File

@@ -1,6 +1,7 @@
from pprint import pprint
import os
import re
import timeit
import pathlib
import uuid
@@ -78,7 +79,7 @@ class ZenithBenchmarkResults:
self.results.append((test_name, metric_name, metric_value, unit))
# Sesssion scope fixture that initializes the results object
# Session scope fixture that initializes the results object
@pytest.fixture(autouse=True, scope='session')
def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]:
"""
@@ -120,6 +121,35 @@ class ZenithBenchmarker:
self.results.record(self.request.node.name, metric_name, end - start, 's')
def get_io_writes(self, pageserver) -> int:
"""
Fetch the "cumulative # of bytes written" metric from the pageserver
"""
# Fetch all the exposed prometheus metrics from page server
all_metrics = pageserver.http_client().get_metrics()
# Use a regular expression to extract the one we're interested in
#
# TODO: If we start to collect more of the prometheus metrics in the
# performance test suite like this, we should refactor this to load and
# parse all the metrics into a more convenient structure in one go.
#
# The metric should be an integer, as it's a number of bytes. But in general
# all prometheus metrics are floats. So to be pedantic, read it as a float
# and round to integer.
matches = re.search(r'pageserver_disk_io_bytes{io_operation="write"} (\S+)', all_metrics)
return int(round(float(matches.group(1))))
@contextmanager
def record_pageserver_writes(self, pageserver, metric_name):
"""
Record bytes written by the pageserver during a test.
"""
before = self.get_io_writes(pageserver)
yield
after = self.get_io_writes(pageserver)
self.results.record(self.request.node.name, metric_name, round((after - before) / (1024 * 1024)), 'MB')
@pytest.fixture(scope='function')
def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]:
"""

View File

@@ -10,6 +10,8 @@ import shutil
import signal
import subprocess
import time
import filecmp
import difflib
from contextlib import closing
from pathlib import Path
@@ -169,12 +171,23 @@ class ZenithCli:
args = [self.bin_zenith] + arguments
print('Running command "{}"'.format(' '.join(args)))
return subprocess.run(args,
env=self.env,
check=True,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
# Intercept CalledProcessError and print more info
try:
res = subprocess.run(args,
env=self.env,
check=True,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
except subprocess.CalledProcessError as err:
print(f"Run failed: {err}")
print(f" stdout: {err.stdout}")
print(f" stderr: {err.stderr}")
raise err
return res
@zenfixture
@@ -226,6 +239,11 @@ class ZenithPageserverHttpClient(requests.Session):
res.raise_for_status()
return res.json()
def get_metrics(self) -> str:
res = self.get(f"http://localhost:{self.port}/metrics")
res.raise_for_status()
return res.text
@dataclass
class AuthKeys:
@@ -434,7 +452,6 @@ class Postgres(PgProtocol):
branch: str,
wal_acceptors: Optional[str] = None,
config_lines: Optional[List[str]] = None,
config_only: bool = False,
) -> 'Postgres':
"""
Create the pg data directory.
@@ -446,10 +463,7 @@ class Postgres(PgProtocol):
if not config_lines:
config_lines = []
if config_only:
self.zenith_cli.run(['pg', 'create', '--config-only', branch, f'--tenantid={self.tenant_id}'])
else:
self.zenith_cli.run(['pg', 'create', branch, f'--tenantid={self.tenant_id}'])
self.zenith_cli.run(['pg', 'create', branch, f'--tenantid={self.tenant_id}'])
self.branch = branch
path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.branch
self.pgdata_dir = os.path.join(self.repo_dir, path)
@@ -470,11 +484,13 @@ class Postgres(PgProtocol):
assert self.branch is not None
print(f"Starting postgres on brach {self.branch}")
print(f"Starting postgres on branch {self.branch}")
self.zenith_cli.run(['pg', 'start', self.branch, f'--tenantid={self.tenant_id}'])
run_result = self.zenith_cli.run(['pg', 'start', self.branch, f'--tenantid={self.tenant_id}'])
self.running = True
print(f"stdout: {run_result.stdout}")
self.pg_bin.run(['pg_controldata', self.pg_data_dir_path()])
return self
@@ -572,7 +588,6 @@ class Postgres(PgProtocol):
branch=branch,
wal_acceptors=wal_acceptors,
config_lines=config_lines,
config_only=True,
).start()
return self
@@ -584,6 +599,23 @@ class Postgres(PgProtocol):
self.stop()
def list_files_to_compare(self):
pgdata_files = []
for root, _file, filenames in os.walk(self.pgdata_dir):
for filename in filenames:
rel_dir = os.path.relpath(root, self.pgdata_dir)
# Skip some dirs and files we don't want to compare
skip_dirs = ['pg_wal', 'pg_stat', 'pg_stat_tmp', 'pg_subtrans', 'pg_logical']
skip_files = ['pg_internal.init', 'pg.log', 'zenith.signal', 'postgresql.conf',
'postmaster.opts', 'postmaster.pid', 'pg_control']
if rel_dir not in skip_dirs and filename not in skip_files:
rel_file = os.path.join(rel_dir, filename)
pgdata_files.append(rel_file)
pgdata_files.sort()
print(pgdata_files)
return pgdata_files
class PostgresFactory:
""" An object representing multiple running postgres daemons. """
def __init__(self, zenith_cli: ZenithCli, repo_dir: str, pg_bin: PgBin, initial_tenant: str, base_port: int = 55431):
@@ -911,3 +943,55 @@ class TenantFactory:
@zenfixture
def tenant_factory(zenith_cli: ZenithCli):
return TenantFactory(zenith_cli)
# pg is the existing compute node we want to compare our basebackup to
# lsn is the latest lsn of this node
def check_restored_datadir_content(zenith_cli, pg, lsn, postgres: PostgresFactory):
    """
    Compare the data directory of the (stopped) compute node `pg` with a
    fresh basebackup of the same timeline taken at `lsn`, to catch bugs
    where the pageserver restores page or SLRU content incorrectly.

    Asserts that both directories contain the same set of comparable files
    and that each file's content is byte-identical. For every mismatching
    file, a binary diff is written next to the restored copy for inspection.
    """
    # stop postgres to ensure that files won't change
    pg.stop()

    # list files we're going to compare
    pgdata_files = pg.list_files_to_compare()

    # create new branch, but don't start postgres
    # We only need 'basebackup' result here.
    zenith_cli.run(
        ["branch", "check_restored_datadir", pg.branch + "@" + lsn])
    pg2 = postgres.create('check_restored_datadir')

    print('postgres is created on check_restored_datadir branch')
    print('files in a basebackup')
    # list files we're going to compare
    pgdata_files2 = pg2.list_files_to_compare()

    # check that file sets are equal
    assert pgdata_files == pgdata_files2

    # compare content of the files
    # filecmp returns (match, mismatch, error) lists
    # We've already filtered all mismatching files in list_files_to_compare(),
    # so here expect that the content is identical
    (match, mismatch, error) = filecmp.cmpfiles(pg.pgdata_dir,
                                                pg2.pgdata_dir,
                                                pgdata_files,
                                                shallow=False)
    print('filecmp result mismatch and error lists:')
    print(mismatch)
    print(error)

    # For each mismatching file, dump a human-readable bit-level diff to
    # <restored-file>.diff so the failure can be inspected after the test.
    for f in mismatch:
        f1 = os.path.join(pg.pgdata_dir, f)
        f2 = os.path.join(pg2.pgdata_dir, f)
        stdout_filename = "{}.diff".format(f2)

        with open(stdout_filename, 'w') as stdout_f:
            subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True)
            subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True)
            cmd = ['diff {}.hex {}.hex'.format(f1, f2)]
            subprocess.run(cmd, stdout=stdout_f, shell=True)

    assert (mismatch, error) == ([], [])

View File

@@ -46,13 +46,14 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin
connstr = pg.connstr()
# Initialize pgbench database
with zenbenchmark.record_duration('init'):
pg_bin.run_capture(['pgbench', '-s5', '-i', connstr])
# Initialize pgbench database, recording the time and I/O it takes
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('init'):
pg_bin.run_capture(['pgbench', '-s5', '-i', connstr])
# Flush the layers from memory to disk. The time to do that is included in the
# reported init time.
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
# Flush the layers from memory to disk. This is included in the reported
# time and I/O
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
# Run pgbench for 5000 transactions
with zenbenchmark.record_duration('5000_xacts'):

View File

@@ -148,7 +148,7 @@ impl ReplicationConn {
}
});
let (mut start_pos, mut stop_pos) = Self::parse_start_stop(&cmd)?;
let (mut start_pos, mut stop_pos) = Self::parse_start_stop(cmd)?;
let mut wal_seg_size: usize;
loop {
@@ -229,7 +229,7 @@ impl ReplicationConn {
start_pos += send_size as u64;
debug!("Sent WAL to page server up to {}", end_pos);
debug!("sent WAL up to {}", end_pos);
// Decide whether to reuse this file. If we don't set wal_file here
// a new file will be opened next time.

View File

@@ -10,6 +10,7 @@ use log::*;
use postgres_ffi::xlog_utils::TimeLineID;
use serde::{Deserialize, Serialize};
use std::cmp::max;
use std::cmp::min;
use std::io;
use std::io::Read;
@@ -47,6 +48,7 @@ pub struct ServerInfo {
/// Postgres server version
pub pg_version: u32,
pub system_id: SystemId,
pub tenant_id: ZTenantId,
/// Zenith timelineid
pub ztli: ZTimelineId,
pub tli: TimeLineID,
@@ -65,10 +67,9 @@ pub struct SafeKeeperState {
/// information about server
pub server: ServerInfo,
/// Unique id of the last *elected* proposer we dealt with. Not needed
/// correctness, exists for monitoring purposes.
/// for correctness, exists for monitoring purposes.
pub proposer_uuid: PgUuid,
/// part of WAL acknowledged by quorum (note that we might not have wal to
/// up this point locally)
/// part of WAL acknowledged by quorum and available locally
pub commit_lsn: Lsn,
/// minimal LSN which may be needed for recovery of some safekeeper (end lsn
/// + 1 of last record streamed to everyone)
@@ -84,6 +85,7 @@ impl SafeKeeperState {
server: ServerInfo {
pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */
system_id: 0, /* Postgres system identifier */
tenant_id: ZTenantId::from([0u8; 16]),
ztli: ZTimelineId::from([0u8; 16]),
tli: 0,
wal_seg_size: 0,
@@ -95,6 +97,12 @@ impl SafeKeeperState {
}
}
impl Default for SafeKeeperState {
fn default() -> Self {
Self::new()
}
}
// protocol messages
/// Initial Proposer -> Acceptor message
@@ -155,7 +163,7 @@ pub struct AppendRequestHeader {
end_lsn: Lsn,
/// LSN committed by quorum of safekeepers
commit_lsn: Lsn,
/// restart LSN position (minimal LSN which may be needed by proposer to perform recovery)
/// restart LSN position (minimal LSN which may be needed by proposer to perform recovery)
restart_lsn: Lsn,
// only for logging/debugging
proposer_uuid: PgUuid,
@@ -172,6 +180,9 @@ pub struct AppendResponse {
// make much sense without taking epoch into account, as history can be
// diverged.
pub flush_lsn: Lsn,
// We report back our awareness about which WAL is committed, as this is
// a criterion for walproposer --sync mode exit
pub commit_lsn: Lsn,
pub hs_feedback: HotStandbyFeedback,
}
@@ -205,7 +216,7 @@ impl ProposerAcceptorMessage {
let rec_size = hdr
.end_lsn
.checked_sub(hdr.begin_lsn)
.ok_or(anyhow!("begin_lsn > end_lsn in AppendRequest"))?
.ok_or_else(|| anyhow!("begin_lsn > end_lsn in AppendRequest"))?
.0 as usize;
if rec_size > MAX_SEND_SIZE {
bail!(
@@ -217,10 +228,7 @@ impl ProposerAcceptorMessage {
let mut wal_data_vec: Vec<u8> = vec![0; rec_size];
stream.read_exact(&mut wal_data_vec)?;
let wal_data = Bytes::from(wal_data_vec);
let msg = AppendRequest {
h: hdr,
wal_data: wal_data,
};
let msg = AppendRequest { h: hdr, wal_data };
Ok(ProposerAcceptorMessage::AppendRequest(msg))
}
@@ -274,7 +282,9 @@ pub struct SafeKeeper<ST: Storage> {
/// reading wal.
pub flush_lsn: Lsn,
pub tli: u32,
pub flushed_truncate_lsn: Lsn,
/// not-yet-flushed pairs of same named fields in s.*
pub commit_lsn: Lsn,
pub truncate_lsn: Lsn,
pub storage: ST,
pub s: SafeKeeperState, // persistent part
pub elected_proposer_term: Term, // for monitoring/debugging
@@ -289,7 +299,8 @@ where
SafeKeeper {
flush_lsn,
tli,
flushed_truncate_lsn: Lsn(0),
commit_lsn: state.commit_lsn,
truncate_lsn: state.truncate_lsn,
storage,
s: state,
elected_proposer_term: 0,
@@ -320,13 +331,6 @@ where
SK_PROTOCOL_VERSION
);
}
if self.s.server.system_id != 0 && self.s.server.system_id != msg.system_id {
bail!(
"system identifier changed: got {}, expected {}",
msg.system_id,
self.s.server.system_id,
);
}
/* Postgres upgrade is not treated as fatal error */
if msg.pg_version != self.s.server.pg_version
&& self.s.server.pg_version != UNKNOWN_SERVER_VERSION
@@ -339,6 +343,7 @@ where
// set basic info about server, if not yet
self.s.server.system_id = msg.system_id;
self.s.server.tenant_id = msg.tenant_id;
self.s.server.ztli = msg.ztli;
self.s.server.tli = msg.tli;
self.s.server.wal_seg_size = msg.wal_seg_size;
@@ -378,12 +383,13 @@ where
}
/// Handle request to append WAL.
#[allow(clippy::comparison_chain)]
fn handle_append_request(&mut self, msg: &AppendRequest) -> Result<AcceptorProposerMessage> {
// log first AppendRequest from this proposer
if self.elected_proposer_term < msg.h.term {
info!(
"start receiving WAL from timeline {} term {}",
self.s.server.ztli, msg.h.term,
"start accepting WAL from timeline {}, tenant {}, term {}, epochStartLsn {:?}",
self.s.server.ztli, self.s.server.tenant_id, msg.h.term, msg.h.epoch_start_lsn,
);
self.elected_proposer_term = msg.h.term;
}
@@ -398,6 +404,7 @@ where
let resp = AppendResponse {
term: self.s.acceptor_state.term,
epoch: self.s.acceptor_state.epoch,
commit_lsn: Lsn(0),
flush_lsn: Lsn(0),
hs_feedback: HotStandbyFeedback::empty(),
};
@@ -414,9 +421,13 @@ where
* maximum (vcl) determined by WAL proposer during handshake.
* Switching epoch means that node completes recovery and start writing in the WAL new data.
* XXX: this is wrong, we must actively truncate not matching part of log.
*
* The non-strict inequality is important for us, as proposer in --sync mode doesn't
* generate new records, but to advance commit_lsn epoch switch must happen on majority.
* We can regard this as commit of empty entry in new epoch, this should be safe.
*/
if self.s.acceptor_state.epoch < msg.h.term
&& msg.h.end_lsn > max(self.flush_lsn, msg.h.epoch_start_lsn)
&& msg.h.end_lsn >= max(self.flush_lsn, msg.h.epoch_start_lsn)
{
info!("switched to new epoch {}", msg.h.term);
self.s.acceptor_state.epoch = msg.h.term; /* bump epoch */
@@ -427,8 +438,20 @@ where
}
self.s.proposer_uuid = msg.h.proposer_uuid;
self.s.commit_lsn = msg.h.commit_lsn;
self.s.truncate_lsn = msg.h.restart_lsn;
// Advance commit_lsn taking into account what we have locally.
// xxx this is wrapped into epoch check because we overwrite wal
// instead of truncating it, so without it commit_lsn might include
// wrong part. Anyway, nobody is much interested in our commit_lsn while
// epoch switch hasn't happened, right?
if self.s.acceptor_state.epoch == msg.h.term {
let commit_lsn = min(msg.h.commit_lsn, self.flush_lsn);
// If new commit_lsn reached epoch switch, force sync of control file:
// walproposer in sync mode is very interested when this happens.
sync_control_file |=
commit_lsn >= msg.h.epoch_start_lsn && self.s.commit_lsn < msg.h.epoch_start_lsn;
self.commit_lsn = commit_lsn;
}
self.truncate_lsn = msg.h.restart_lsn;
/*
* Update restart LSN in control file.
@@ -436,24 +459,26 @@ where
* when restart_lsn delta exceeds WAL segment size.
*/
sync_control_file |=
self.flushed_truncate_lsn + (self.s.server.wal_seg_size as u64) < self.s.truncate_lsn;
self.storage.persist(&self.s, sync_control_file)?;
self.s.truncate_lsn + (self.s.server.wal_seg_size as u64) < self.truncate_lsn;
if sync_control_file {
self.flushed_truncate_lsn = self.s.truncate_lsn;
self.s.commit_lsn = self.commit_lsn;
self.s.truncate_lsn = self.truncate_lsn;
}
self.storage.persist(&self.s, sync_control_file)?;
let resp = AppendResponse {
term: self.s.acceptor_state.term,
epoch: self.s.acceptor_state.epoch,
flush_lsn: self.flush_lsn,
commit_lsn: self.s.commit_lsn,
// will be filled by caller code to avoid bothering safekeeper
hs_feedback: HotStandbyFeedback::empty(),
};
trace!(
"processed AppendRequest of len {}, flush_lsn={:X}/{:>08X}, resp {:?}",
debug!(
"processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, resp {:?}",
msg.wal_data.len(),
(self.flush_lsn.0 >> 32) as u32,
self.flush_lsn.0 as u32,
msg.h.end_lsn,
msg.h.commit_lsn,
&resp,
);
Ok(AcceptorProposerMessage::AppendResponse(resp))
@@ -492,7 +517,7 @@ mod tests {
let mut vote_resp = sk.process_msg(&vote_request);
match vote_resp.unwrap() {
AcceptorProposerMessage::VoteResponse(resp) => assert!(resp.vote_given != 0),
_ => assert!(false),
r => panic!("unexpected response: {:?}", r),
}
// reboot...
@@ -506,7 +531,7 @@ mod tests {
vote_resp = sk.process_msg(&vote_request);
match vote_resp.unwrap() {
AcceptorProposerMessage::VoteResponse(resp) => assert!(resp.vote_given == 0),
_ => assert!(false),
r => panic!("unexpected response: {:?}", r),
}
}
@@ -519,7 +544,7 @@ mod tests {
let mut ar_hdr = AppendRequestHeader {
term: 1,
epoch_start_lsn: Lsn(2),
epoch_start_lsn: Lsn(3),
begin_lsn: Lsn(1),
end_lsn: Lsn(2),
commit_lsn: Lsn(0),
@@ -531,20 +556,20 @@ mod tests {
wal_data: Bytes::from_static(b"b"),
};
// check that AppendRequest before VCL doesn't switch epoch
// check that AppendRequest before epochStartLsn doesn't switch epoch
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
assert!(resp.is_ok());
assert!(sk.storage.persisted_state.acceptor_state.epoch == 0);
assert_eq!(sk.storage.persisted_state.acceptor_state.epoch, 0);
// but record after VCL does the switch
// but record at epochStartLsn does the switch
ar_hdr.begin_lsn = Lsn(2);
ar_hdr.end_lsn = Lsn(3);
append_request = AppendRequest {
h: ar_hdr.clone(),
h: ar_hdr,
wal_data: Bytes::from_static(b"b"),
};
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
assert!(resp.is_ok());
assert!(sk.storage.persisted_state.acceptor_state.epoch == 1);
assert_eq!(sk.storage.persisted_state.acceptor_state.epoch, 1);
}
}

View File

@@ -127,7 +127,7 @@ impl SharedState {
if let CreateControlFile::False = create {
bail!("control file is empty");
}
return Ok((file, SafeKeeperState::new()));
Ok((file, SafeKeeperState::new()))
} else {
match SafeKeeperState::des_from(&mut file) {
Err(e) => {
@@ -144,7 +144,7 @@ impl SharedState {
SK_FORMAT_VERSION
);
}
return Ok((file, s));
Ok((file, s))
}
}
}
@@ -217,14 +217,11 @@ impl Timeline {
rmsg = shared_state.sk.process_msg(msg)?;
// locally available commit lsn. flush_lsn can be smaller than
// commit_lsn if we are catching up safekeeper.
commit_lsn = min(shared_state.sk.flush_lsn, shared_state.sk.s.commit_lsn);
commit_lsn = shared_state.sk.commit_lsn;
// if this is AppendResponse, fill in proper hot standby feedback
match rmsg {
AcceptorProposerMessage::AppendResponse(ref mut resp) => {
resp.hs_feedback = shared_state.hs_feedback.clone();
}
_ => (),
if let AcceptorProposerMessage::AppendResponse(ref mut resp) = rmsg {
resp.hs_feedback = shared_state.hs_feedback.clone();
}
}
// Ping wal sender that new data might be available.
@@ -401,7 +398,7 @@ impl Storage for FileStorage {
{
Ok(mut file) => {
for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
file.write_all(&ZERO_BLOCK)?;
file.write_all(ZERO_BLOCK)?;
}
wal_file = file;
}

View File

@@ -95,7 +95,7 @@ fn main() -> Result<()> {
.required(false)
))
.subcommand(SubCommand::with_name("start")
.about("Start a postrges compute node.\n This command actually creates new node from scrath, but preserves existing config files")
.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
.arg(timeline_arg.clone()).arg(tenantid_arg.clone()))
.subcommand(
SubCommand::with_name("stop")
@@ -359,7 +359,7 @@ fn get_branch_infos(
}
fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(&env);
let pageserver = PageServerNode::from_env(env);
match tenant_match.subcommand() {
("list", Some(_)) => {
for tenant in pageserver.tenant_list()? {
@@ -381,12 +381,12 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result
}
fn handle_branch(branch_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(&env);
let pageserver = PageServerNode::from_env(env);
if let Some(branchname) = branch_match.value_of("branchname") {
let startpoint_str = branch_match
.value_of("start-point")
.ok_or(anyhow!("Missing start-point"))?;
.ok_or_else(|| anyhow!("Missing start-point"))?;
let tenantid: ZTenantId = branch_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
@@ -447,9 +447,8 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
let timeline_name = create_match.value_of("timeline").unwrap_or("main");
let config_only = create_match.is_present("config-only");
cplane.new_node(tenantid, timeline_name, config_only)?;
cplane.new_node(tenantid, timeline_name)?;
}
("start", Some(start_match)) => {
let tenantid: ZTenantId = start_match
@@ -466,11 +465,15 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
None
};
println!("Starting postgres on timeline {}...", timeline_name);
println!(
"Starting {} postgres on timeline {}...",
if node.is_some() { "existing" } else { "new" },
timeline_name
);
if let Some(node) = node {
node.start(&auth_token)?;
} else {
let node = cplane.new_node(tenantid, timeline_name, false)?;
let node = cplane.new_node(tenantid, timeline_name)?;
node.start(&auth_token)?;
}
}

View File

@@ -13,7 +13,6 @@ pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
pub use prometheus::{Encoder, TextEncoder};
mod wrappers;
use libc::{c_long, getrusage, rusage, suseconds_t, time_t, timeval, RUSAGE_SELF};
pub use wrappers::{CountedReader, CountedWriter};
/// Gathers all Prometheus metrics and records the I/O stats just before that.
@@ -42,40 +41,26 @@ lazy_static! {
// performed by the process.
// We know the the size of the block, so we can determine the I/O bytes out of it.
// The value might be not 100% exact, but should be fine for Prometheus metrics in this case.
#[allow(clippy::unnecessary_cast)]
fn update_io_metrics() {
let mut usage = rusage {
ru_utime: timeval {
tv_sec: 0 as time_t,
tv_usec: 0 as suseconds_t,
},
ru_stime: timeval {
tv_sec: 0 as time_t,
tv_usec: 0 as suseconds_t,
},
ru_maxrss: 0 as c_long,
ru_ixrss: 0 as c_long,
ru_idrss: 0 as c_long,
ru_isrss: 0 as c_long,
ru_minflt: 0 as c_long,
ru_majflt: 0 as c_long,
ru_nswap: 0 as c_long,
ru_inblock: 0 as c_long,
ru_oublock: 0 as c_long,
ru_msgsnd: 0 as c_long,
ru_msgrcv: 0 as c_long,
ru_nsignals: 0 as c_long,
ru_nvcsw: 0 as c_long,
ru_nivcsw: 0 as c_long,
};
unsafe {
getrusage(RUSAGE_SELF, (&mut usage) as *mut rusage);
}
let rusage_stats = get_rusage_stats();
const BYTES_IN_BLOCK: i64 = 512;
DISK_IO_BYTES
.with_label_values(&["read"])
.set(usage.ru_inblock * BYTES_IN_BLOCK);
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
DISK_IO_BYTES
.with_label_values(&["write"])
.set(usage.ru_oublock * BYTES_IN_BLOCK);
.set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
}
fn get_rusage_stats() -> libc::rusage {
let mut rusage = std::mem::MaybeUninit::uninit();
// SAFETY: kernel will initialize the struct for us
unsafe {
let ret = libc::getrusage(libc::RUSAGE_SELF, rusage.as_mut_ptr());
assert!(ret == 0, "getrusage failed: bad args");
rusage.assume_init()
}
}

View File

@@ -186,7 +186,7 @@ mod tests {
assert_eq!(total, stream.len());
}
// This mimicks the constraints of std::thread::spawn
// This mimics the constraints of std::thread::spawn
fn assert_send_sync(_x: impl Sync + Send + 'static) {}
#[test]

View File

@@ -1,6 +1,6 @@
// For details about authentication see docs/authentication.md
// TODO there are two issues for our use case in jsonwebtoken library which will be resolved in next release
// The fisrt one is that there is no way to disable expiration claim, but it can be excluded from validation, so use this as a workaround for now.
// The first one is that there is no way to disable expiration claim, but it can be excluded from validation, so use this as a workaround for now.
// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/190
// The second one is that we wanted to use ed25519 keys, but they are also not supported until next version. So we go with RSA keys for now.
// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162
@@ -8,7 +8,8 @@
use hex::{self, FromHex};
use serde::de::Error;
use serde::{self, Deserializer, Serializer};
use std::{fs, path::PathBuf};
use std::fs;
use std::path::Path;
use anyhow::{bail, Result};
use jsonwebtoken::{
@@ -43,8 +44,8 @@ where
{
let opt: Option<String> = Option::deserialize(deserializer)?;
match opt {
Some(tid) => return Ok(Some(ZTenantId::from_hex(tid).map_err(Error::custom)?)),
None => return Ok(None),
Some(tid) => Ok(Some(ZTenantId::from_hex(tid).map_err(Error::custom)?)),
None => Ok(None),
}
}
@@ -91,7 +92,7 @@ pub struct JwtAuth {
}
impl JwtAuth {
pub fn new<'a>(decoding_key: DecodingKey<'a>) -> Self {
pub fn new(decoding_key: DecodingKey<'_>) -> Self {
Self {
decoding_key: decoding_key.into_static(),
validation: Validation {
@@ -102,7 +103,7 @@ impl JwtAuth {
}
}
pub fn from_key_path(key_path: &PathBuf) -> Result<Self> {
pub fn from_key_path(key_path: &Path) -> Result<Self> {
let public_key = fs::read_to_string(key_path)?;
Ok(Self::new(DecodingKey::from_rsa_pem(public_key.as_bytes())?))
}
@@ -113,8 +114,8 @@ impl JwtAuth {
}
// this function is used only for testing purposes in CLI e g generate tokens during init
pub fn encode_from_key_path(claims: &Claims, key_path: &PathBuf) -> Result<String> {
pub fn encode_from_key_path(claims: &Claims, key_path: &Path) -> Result<String> {
let key_data = fs::read_to_string(key_path)?;
let key = EncodingKey::from_rsa_pem(&key_data.as_bytes())?;
let key = EncodingKey::from_rsa_pem(key_data.as_bytes())?;
Ok(encode(&Header::new(JWT_ALGORITHM), claims, &key)?)
}

View File

@@ -6,7 +6,7 @@
//!
//! The [`LeSer`] trait does the same thing, in little-endian form.
//!
//! Note: you will get a compile error if you try to `use` both trais
//! Note: you will get a compile error if you try to `use` both traits
//! in the same module or scope. This is intended to be a safety
//! mechanism: mixing big-endian and little-endian encoding in the same file
//! is error-prone.

View File

@@ -95,13 +95,13 @@ pub fn attach_openapi_ui(
fn parse_token(header_value: &str) -> Result<&str, ApiError> {
// header must be in form Bearer <token>
let (prefix, token) = header_value.split_once(' ').ok_or(ApiError::Unauthorized(
"malformed authorization header".to_string(),
))?;
let (prefix, token) = header_value
.split_once(' ')
.ok_or_else(|| ApiError::Unauthorized("malformed authorization header".to_string()))?;
if prefix != "Bearer" {
Err(ApiError::Unauthorized(
return Err(ApiError::Unauthorized(
"malformed authorization header".to_string(),
))?
));
}
Ok(token)
}
@@ -123,9 +123,11 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
.map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
req.set_context(data.claims);
}
None => Err(ApiError::Unauthorized(
"missing authorization header".to_string(),
))?,
None => {
return Err(ApiError::Unauthorized(
"missing authorization header".to_string(),
))
}
}
}
Ok(req)
@@ -145,7 +147,7 @@ pub fn serve_thread_main(
addr: String,
) -> anyhow::Result<()> {
let addr = addr.parse()?;
log::info!("Starting a http endoint at {}", addr);
log::info!("Starting an http endpoint at {}", addr);
// Create a Service from the router above to handle incoming requests.
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();

View File

@@ -1,6 +1,8 @@
//! zenith_utils is intended to be a place to put code that is shared
//! between other crates in this repository.
#![allow(clippy::manual_range_contains)]
/// `Lsn` type implements common tasks on Log Sequence Numbers
pub mod lsn;
/// SeqWait allows waiting for a future sequence number to arrive

View File

@@ -32,6 +32,12 @@ impl Lsn {
self.0.checked_sub(other).map(Lsn)
}
/// Subtract a number, returning the difference as i128 to avoid overflow.
pub fn widening_sub<T: Into<u64>>(self, other: T) -> i128 {
let other: u64 = other.into();
i128::from(self.0) - i128::from(other)
}
/// Parse an LSN from a filename in the form `0000000000000000`
pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
where
@@ -264,6 +270,11 @@ mod tests {
assert_eq!(Lsn(1234).checked_sub(1233u64), Some(Lsn(1)));
assert_eq!(Lsn(1234).checked_sub(1235u64), None);
assert_eq!(Lsn(1235).widening_sub(1234u64), 1);
assert_eq!(Lsn(1234).widening_sub(1235u64), -1);
assert_eq!(Lsn(u64::MAX).widening_sub(0u64), i128::from(u64::MAX));
assert_eq!(Lsn(0).widening_sub(u64::MAX), -i128::from(u64::MAX));
let seg_sz: usize = 16 * 1024 * 1024;
assert_eq!(Lsn(0x1000007).segment_offset(seg_sz), 7);
assert_eq!(Lsn(0x1000007).segment_number(seg_sz), 1u64);

View File

@@ -107,12 +107,21 @@ impl io::Write for WriteStream {
}
}
pub struct TlsBoxed {
stream: BufStream,
session: rustls::ServerSession,
}
impl TlsBoxed {
fn rustls_stream(&mut self) -> rustls::Stream<rustls::ServerSession, BufStream> {
rustls::Stream::new(&mut self.session, &mut self.stream)
}
}
pub enum BidiStream {
Tcp(BufStream),
Tls {
stream: BufStream,
session: rustls::ServerSession,
},
/// This variant is boxed, because [`rustls::ServerSession`] is quite larger than [`BufStream`].
Tls(Box<TlsBoxed>),
}
impl BidiStream {
@@ -123,17 +132,13 @@ impl BidiStream {
pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
match self {
Self::Tcp(stream) => stream.get_ref().shutdown(how),
Self::Tls {
stream: reader,
session,
} => {
Self::Tls(tls_boxed) => {
if how == Shutdown::Read {
reader.get_ref().shutdown(how)
tls_boxed.stream.get_ref().shutdown(how)
} else {
session.send_close_notify();
let mut stream = rustls::Stream::new(session, reader);
let res = stream.flush();
reader.get_ref().shutdown(how)?;
tls_boxed.session.send_close_notify();
let res = tls_boxed.rustls_stream().flush();
tls_boxed.stream.get_ref().shutdown(how)?;
res
}
}
@@ -149,8 +154,8 @@ impl BidiStream {
(ReadStream::Tcp(reader), WriteStream::Tcp(stream))
}
Self::Tls { stream, session } => {
let reader = stream.into_reader();
Self::Tls(tls_boxed) => {
let reader = tls_boxed.stream.into_reader();
let buffer_data = reader.buffer().to_owned();
let read_buf_cfg = rustls_split::BufCfg::with_data(buffer_data, 8192);
let write_buf_cfg = rustls_split::BufCfg::with_capacity(8192);
@@ -159,7 +164,7 @@ impl BidiStream {
let socket = Arc::try_unwrap(reader.into_inner().0).unwrap();
let (read_half, write_half) =
rustls_split::split(socket, session, read_buf_cfg, write_buf_cfg);
rustls_split::split(socket, tls_boxed.session, read_buf_cfg, write_buf_cfg);
(ReadStream::Tls(read_half), WriteStream::Tls(write_half))
}
}
@@ -170,7 +175,7 @@ impl BidiStream {
Self::Tcp(mut stream) => {
session.complete_io(&mut stream)?;
assert!(!session.is_handshaking());
Ok(Self::Tls { stream, session })
Ok(Self::Tls(Box::new(TlsBoxed { stream, session })))
}
Self::Tls { .. } => Err(io::Error::new(
io::ErrorKind::InvalidInput,
@@ -184,7 +189,7 @@ impl io::Read for BidiStream {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
match self {
Self::Tcp(stream) => stream.read(buf),
Self::Tls { stream, session } => rustls::Stream::new(session, stream).read(buf),
Self::Tls(tls_boxed) => tls_boxed.rustls_stream().read(buf),
}
}
}
@@ -193,14 +198,14 @@ impl io::Write for BidiStream {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
match self {
Self::Tcp(stream) => stream.write(buf),
Self::Tls { stream, session } => rustls::Stream::new(session, stream).write(buf),
Self::Tls(tls_boxed) => tls_boxed.rustls_stream().write(buf),
}
}
fn flush(&mut self) -> io::Result<()> {
match self {
Self::Tcp(stream) => stream.flush(),
Self::Tls { stream, session } => rustls::Stream::new(session, stream).flush(),
Self::Tls(tls_boxed) => tls_boxed.rustls_stream().flush(),
}
}
}