mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-23 08:00:37 +00:00
Compare commits
6 Commits
set_hints_
...
dropped_re
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5e91d15f06 | ||
|
|
8c46178256 | ||
|
|
2a4d405f96 | ||
|
|
66174c0342 | ||
|
|
cee366ac17 | ||
|
|
66929ad6fb |
@@ -4,17 +4,14 @@ use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::process::Command;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{collections::BTreeMap, path::PathBuf};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use postgres_ffi::pg_constants;
|
||||
use regex::Regex;
|
||||
use zenith_utils::connstring::connection_host_port;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
@@ -89,6 +86,7 @@ impl ComputeControlPlane {
|
||||
&mut self,
|
||||
tenantid: ZTenantId,
|
||||
branch_name: &str,
|
||||
config_only: bool,
|
||||
) -> Result<Arc<PostgresNode>> {
|
||||
let timeline_id = self
|
||||
.pageserver
|
||||
@@ -103,15 +101,25 @@ impl ComputeControlPlane {
|
||||
is_test: false,
|
||||
timelineid: timeline_id,
|
||||
tenantid,
|
||||
uses_wal_proposer: false,
|
||||
});
|
||||
|
||||
node.create_pgdata()?;
|
||||
node.setup_pg_conf(self.env.auth_type)?;
|
||||
|
||||
node.init_from_page_server(self.env.auth_type, config_only)?;
|
||||
self.nodes
|
||||
.insert((tenantid, node.name.clone()), Arc::clone(&node));
|
||||
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
concat!(
|
||||
"synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
|
||||
"zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
|
||||
),
|
||||
node.connstr(),
|
||||
)
|
||||
.as_str(),
|
||||
)?;
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
}
|
||||
@@ -127,7 +135,6 @@ pub struct PostgresNode {
|
||||
is_test: bool,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub tenantid: ZTenantId,
|
||||
uses_wal_proposer: bool,
|
||||
}
|
||||
|
||||
impl PostgresNode {
|
||||
@@ -212,8 +219,6 @@ impl PostgresNode {
|
||||
.parse()
|
||||
.with_context(|| err_msg)?;
|
||||
|
||||
let uses_wal_proposer = config.contains("wal_acceptors");
|
||||
|
||||
// ok now
|
||||
Ok(PostgresNode {
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
|
||||
@@ -223,48 +228,15 @@ impl PostgresNode {
|
||||
is_test: false,
|
||||
timelineid,
|
||||
tenantid,
|
||||
uses_wal_proposer,
|
||||
})
|
||||
}
|
||||
|
||||
fn sync_walkeepers(&self) -> Result<Lsn> {
|
||||
let pg_path = self.env.pg_bin_dir().join("postgres");
|
||||
let sync_output = Command::new(pg_path)
|
||||
.arg("--sync-safekeepers")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("PGDATA", self.pgdata().to_str().unwrap())
|
||||
.output()
|
||||
.with_context(|| "sync-walkeepers failed")?;
|
||||
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"sync-walkeepers failed: '{}'",
|
||||
String::from_utf8_lossy(&sync_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
|
||||
println!("Walkeepers synced on {}", lsn);
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
/// Get basebackup from the pageserver as a tar archive and extract it
|
||||
/// to the `self.pgdata()` directory.
|
||||
fn do_basebackup(&self, lsn: Option<Lsn>) -> Result<()> {
|
||||
println!(
|
||||
"Extracting base backup to create postgres instance: path={} port={}",
|
||||
self.pgdata().display(),
|
||||
self.address.port()
|
||||
);
|
||||
|
||||
let sql = if let Some(lsn) = lsn {
|
||||
format!("basebackup {} {} {}", self.tenantid, self.timelineid, lsn)
|
||||
} else {
|
||||
format!("basebackup {} {}", self.tenantid, self.timelineid)
|
||||
};
|
||||
pub fn do_basebackup(&self) -> Result<()> {
|
||||
let pgdata = self.pgdata();
|
||||
|
||||
let sql = format!("basebackup {} {}", self.tenantid, self.timelineid);
|
||||
let mut client = self
|
||||
.pageserver
|
||||
.page_server_psql_client()
|
||||
@@ -276,32 +248,47 @@ impl PostgresNode {
|
||||
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
tar::Archive::new(copyreader)
|
||||
.unpack(&self.pgdata())
|
||||
.unpack(&pgdata)
|
||||
.with_context(|| "extracting page backup failed")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn create_pgdata(&self) -> Result<()> {
|
||||
fs::create_dir_all(&self.pgdata()).with_context(|| {
|
||||
format!(
|
||||
"could not create data directory {}",
|
||||
self.pgdata().display()
|
||||
)
|
||||
})?;
|
||||
fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700))
|
||||
.with_context(|| {
|
||||
/// Connect to a pageserver, get basebackup, and untar it to initialize a
|
||||
/// new data directory
|
||||
pub fn init_from_page_server(&self, auth_type: AuthType, config_only: bool) -> Result<()> {
|
||||
let pgdata = self.pgdata();
|
||||
|
||||
println!(
|
||||
"Extracting base backup to create postgres instance: path={} port={}",
|
||||
pgdata.display(),
|
||||
self.address.port()
|
||||
);
|
||||
|
||||
// initialize data directory
|
||||
if self.is_test {
|
||||
fs::remove_dir_all(&pgdata).ok();
|
||||
}
|
||||
|
||||
fs::create_dir_all(&pgdata)
|
||||
.with_context(|| format!("could not create data directory {}", pgdata.display()))?;
|
||||
fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).with_context(
|
||||
|| {
|
||||
format!(
|
||||
"could not set permissions in data directory {}",
|
||||
self.pgdata().display()
|
||||
pgdata.display()
|
||||
)
|
||||
})
|
||||
}
|
||||
},
|
||||
)?;
|
||||
|
||||
// Connect to a page server, get base backup, and untar it to initialize a
|
||||
// new data directory
|
||||
fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
|
||||
File::create(self.pgdata().join("postgresql.conf").to_str().unwrap())?;
|
||||
if config_only {
|
||||
//Just create an empty config file
|
||||
File::create(self.pgdata().join("postgresql.conf").to_str().unwrap())?;
|
||||
} else {
|
||||
self.do_basebackup()?;
|
||||
fs::create_dir_all(self.pgdata().join("pg_wal"))?;
|
||||
fs::create_dir_all(self.pgdata().join("pg_wal").join("archive_status"))?;
|
||||
}
|
||||
|
||||
// wal_log_hints is mandatory when running against pageserver (see gh issue#192)
|
||||
// TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
|
||||
@@ -355,40 +342,6 @@ impl PostgresNode {
|
||||
.as_str(),
|
||||
)?;
|
||||
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
self.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
concat!(
|
||||
"synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
|
||||
"zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
|
||||
),
|
||||
self.connstr(),
|
||||
)
|
||||
.as_str(),
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn load_basebackup(&self) -> Result<()> {
|
||||
let lsn = if self.uses_wal_proposer {
|
||||
// LSN WAL_SEGMENT_SIZE means that it is bootstrap and we need to download just
|
||||
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
|
||||
// procedure evolves quite actively right now, so let's think about it again
|
||||
// when things would be more stable (TODO).
|
||||
let lsn = self.sync_walkeepers()?;
|
||||
if lsn == Lsn(pg_constants::WAL_SEGMENT_SIZE as u64) {
|
||||
None
|
||||
} else {
|
||||
Some(lsn)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
self.do_basebackup(lsn)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -455,22 +408,38 @@ impl PostgresNode {
|
||||
}
|
||||
|
||||
// 1. We always start compute node from scratch, so
|
||||
// if old dir exists, preserve 'postgresql.conf' and drop the directory
|
||||
// if old dir exists, preserve config files and drop the directory
|
||||
|
||||
// XXX Now we only use 'postgresql.conf'.
|
||||
// If we will need 'pg_hba.conf', support it here too
|
||||
|
||||
let postgresql_conf_path = self.pgdata().join("postgresql.conf");
|
||||
let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| {
|
||||
let postgresql_conf = fs::read(postgresql_conf_path.clone()).with_context(|| {
|
||||
format!(
|
||||
"failed to read config file in {}",
|
||||
postgresql_conf_path.to_str().unwrap()
|
||||
)
|
||||
})?;
|
||||
|
||||
println!(
|
||||
"Destroying postgres data directory '{}'",
|
||||
self.pgdata().to_str().unwrap()
|
||||
);
|
||||
fs::remove_dir_all(&self.pgdata())?;
|
||||
self.create_pgdata()?;
|
||||
|
||||
// 2. Bring back config files
|
||||
fs::write(&postgresql_conf_path, postgresql_conf)?;
|
||||
// 2. Create new node
|
||||
self.init_from_page_server(self.env.auth_type, false)?;
|
||||
|
||||
// 3. Load basebackup
|
||||
self.load_basebackup()?;
|
||||
// 3. Bring back config files
|
||||
|
||||
if let Ok(mut file) = OpenOptions::new()
|
||||
.append(false)
|
||||
.write(true)
|
||||
.open(&postgresql_conf_path)
|
||||
{
|
||||
file.write_all(&postgresql_conf)?;
|
||||
file.sync_all()?;
|
||||
}
|
||||
|
||||
// 4. Finally start the compute node postgres
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
|
||||
@@ -74,10 +74,7 @@ impl PageServerNode {
|
||||
args.extend(&["--auth-type", "ZenithJWT"]);
|
||||
}
|
||||
|
||||
if let Some(tenantid) = create_tenant {
|
||||
args.extend(&["--create-tenant", tenantid])
|
||||
}
|
||||
|
||||
create_tenant.map(|tenantid| args.extend(&["--create-tenant", tenantid]));
|
||||
let status = cmd
|
||||
.args(args)
|
||||
.env_clear()
|
||||
|
||||
@@ -11,4 +11,3 @@
|
||||
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
|
||||
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
|
||||
- [walkeeper/README](/walkeeper/README.md) — WAL service overview.
|
||||
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
|
||||
|
||||
@@ -1,202 +0,0 @@
|
||||
1. Add t_cid to XLOG record
|
||||
- Why?
|
||||
The cmin/cmax on a heap page is a real bummer. I don't see any other way to fix that than bite the bullet and modify the WAL-logging routine to include the cmin/cmax.
|
||||
|
||||
To recap, the problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares abut it anymore.
|
||||
|
||||
- Alternatives?
|
||||
I don't know
|
||||
|
||||
2. Add PD_WAL_LOGGED.
|
||||
- Why?
|
||||
Postgres sometimes writes data to the page before it is wal-logged. If such page ais swapped out, we will loose this change. The problem is currently solved by setting PD_WAL_LOGGED bit in page header. When page without this bit set is written to the SMGR, then it is forced to be written to the WAL as FPI using log_newpage_copy() function.
|
||||
|
||||
There was wrong assumption that it can happen only during construction of some exotic indexes (like gist). It is not true. The same situation can happen with COPY,VACUUM and when record hint bits are set.
|
||||
|
||||
- Discussion:
|
||||
https://discord.com/channels/869525774699462656/882681420986851359
|
||||
|
||||
- Alternatives:
|
||||
Do not store this flag in page header, but associate this bit with shared buffer. Logically it is more correct but in practice we will get not advantages: neither in space, neither in CPU overhead.
|
||||
|
||||
|
||||
3. XLogReadBufferForRedo not always loads and pins requested buffer. So we need to add extra checks that buffer is really pinned. Also do not use BufferGetBlockNumber for buffer returned by XLogReadBufferForRedo.
|
||||
- Why?
|
||||
XLogReadBufferForRedo is not pinning pages which are not requested by wal-redo. It is specific only for wal-redo Postgres.
|
||||
|
||||
- Alternatives?
|
||||
No
|
||||
|
||||
|
||||
4. Eliminate reporting of some warnings related with hint bits, for example
|
||||
"page is not marked all-visible but visibility map bit is set in relation".
|
||||
- Why?
|
||||
Hint bit may be not WAL logged.
|
||||
|
||||
- Alternative?
|
||||
Always wal log any page changes.
|
||||
|
||||
|
||||
5. Maintain last written LSN.
|
||||
- Why?
|
||||
When compute node requests page from page server, we need to specify LSN. Ideally it should be LSN
|
||||
of WAL record performing last update of this pages. But we do not know it, because we do not have page.
|
||||
We can use current WAL flush position, but in this case there is high probability that page server
|
||||
will be blocked until this peace of WAL is delivered.
|
||||
As better approximation we can keep max LSN of written page. It will be better to take in account LSNs only of evicted pages,
|
||||
but SMGR API doesn't provide such knowledge.
|
||||
|
||||
- Alternatives?
|
||||
Maintain map of LSNs of evicted pages.
|
||||
|
||||
|
||||
6. Launching Postgres without WAL.
|
||||
- Why?
|
||||
According to Zenith architecture compute node is stateless. So when we are launching
|
||||
compute node, we need to provide some dummy PG_DATADIR. Relation pages
|
||||
can be requested on demand from page server. But Postgres still need some non-relational data:
|
||||
control and configuration files, SLRUs,...
|
||||
It is currently implemented using basebackup (do not mix with pg_basebackup) which is created
|
||||
by pageserver. It includes in this tarball config/control files, SLRUs and required directories.
|
||||
As far as pageserver do not have original (non-scattered) WAL segments, it includes in
|
||||
this tarball dummy WAL segment which contains only SHUTDOWN_CHECKPOINT record at the beginning of segment,
|
||||
which redo field points to the end of wal. It allows to load checkpoint record in more or less
|
||||
standard way with minimal changes of Postgres, but then some special handling is needed,
|
||||
including restoring previous record position from zenith.signal file.
|
||||
Also we have to correctly initialize header of last WAL page (pointed by checkpoint.redo)
|
||||
to pass checks performed by XLogReader.
|
||||
|
||||
- Alternatives?
|
||||
We may not include fake WAL segment in tarball at all and modify xlog.c to load checkpoint record
|
||||
in special way. But it may only increase number of changes in xlog.c
|
||||
|
||||
7. Add redo_read_buffer_filter callback to XLogReadBufferForRedoExtended
|
||||
- Why?
|
||||
We need a way in wal-redo Postgres to ignore pages which are not requested by pageserver.
|
||||
So wal-redo Postgres reconstructs only requested page and for all other returns BLK_DONE
|
||||
which means that recovery for them is not needed.
|
||||
|
||||
- Alternatives?
|
||||
No
|
||||
|
||||
8. Enforce WAL logging of sequence updates.
|
||||
- Why?
|
||||
Due to performance reasons Postgres don't want to log each fetching of a value from a sequence,
|
||||
so we pre-log a few fetches in advance. In the event of crash we can lose
|
||||
(skip over) as many values as we pre-logged.
|
||||
But it doesn't work with Zenith because page with sequence value can be evicted from buffer cache
|
||||
and we will get a gap in sequence values even without crash.
|
||||
|
||||
- Alternatives:
|
||||
Do not try to preserve sequential order but avoid performance penalty.
|
||||
|
||||
|
||||
9. Treat unlogged tables as normal (permanent) tables.
|
||||
- Why?
|
||||
Unlogged tables are not transient, so them have to survive node restart (unlike temporary tables).
|
||||
But as far as compute node is stateless, we need to persist their data to storage node.
|
||||
And it can only be done through the WAL.
|
||||
|
||||
- Alternatives?
|
||||
* Store unlogged tables locally (violates requirement of stateless compute nodes).
|
||||
* Prohibit unlogged tables at all.
|
||||
|
||||
|
||||
10. Support start Postgres in wal-redo mode
|
||||
- Why?
|
||||
To be able to apply WAL record and reconstruct pages at page server.
|
||||
|
||||
- Alternatives?
|
||||
* Rewrite redo handlers in Rust
|
||||
* Do not reconstruct pages at page server at all and do it at compute node.
|
||||
|
||||
|
||||
11. WAL proposer
|
||||
- Why?
|
||||
WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes.
|
||||
It is currently implemented as patch to standard WAL sender.
|
||||
|
||||
- Alternatives?
|
||||
Can be moved to extension if some extra callbacks will be added to wal sender code.
|
||||
|
||||
|
||||
12. Secure Computing BPF API wrapper.
|
||||
- Why?
|
||||
Pageserver delegates complex WAL decoding duties to Postgres,
|
||||
which means that the latter might fall victim to carefully designed
|
||||
malicious WAL records and start doing harmful things to the system.
|
||||
To prevent this, it has been decided to limit possible interactions
|
||||
with the outside world using the Secure Computing BPF mode.
|
||||
|
||||
- Alternatives:
|
||||
* Rewrite redo handlers in Rust.
|
||||
* Add more checks to guarantee correctness of WAL records.
|
||||
* Move seccomp.c to extension
|
||||
* Many other discussed approaches to neutralize incorrect WAL records vulnerabilities.
|
||||
|
||||
|
||||
13. Callbacks for replica feedbacks
|
||||
- Why?
|
||||
Allowing waproposer to interact with walsender code.
|
||||
|
||||
- Alternatives
|
||||
Copy walsender code to walproposer.
|
||||
|
||||
|
||||
14. Support multiple SMGR implementations.
|
||||
- Why?
|
||||
Postgres provides abstract API for storage manager but it has only one implementation
|
||||
and provides no way to replace it with custom storage manager.
|
||||
|
||||
- Alternatives?
|
||||
None.
|
||||
|
||||
|
||||
15. Calculate database size as sum of all database relations.
|
||||
- Why?
|
||||
Postgres is calculating database size by traversing data directory
|
||||
but as far as Zenith compute node is stateless we can not do it.
|
||||
|
||||
- Alternatives?
|
||||
Send this request directly to pageserver and calculate real (physical) size
|
||||
of Zenith representation of database/timeline, rather than sum logical size of all relations.
|
||||
|
||||
|
||||
-----------------------------------------------
|
||||
Not currently committed but proposed:
|
||||
|
||||
1. Disable ring buffer buffer manager strategies
|
||||
- Why?
|
||||
Postgres tries to avoid cache flushing by bulk operations (copy, seqscan, vacuum,...).
|
||||
Even if there are free space in buffer cache, pages may be evicted.
|
||||
Negative effect of it can be somehow compensated by file system cache, but in case of Zenith
|
||||
cost of requesting page from page server is much higher.
|
||||
|
||||
- Alternatives?
|
||||
Instead of just prohibiting ring buffer we may try to implement more flexible eviction policy,
|
||||
for example copy evicted page from ring buffer to some other buffer if there is free space
|
||||
in buffer cache.
|
||||
|
||||
2. Disable marking page as dirty when hint bits are set.
|
||||
- Why?
|
||||
Postgres has to modify page twice: first time when some tuple is updated and second time when
|
||||
hint bits are set. Wal logging hint bits updates requires FPI which significantly increase size of WAL.
|
||||
|
||||
- Alternatives?
|
||||
Add special WAL record for setting page hints.
|
||||
|
||||
3. Prefetching
|
||||
- Why?
|
||||
As far as pages in Zenith are loaded on demand, to reduce node startup time
|
||||
and also sppedup some massive queries we need some mechanism for bulk loading to
|
||||
reduce page request round-trip overhead.
|
||||
|
||||
Currently Postgres is supporting prefetching only for bitmap scan.
|
||||
In Zenith we also use prefetch for sequential and index scan. For sequential scan we prefetch
|
||||
some number of following pages. For index scan we prefetch pages of heap relation addressed by TIDs.
|
||||
|
||||
4. Prewarming.
|
||||
- Why?
|
||||
Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith.
|
||||
But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow.
|
||||
We can capture state of compute node buffer cache and send bulk request for this pages at startup.
|
||||
@@ -47,45 +47,28 @@ impl<'a> Basebackup<'a> {
|
||||
timeline: &'a Arc<dyn Timeline>,
|
||||
req_lsn: Option<Lsn>,
|
||||
) -> Basebackup<'a> {
|
||||
// current_prev may be zero if we are at the start of timeline branched from old lsn
|
||||
let RecordLsn {
|
||||
last: current_last,
|
||||
prev: current_prev,
|
||||
} = timeline.get_last_record_rlsn();
|
||||
|
||||
// Compute postgres doesn't have any previous WAL files, but the first record that this
|
||||
// postgres is going to write need to have LSN of previous record (xl_prev). So we are
|
||||
// writing prev_lsn to "zenith.signal" file so that postgres can read it during the start.
|
||||
// In some cases we don't know prev_lsn (branch or basebackup @old_lsn) so pass Lsn(0)
|
||||
// instead and embrace the wrong xl_prev in this situations.
|
||||
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
|
||||
if req_lsn > current_last {
|
||||
// FIXME: now wait_lsn() is inside of list_nonrels() so we don't have a way
|
||||
// to get it from there. It is better to wait just here.
|
||||
(Lsn(0), req_lsn)
|
||||
} else if req_lsn < current_last {
|
||||
// we don't know prev already. We don't currently use basebackup@old_lsn
|
||||
// but may use it for read only replicas in future
|
||||
(Lsn(0), req_lsn)
|
||||
} else {
|
||||
// we are exactly at req_lsn and know prev
|
||||
(current_prev, req_lsn)
|
||||
last: lsn,
|
||||
prev: prev_record_lsn,
|
||||
} = if let Some(lsn) = req_lsn {
|
||||
// FIXME: that wouldn't work since we don't know prev for old LSN's.
|
||||
// Probably it is better to avoid using prev in compute node start
|
||||
// at all and acept the fact that first WAL record in the timeline would
|
||||
// have zero as prev. https://github.com/zenithdb/zenith/issues/506
|
||||
RecordLsn {
|
||||
last: lsn,
|
||||
prev: lsn,
|
||||
}
|
||||
} else {
|
||||
// None in req_lsn means that we are branching from the latest LSN
|
||||
(current_prev, current_last)
|
||||
// Atomically get last and prev LSN's
|
||||
timeline.get_last_record_rlsn()
|
||||
};
|
||||
|
||||
info!(
|
||||
"taking basebackup lsn={}, prev_lsn={}",
|
||||
backup_prev, backup_lsn
|
||||
);
|
||||
|
||||
Basebackup {
|
||||
ar: Builder::new(write),
|
||||
timeline,
|
||||
lsn: backup_lsn,
|
||||
prev_record_lsn: backup_prev,
|
||||
lsn,
|
||||
prev_record_lsn,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,10 +84,10 @@ impl<'a> Basebackup<'a> {
|
||||
for filepath in pg_constants::PGDATA_SPECIAL_FILES.iter() {
|
||||
if *filepath == "pg_hba.conf" {
|
||||
let data = pg_constants::PG_HBA.as_bytes();
|
||||
let header = new_tar_header(filepath, data.len() as u64)?;
|
||||
self.ar.append(&header, data)?;
|
||||
let header = new_tar_header(&filepath, data.len() as u64)?;
|
||||
self.ar.append(&header, &data[..])?;
|
||||
} else {
|
||||
let header = new_tar_header(filepath, 0)?;
|
||||
let header = new_tar_header(&filepath, 0)?;
|
||||
self.ar.append(&header, &mut io::empty())?;
|
||||
}
|
||||
}
|
||||
@@ -183,12 +166,14 @@ impl<'a> Basebackup<'a> {
|
||||
self.lsn,
|
||||
)?;
|
||||
let path = if spcnode == pg_constants::GLOBALTABLESPACE_OID {
|
||||
let dst_path = "PG_VERSION";
|
||||
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
|
||||
let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?;
|
||||
self.ar.append(&header, version_bytes)?;
|
||||
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
|
||||
self.ar.append(&header, &version_bytes[..])?;
|
||||
|
||||
let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?;
|
||||
self.ar.append(&header, version_bytes)?;
|
||||
let dst_path = format!("global/PG_VERSION");
|
||||
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
|
||||
self.ar.append(&header, &version_bytes[..])?;
|
||||
|
||||
String::from("global/pg_filenode.map") // filenode map for global tablespace
|
||||
} else {
|
||||
@@ -203,7 +188,7 @@ impl<'a> Basebackup<'a> {
|
||||
let dst_path = format!("base/{}/PG_VERSION", dbnode);
|
||||
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
|
||||
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
|
||||
self.ar.append(&header, version_bytes)?;
|
||||
self.ar.append(&header, &version_bytes[..])?;
|
||||
|
||||
format!("base/{}/pg_filenode.map", dbnode)
|
||||
};
|
||||
@@ -253,7 +238,7 @@ impl<'a> Basebackup<'a> {
|
||||
XLOG_SIZE_OF_XLOG_LONG_PHD as u32,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
);
|
||||
checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0;
|
||||
checkpoint.redo = self.lsn.0 + self.lsn.calc_padding(8u32);
|
||||
|
||||
//reset some fields we don't want to preserve
|
||||
//TODO Check this.
|
||||
@@ -266,14 +251,9 @@ impl<'a> Basebackup<'a> {
|
||||
pg_control.state = pg_constants::DB_SHUTDOWNED;
|
||||
|
||||
// add zenith.signal file
|
||||
let xl_prev = if self.prev_record_lsn == Lsn(0) {
|
||||
0xBAD0 // magic value to indicate that we don't know prev_lsn
|
||||
} else {
|
||||
self.prev_record_lsn.0
|
||||
};
|
||||
self.ar.append(
|
||||
&new_tar_header("zenith.signal", 8)?,
|
||||
&xl_prev.to_le_bytes()[..],
|
||||
&self.prev_record_lsn.0.to_le_bytes()[..],
|
||||
)?;
|
||||
|
||||
//send pg_control
|
||||
|
||||
@@ -113,7 +113,7 @@ impl CfgFileParams {
|
||||
.auth_type
|
||||
.as_ref()
|
||||
.map_or(Ok(AuthType::Trust), |auth_type| {
|
||||
AuthType::from_str(auth_type)
|
||||
AuthType::from_str(&auth_type)
|
||||
})?;
|
||||
|
||||
if !pg_distrib_dir.join("bin/postgres").exists() {
|
||||
@@ -273,7 +273,7 @@ fn main() -> Result<()> {
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
// Initialize logger
|
||||
let (_scope_guard, log_file) = logger::init_logging(conf, "pageserver.log")?;
|
||||
let (_scope_guard, log_file) = logger::init_logging(&conf, "pageserver.log")?;
|
||||
let _log_guard = slog_stdlog::init()?;
|
||||
|
||||
// Note: this `info!(...)` macro comes from `log` crate
|
||||
@@ -284,7 +284,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
// There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
|
||||
// There should'n be any logging to stdin/stdout. Redirect it to the main log so
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
let stdout = log_file.try_clone().unwrap();
|
||||
let stderr = log_file;
|
||||
|
||||
@@ -43,7 +43,7 @@ pub struct PointInTime {
|
||||
|
||||
pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
|
||||
// Initialize logger
|
||||
let (_scope_guard, _log_file) = logger::init_logging(conf, "pageserver.log")?;
|
||||
let (_scope_guard, _log_file) = logger::init_logging(&conf, "pageserver.log")?;
|
||||
let _log_guard = slog_stdlog::init()?;
|
||||
|
||||
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
|
||||
@@ -264,22 +264,15 @@ pub(crate) fn create_branch(
|
||||
}
|
||||
|
||||
let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?;
|
||||
let timeline = repo.get_timeline(startpoint.timelineid)?;
|
||||
|
||||
if startpoint.lsn == Lsn(0) {
|
||||
// Find end of WAL on the old timeline
|
||||
let end_of_wal = timeline.get_last_record_lsn();
|
||||
let end_of_wal = repo
|
||||
.get_timeline(startpoint.timelineid)?
|
||||
.get_last_record_lsn();
|
||||
info!("branching at end of WAL: {}", end_of_wal);
|
||||
startpoint.lsn = end_of_wal;
|
||||
}
|
||||
startpoint.lsn = startpoint.lsn.align();
|
||||
if timeline.get_start_lsn() > startpoint.lsn {
|
||||
anyhow::bail!(
|
||||
"invalid startpoint {} for the branch {}: less than timeline start {}",
|
||||
startpoint.lsn,
|
||||
branchname,
|
||||
timeline.get_start_lsn()
|
||||
);
|
||||
}
|
||||
|
||||
// create a new timeline directory for it
|
||||
let newtli = create_timeline(conf, Some(startpoint), tenantid)?;
|
||||
@@ -291,7 +284,7 @@ pub(crate) fn create_branch(
|
||||
// FIXME: there's a race condition, if you create a branch with the same
|
||||
// name concurrently.
|
||||
let data = newtli.to_string();
|
||||
fs::write(conf.branch_path(branchname, tenantid), data)?;
|
||||
fs::write(conf.branch_path(&branchname, tenantid), data)?;
|
||||
|
||||
Ok(BranchInfo {
|
||||
name: branchname.to_string(),
|
||||
@@ -340,21 +333,21 @@ fn parse_point_in_time(
|
||||
|
||||
// Check if it's a tag
|
||||
if lsn.is_none() {
|
||||
let tagpath = conf.tag_path(name, tenantid);
|
||||
let tagpath = conf.tag_path(name, &tenantid);
|
||||
if tagpath.exists() {
|
||||
let pointstr = fs::read_to_string(tagpath)?;
|
||||
|
||||
return parse_point_in_time(conf, &pointstr, tenantid);
|
||||
return parse_point_in_time(conf, &pointstr, &tenantid);
|
||||
}
|
||||
}
|
||||
|
||||
// Check if it's a branch
|
||||
// Check if it's branch @ LSN
|
||||
let branchpath = conf.branch_path(name, tenantid);
|
||||
let branchpath = conf.branch_path(name, &tenantid);
|
||||
if branchpath.exists() {
|
||||
let pointstr = fs::read_to_string(branchpath)?;
|
||||
|
||||
let mut result = parse_point_in_time(conf, &pointstr, tenantid)?;
|
||||
let mut result = parse_point_in_time(conf, &pointstr, &tenantid)?;
|
||||
|
||||
result.lsn = lsn.unwrap_or(Lsn(0));
|
||||
return Ok(result);
|
||||
@@ -363,7 +356,7 @@ fn parse_point_in_time(
|
||||
// Check if it's a timelineid
|
||||
// Check if it's timelineid @ LSN
|
||||
if let Ok(timelineid) = ZTimelineId::from_str(name) {
|
||||
let tlipath = conf.timeline_path(&timelineid, tenantid);
|
||||
let tlipath = conf.timeline_path(&timelineid, &tenantid);
|
||||
if tlipath.exists() {
|
||||
return Ok(PointInTime {
|
||||
timelineid,
|
||||
|
||||
@@ -150,24 +150,12 @@ impl Repository for LayeredRepository {
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()> {
|
||||
let src_timeline = self.get_timeline(src)?;
|
||||
|
||||
let RecordLsn {
|
||||
last: src_last,
|
||||
prev: src_prev,
|
||||
} = src_timeline.get_last_record_rlsn();
|
||||
|
||||
// Use src_prev from the source timeline only if we branched at the last record.
|
||||
let dst_prev = if src_last == start_lsn {
|
||||
Some(src_prev)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Create the metadata file, noting the ancestor of the new timeline.
|
||||
// There is initially no data in it, but all the read-calls know to look
|
||||
// into the ancestor.
|
||||
let metadata = TimelineMetadata {
|
||||
disk_consistent_lsn: start_lsn,
|
||||
prev_record_lsn: dst_prev,
|
||||
prev_record_lsn: Some(src_timeline.get_prev_record_lsn()), // FIXME not atomic with start_lsn
|
||||
ancestor_timeline: Some(src),
|
||||
ancestor_lsn: start_lsn,
|
||||
};
|
||||
@@ -258,8 +246,8 @@ impl LayeredRepository {
|
||||
tenantid: ZTenantId,
|
||||
) -> LayeredRepository {
|
||||
LayeredRepository {
|
||||
tenantid,
|
||||
conf,
|
||||
tenantid: tenantid,
|
||||
conf: conf,
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
walredo_mgr,
|
||||
}
|
||||
@@ -595,7 +583,7 @@ impl Timeline for LayeredTimeline {
|
||||
}
|
||||
|
||||
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>> {
|
||||
trace!("list_rels called at {}", lsn);
|
||||
println!("list_rels called at {}", lsn);
|
||||
|
||||
// List all rels in this timeline, and all its ancestors.
|
||||
let mut all_rels = HashSet::new();
|
||||
@@ -794,14 +782,6 @@ impl Timeline for LayeredTimeline {
|
||||
fn get_last_record_rlsn(&self) -> RecordLsn {
|
||||
self.last_record_lsn.load()
|
||||
}
|
||||
|
||||
fn get_start_lsn(&self) -> Lsn {
|
||||
if let Some(ancestor) = self.ancestor_timeline.as_ref() {
|
||||
ancestor.get_start_lsn()
|
||||
} else {
|
||||
self.ancestor_lsn
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LayeredTimeline {
|
||||
@@ -922,7 +902,7 @@ impl LayeredTimeline {
|
||||
|
||||
while lsn < timeline.ancestor_lsn {
|
||||
trace!("going into ancestor {} ", timeline.ancestor_lsn);
|
||||
timeline = timeline.ancestor_timeline.as_ref().unwrap();
|
||||
timeline = &timeline.ancestor_timeline.as_ref().unwrap();
|
||||
}
|
||||
|
||||
// Now we have the right starting timeline for our search.
|
||||
@@ -956,7 +936,7 @@ impl LayeredTimeline {
|
||||
// If not, check if there's a layer on the ancestor timeline
|
||||
if let Some(ancestor) = &timeline.ancestor_timeline {
|
||||
lsn = timeline.ancestor_lsn;
|
||||
timeline = ancestor.as_ref();
|
||||
timeline = &ancestor.as_ref();
|
||||
trace!("recursing into ancestor at {}/{}", timeline.timelineid, lsn);
|
||||
continue;
|
||||
}
|
||||
@@ -1007,7 +987,7 @@ impl LayeredTimeline {
|
||||
if prev_layer.get_timeline_id() != self.timelineid {
|
||||
// First modification on this timeline
|
||||
start_lsn = self.ancestor_lsn;
|
||||
trace!(
|
||||
println!(
|
||||
"creating file for write for {} at branch point {}/{}",
|
||||
seg,
|
||||
self.timelineid,
|
||||
@@ -1015,14 +995,14 @@ impl LayeredTimeline {
|
||||
);
|
||||
} else {
|
||||
start_lsn = prev_layer.get_end_lsn();
|
||||
trace!(
|
||||
println!(
|
||||
"creating file for write for {} after previous layer {}/{}",
|
||||
seg,
|
||||
self.timelineid,
|
||||
start_lsn
|
||||
);
|
||||
}
|
||||
trace!(
|
||||
println!(
|
||||
"prev layer is at {}/{} - {}",
|
||||
prev_layer.get_timeline_id(),
|
||||
prev_layer.get_start_lsn(),
|
||||
@@ -1038,7 +1018,7 @@ impl LayeredTimeline {
|
||||
)?;
|
||||
} else {
|
||||
// New relation.
|
||||
trace!(
|
||||
println!(
|
||||
"creating layer for write for new rel {} at {}/{}",
|
||||
seg,
|
||||
self.timelineid,
|
||||
@@ -1050,6 +1030,7 @@ impl LayeredTimeline {
|
||||
}
|
||||
|
||||
let mut layers = self.layers.lock().unwrap();
|
||||
println!("before layers.insert_open {:?}", layer.filename());
|
||||
let layer_rc: Arc<InMemoryLayer> = Arc::new(layer);
|
||||
layers.insert_open(Arc::clone(&layer_rc));
|
||||
|
||||
@@ -1071,7 +1052,7 @@ impl LayeredTimeline {
|
||||
// FIXME: we can deadlock if we call wait_lsn() from WAL receiver. And we actually
|
||||
// it a lot from there. Only deadlock that I caught was while trying to add wait_lsn()
|
||||
// in list_rels(). But it makes sense to make all functions in timeline non-waiting;
|
||||
// assert that arg_lsn <= current_record_lsn; call wait_lsn explicitly where it is
|
||||
// assert that arg_lsn <= current_record_lsn; call wait_lsn explicetly where it is
|
||||
// needed (page_service and basebackup); uncomment this check:
|
||||
// assert_ne!(thread::current().name(), Some("WAL receiver thread"));
|
||||
|
||||
@@ -1110,7 +1091,7 @@ impl LayeredTimeline {
|
||||
prev: prev_record_lsn,
|
||||
} = self.last_record_lsn.load();
|
||||
|
||||
trace!(
|
||||
println!(
|
||||
"checkpointing timeline {} at {}",
|
||||
self.timelineid,
|
||||
last_record_lsn
|
||||
@@ -1153,7 +1134,7 @@ impl LayeredTimeline {
|
||||
}
|
||||
|
||||
// freeze it
|
||||
let (new_historics, new_open) = oldest_layer.freeze(last_record_lsn, self)?;
|
||||
let (new_historics, new_open) = oldest_layer.freeze(last_record_lsn, &self)?;
|
||||
|
||||
// replace this layer with the new layers that 'freeze' returned
|
||||
layers.pop_oldest_open();
|
||||
@@ -1195,7 +1176,7 @@ impl LayeredTimeline {
|
||||
let ancestor_timelineid = self.ancestor_timeline.as_ref().map(|x| x.timelineid);
|
||||
|
||||
let metadata = TimelineMetadata {
|
||||
disk_consistent_lsn,
|
||||
disk_consistent_lsn: disk_consistent_lsn,
|
||||
prev_record_lsn: ondisk_prev_record_lsn,
|
||||
ancestor_timeline: ancestor_timelineid,
|
||||
ancestor_lsn: self.ancestor_lsn,
|
||||
@@ -1323,14 +1304,18 @@ impl LayeredTimeline {
|
||||
doomed_layer.delete()?;
|
||||
layers.remove_historic(&*doomed_layer);
|
||||
|
||||
match (
|
||||
doomed_layer.is_dropped(),
|
||||
doomed_layer.get_seg_tag().rel.is_relation(),
|
||||
) {
|
||||
(true, true) => result.ondisk_relfiles_dropped += 1,
|
||||
(true, false) => result.ondisk_nonrelfiles_dropped += 1,
|
||||
(false, true) => result.ondisk_relfiles_removed += 1,
|
||||
(false, false) => result.ondisk_nonrelfiles_removed += 1,
|
||||
if doomed_layer.is_dropped() {
|
||||
if doomed_layer.get_seg_tag().rel.is_relation() {
|
||||
result.ondisk_relfiles_dropped += 1;
|
||||
} else {
|
||||
result.ondisk_nonrelfiles_dropped += 1;
|
||||
}
|
||||
} else {
|
||||
if doomed_layer.get_seg_tag().rel.is_relation() {
|
||||
result.ondisk_relfiles_removed += 1;
|
||||
} else {
|
||||
result.ondisk_nonrelfiles_removed += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1446,7 +1431,6 @@ impl LayeredTimeline {
|
||||
trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", data.records.len(), blknum, rel, self.timelineid, request_lsn);
|
||||
}
|
||||
let img = self.walredo_mgr.request_redo(
|
||||
self,
|
||||
rel,
|
||||
blknum,
|
||||
request_lsn,
|
||||
|
||||
@@ -130,23 +130,23 @@ pub struct DeltaLayerInner {
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
return self.timelineid;
|
||||
}
|
||||
|
||||
fn get_seg_tag(&self) -> SegmentTag {
|
||||
self.seg
|
||||
return self.seg;
|
||||
}
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
self.dropped
|
||||
return self.dropped;
|
||||
}
|
||||
|
||||
fn get_start_lsn(&self) -> Lsn {
|
||||
self.start_lsn
|
||||
return self.start_lsn;
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
self.end_lsn
|
||||
return self.end_lsn;
|
||||
}
|
||||
|
||||
fn filename(&self) -> PathBuf {
|
||||
@@ -251,6 +251,8 @@ impl Layer for DeltaLayer {
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
|
||||
// Is the requested LSN after the rel was dropped?
|
||||
if self.dropped && lsn >= self.end_lsn {
|
||||
return Ok(false);
|
||||
@@ -285,8 +287,8 @@ impl Layer for DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for tli {} seg {} {}-{} ----",
|
||||
self.timelineid, self.seg, self.start_lsn, self.end_lsn
|
||||
"----- delta layer for {} {}-{} ----",
|
||||
self.seg, self.start_lsn, self.end_lsn
|
||||
);
|
||||
|
||||
println!("--- relsizes ---");
|
||||
@@ -358,7 +360,6 @@ impl DeltaLayer {
|
||||
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
|
||||
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
|
||||
/// expedient.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
@@ -373,16 +374,16 @@ impl DeltaLayer {
|
||||
) -> Result<DeltaLayer> {
|
||||
let delta_layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
timelineid: timelineid,
|
||||
tenantid: tenantid,
|
||||
seg: seg,
|
||||
start_lsn: start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: true,
|
||||
page_version_metas: BTreeMap::new(),
|
||||
relsizes,
|
||||
relsizes: relsizes,
|
||||
}),
|
||||
predecessor,
|
||||
};
|
||||
|
||||
@@ -111,10 +111,8 @@ impl DeltaFileName {
|
||||
dropped,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for DeltaFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fn to_string(&self) -> String {
|
||||
let basename = match self.seg.rel {
|
||||
RelishTag::Relation(reltag) => format!(
|
||||
"rel_{}_{}_{}_{}",
|
||||
@@ -136,12 +134,11 @@ impl fmt::Display for DeltaFileName {
|
||||
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
|
||||
RelishTag::ControlFile => "pg_control".to_string(),
|
||||
RelishTag::Checkpoint => format!("pg_control_checkpoint"),
|
||||
RelishTag::ControlFile => format!("pg_control"),
|
||||
};
|
||||
|
||||
write!(
|
||||
f,
|
||||
format!(
|
||||
"{}_{}_{:016X}_{:016X}{}",
|
||||
basename,
|
||||
self.seg.segno,
|
||||
@@ -152,6 +149,12 @@ impl fmt::Display for DeltaFileName {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for DeltaFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", self.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
pub struct ImageFileName {
|
||||
pub seg: SegmentTag,
|
||||
@@ -230,10 +233,8 @@ impl ImageFileName {
|
||||
|
||||
Some(ImageFileName { seg, lsn })
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ImageFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fn to_string(&self) -> String {
|
||||
let basename = match self.seg.rel {
|
||||
RelishTag::Relation(reltag) => format!(
|
||||
"rel_{}_{}_{}_{}",
|
||||
@@ -255,12 +256,11 @@ impl fmt::Display for ImageFileName {
|
||||
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
|
||||
RelishTag::ControlFile => "pg_control".to_string(),
|
||||
RelishTag::Checkpoint => format!("pg_control_checkpoint"),
|
||||
RelishTag::ControlFile => format!("pg_control"),
|
||||
};
|
||||
|
||||
write!(
|
||||
f,
|
||||
format!(
|
||||
"{}_{}_{:016X}",
|
||||
basename,
|
||||
self.seg.segno,
|
||||
@@ -269,6 +269,12 @@ impl fmt::Display for ImageFileName {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ImageFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", self.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
/// Scan timeline directory and create ImageFileName and DeltaFilename
|
||||
/// structs representing all files on disk
|
||||
///
|
||||
@@ -296,7 +302,7 @@ pub fn list_files(
|
||||
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||
}
|
||||
}
|
||||
Ok((imgfiles, deltafiles))
|
||||
return Ok((imgfiles, deltafiles));
|
||||
}
|
||||
|
||||
/// Helper enum to hold a PageServerConf, or a path
|
||||
|
||||
@@ -98,23 +98,23 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
return self.timelineid;
|
||||
}
|
||||
|
||||
fn get_seg_tag(&self) -> SegmentTag {
|
||||
self.seg
|
||||
return self.seg;
|
||||
}
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
false
|
||||
return false;
|
||||
}
|
||||
|
||||
fn get_start_lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
return self.lsn;
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
return self.lsn;
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
@@ -157,7 +157,9 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
/// Get size of the segment
|
||||
fn get_seg_size(&self, _lsn: Lsn) -> Result<u32> {
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
assert!(lsn >= self.lsn);
|
||||
|
||||
let inner = self.load()?;
|
||||
match inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => Ok(num_blocks),
|
||||
@@ -166,7 +168,8 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, _lsn: Lsn) -> Result<bool> {
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
assert!(lsn >= self.lsn);
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
@@ -193,10 +196,7 @@ impl Layer for ImageLayer {
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for tli {} seg {} at {} ----",
|
||||
self.timelineid, self.seg, self.lsn
|
||||
);
|
||||
println!("----- image layer for {} at {} ----", self.seg, self.lsn);
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
@@ -259,10 +259,10 @@ impl ImageLayer {
|
||||
|
||||
let layer = ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
lsn,
|
||||
timelineid: timelineid,
|
||||
tenantid: tenantid,
|
||||
seg: seg,
|
||||
lsn: lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
loaded: true,
|
||||
image_type: image_type.clone(),
|
||||
|
||||
@@ -7,7 +7,6 @@ use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE,
|
||||
};
|
||||
use crate::layered_repository::LayeredTimeline;
|
||||
use crate::layered_repository::ZERO_PAGE;
|
||||
use crate::layered_repository::{DeltaLayer, ImageLayer};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::PageServerConf;
|
||||
@@ -15,7 +14,6 @@ use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::PathBuf;
|
||||
@@ -95,8 +93,8 @@ impl Layer for InMemoryLayer {
|
||||
let delta_filename = DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
end_lsn: end_lsn,
|
||||
dropped: dropped,
|
||||
}
|
||||
.to_string();
|
||||
|
||||
@@ -104,15 +102,15 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
return self.timelineid;
|
||||
}
|
||||
|
||||
fn get_seg_tag(&self) -> SegmentTag {
|
||||
self.seg
|
||||
return self.seg;
|
||||
}
|
||||
|
||||
fn get_start_lsn(&self) -> Lsn {
|
||||
self.start_lsn
|
||||
return self.start_lsn;
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
@@ -200,6 +198,8 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
|
||||
let inner = self.inner.lock().unwrap();
|
||||
|
||||
// Is the requested LSN after the segment was dropped?
|
||||
@@ -246,27 +246,16 @@ impl Layer for InMemoryLayer {
|
||||
);
|
||||
|
||||
for (k, v) in inner.segsizes.iter() {
|
||||
println!("segsizes {}: {}", k, v);
|
||||
}
|
||||
|
||||
for (k, v) in inner.page_versions.iter() {
|
||||
println!(
|
||||
"blk {} at {}: {}/{}\n",
|
||||
k.0,
|
||||
k.1,
|
||||
v.page_image.is_some(),
|
||||
v.record.is_some()
|
||||
);
|
||||
println!("{}: {}", k, v);
|
||||
}
|
||||
//for (k, v) in inner.page_versions.iter() {
|
||||
// println!("blk {} at {}: {}/{}", k.0, k.1, v.page_image.is_some(), v.record.is_some());
|
||||
//}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// Type alias to simplify InMemoryLayer::freeze signature
|
||||
//
|
||||
type SuccessorLayers = (Vec<Arc<dyn Layer>>, Option<Arc<InMemoryLayer>>);
|
||||
|
||||
impl InMemoryLayer {
|
||||
/// Return the oldest page version that's stored in this layer
|
||||
pub fn get_oldest_pending_lsn(&self) -> Lsn {
|
||||
@@ -284,7 +273,7 @@ impl InMemoryLayer {
|
||||
start_lsn: Lsn,
|
||||
oldest_pending_lsn: Lsn,
|
||||
) -> Result<InMemoryLayer> {
|
||||
trace!(
|
||||
println!(
|
||||
"initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
|
||||
seg,
|
||||
timelineid,
|
||||
@@ -338,7 +327,7 @@ impl InMemoryLayer {
|
||||
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<()> {
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
trace!(
|
||||
println!(
|
||||
"put_page_version blk {} of {} at {}/{}",
|
||||
blknum,
|
||||
self.seg.rel,
|
||||
@@ -365,43 +354,13 @@ impl InMemoryLayer {
|
||||
// which we've just acquired above
|
||||
let oldsize = inner.get_seg_size(lsn);
|
||||
if newsize > oldsize {
|
||||
trace!(
|
||||
println!(
|
||||
"enlarging segment {} from {} to {} blocks at {}",
|
||||
self.seg,
|
||||
oldsize,
|
||||
newsize,
|
||||
lsn
|
||||
);
|
||||
|
||||
// If we are extending the relation by more than one page, initialize the "gap"
|
||||
// with zeros
|
||||
//
|
||||
// XXX: What if the caller initializes the gap with subsequent call with same LSN?
|
||||
// I don't think that can happen currently, but that is highly dependent on how
|
||||
// PostgreSQL writes its WAL records and there's no guarantee of it. If it does
|
||||
// happen, we would hit the "page version already exists" warning above on the
|
||||
// subsequent call to initialize the gap page.
|
||||
let gapstart = self.seg.segno * RELISH_SEG_SIZE + oldsize;
|
||||
for gapblknum in gapstart..blknum {
|
||||
let zeropv = PageVersion {
|
||||
page_image: Some(ZERO_PAGE.clone()),
|
||||
record: None,
|
||||
};
|
||||
println!(
|
||||
"filling gap blk {} with zeros for write of {}",
|
||||
gapblknum, blknum
|
||||
);
|
||||
let old = inner.page_versions.insert((gapblknum, lsn), zeropv);
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
|
||||
if old.is_some() {
|
||||
warn!(
|
||||
"Page version of rel {} blk {} at {} already exists",
|
||||
self.seg.rel, blknum, lsn
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
inner.segsizes.insert(lsn, newsize);
|
||||
}
|
||||
}
|
||||
@@ -429,7 +388,35 @@ impl InMemoryLayer {
|
||||
assert!(inner.drop_lsn.is_none());
|
||||
inner.drop_lsn = Some(lsn);
|
||||
|
||||
info!("dropped segment {} at {}", self.seg, lsn);
|
||||
println!("dropped segment {} at {}", self.seg, lsn);
|
||||
|
||||
let end_str = inner
|
||||
.drop_lsn
|
||||
.as_ref()
|
||||
.map(|drop_lsn| drop_lsn.to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
{
|
||||
let mut result = format!(
|
||||
"----- inmemory layer tli {} for {} {}-{} ----\n",
|
||||
self.timelineid, self.seg, self.start_lsn, end_str
|
||||
);
|
||||
|
||||
for (k, v) in inner.segsizes.iter() {
|
||||
result += &format!("{}: {}\n", k, v);
|
||||
}
|
||||
for (k, v) in inner.page_versions.iter() {
|
||||
result += &format!(
|
||||
"blk {} at {}: {}/{}\n",
|
||||
k.0,
|
||||
k.1,
|
||||
v.page_image.is_some(),
|
||||
v.record.is_some()
|
||||
);
|
||||
}
|
||||
|
||||
println!("{}", result);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -448,7 +435,7 @@ impl InMemoryLayer {
|
||||
) -> Result<InMemoryLayer> {
|
||||
let seg = src.get_seg_tag();
|
||||
|
||||
trace!(
|
||||
println!(
|
||||
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
|
||||
seg,
|
||||
timelineid,
|
||||
@@ -472,7 +459,7 @@ impl InMemoryLayer {
|
||||
inner: Mutex::new(InMemoryLayerInner {
|
||||
drop_lsn: None,
|
||||
page_versions: BTreeMap::new(),
|
||||
segsizes,
|
||||
segsizes: segsizes,
|
||||
}),
|
||||
predecessor: Some(src),
|
||||
})
|
||||
@@ -497,8 +484,8 @@ impl InMemoryLayer {
|
||||
cutoff_lsn: Lsn,
|
||||
// This is needed just to call materialize_page()
|
||||
timeline: &LayeredTimeline,
|
||||
) -> Result<SuccessorLayers> {
|
||||
info!(
|
||||
) -> Result<(Vec<Arc<dyn Layer>>, Option<Arc<InMemoryLayer>>)> {
|
||||
println!(
|
||||
"freezing in memory layer for {} on timeline {} at {}",
|
||||
self.seg, self.timelineid, cutoff_lsn
|
||||
);
|
||||
@@ -537,17 +524,13 @@ impl InMemoryLayer {
|
||||
before_page_versions = BTreeMap::new();
|
||||
after_page_versions = BTreeMap::new();
|
||||
for ((blknum, lsn), pv) in inner.page_versions.iter() {
|
||||
match lsn.cmp(&end_lsn) {
|
||||
Ordering::Less => {
|
||||
before_page_versions.insert((*blknum, *lsn), pv.clone());
|
||||
}
|
||||
Ordering::Equal => {
|
||||
// Page versions at the cutoff LSN will be stored in the
|
||||
// materialized image layer.
|
||||
}
|
||||
Ordering::Greater => {
|
||||
after_page_versions.insert((*blknum, *lsn), pv.clone());
|
||||
}
|
||||
if *lsn == end_lsn {
|
||||
// Page versions at the cutoff LSN will be stored in the
|
||||
// materialized image layer.
|
||||
} else if *lsn > end_lsn {
|
||||
after_page_versions.insert((*blknum, *lsn), pv.clone());
|
||||
} else {
|
||||
before_page_versions.insert((*blknum, *lsn), pv.clone());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -578,7 +561,7 @@ impl InMemoryLayer {
|
||||
)?;
|
||||
let delta_layer_rc: Arc<dyn Layer> = Arc::new(delta_layer);
|
||||
frozen_layers.push(delta_layer_rc);
|
||||
trace!(
|
||||
println!(
|
||||
"freeze: created delta layer {} {}-{}",
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
@@ -594,7 +577,7 @@ impl InMemoryLayer {
|
||||
let imgfile = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?;
|
||||
let imgfile_rc: Arc<dyn Layer> = Arc::new(imgfile);
|
||||
frozen_layers.push(Arc::clone(&imgfile_rc));
|
||||
trace!("freeze: created image layer {} at {}", self.seg, end_lsn);
|
||||
println!("freeze: created image layer {} at {}", self.seg, end_lsn);
|
||||
|
||||
// If there were any page versions newer than the cutoff, initialize a new in-memory
|
||||
// layer to hold them
|
||||
@@ -611,7 +594,7 @@ impl InMemoryLayer {
|
||||
new_inner.page_versions.append(&mut after_page_versions);
|
||||
new_inner.segsizes.append(&mut after_segsizes);
|
||||
drop(new_inner);
|
||||
trace!("freeze: created new in-mem layer {} {}-", self.seg, end_lsn);
|
||||
println!("freeze: created new in-mem layer {} {}-", self.seg, end_lsn);
|
||||
|
||||
new_open_rc = Some(Arc::new(new_open))
|
||||
}
|
||||
@@ -619,4 +602,41 @@ impl InMemoryLayer {
|
||||
|
||||
Ok((frozen_layers, new_open_rc))
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
#[allow(unused)]
|
||||
pub fn dump(&self) -> String {
|
||||
|
||||
|
||||
let inner = self.inner.lock().unwrap();
|
||||
|
||||
let end_str = inner
|
||||
.drop_lsn
|
||||
.as_ref()
|
||||
.map(|drop_lsn| drop_lsn.to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
let mut result = format!(
|
||||
"----- dump inmemory layer for tli {} seg {} {}-{} ----",
|
||||
self.timelineid, self.seg, self.start_lsn, end_str
|
||||
);
|
||||
|
||||
|
||||
for (k, v) in inner.segsizes.iter() {
|
||||
result += &format!("{}: {}\n", k, v);
|
||||
}
|
||||
for (k, v) in inner.page_versions.iter() {
|
||||
result += &format!(
|
||||
"blk {} at {}: {}/{}\n",
|
||||
k.0,
|
||||
k.1,
|
||||
v.page_image.is_some(),
|
||||
v.record.is_some()
|
||||
);
|
||||
}
|
||||
|
||||
println!("inner println {}", result);
|
||||
result
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,7 +109,7 @@ impl LayerMap {
|
||||
|
||||
if let Some(open) = &segentry.open {
|
||||
if open.get_start_lsn() <= lsn {
|
||||
let x: Arc<dyn Layer> = Arc::clone(open) as _;
|
||||
let x: Arc<dyn Layer> = Arc::clone(&open) as _;
|
||||
return Some(x);
|
||||
}
|
||||
}
|
||||
@@ -119,7 +119,7 @@ impl LayerMap {
|
||||
.range((Included(Lsn(0)), Included(lsn)))
|
||||
.next_back()
|
||||
{
|
||||
let x: Arc<dyn Layer> = Arc::clone(v) as _;
|
||||
let x: Arc<dyn Layer> = Arc::clone(&v) as _;
|
||||
Some(x)
|
||||
} else {
|
||||
None
|
||||
@@ -132,7 +132,12 @@ impl LayerMap {
|
||||
///
|
||||
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
|
||||
let segentry = self.segs.get(tag)?;
|
||||
segentry.open.as_ref().map(Arc::clone)
|
||||
|
||||
if let Some(open) = &segentry.open {
|
||||
Some(Arc::clone(open))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -144,6 +149,7 @@ impl LayerMap {
|
||||
if let Some(segentry) = self.segs.get_mut(&tag) {
|
||||
if let Some(_old) = &segentry.open {
|
||||
// FIXME: shouldn't exist, but check
|
||||
println!("insert_open() // FIXME: shouldn't exist, but check");
|
||||
}
|
||||
segentry.open = Some(Arc::clone(&layer));
|
||||
} else {
|
||||
@@ -156,7 +162,7 @@ impl LayerMap {
|
||||
|
||||
let opensegentry = OpenSegEntry {
|
||||
oldest_pending_lsn: layer.get_oldest_pending_lsn(),
|
||||
layer,
|
||||
layer: layer,
|
||||
generation: self.current_generation,
|
||||
};
|
||||
self.open_segs.push(opensegentry);
|
||||
@@ -215,6 +221,8 @@ impl LayerMap {
|
||||
pub fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>> {
|
||||
let mut rels: HashSet<RelTag> = HashSet::new();
|
||||
|
||||
println!("layer list_rels()");
|
||||
|
||||
for (seg, segentry) in self.segs.iter() {
|
||||
if let RelishTag::Relation(reltag) = seg.rel {
|
||||
if (spcnode == 0 || reltag.spcnode == spcnode)
|
||||
@@ -222,15 +230,33 @@ impl LayerMap {
|
||||
{
|
||||
// Add only if it exists at the requested LSN.
|
||||
if let Some(open) = &segentry.open {
|
||||
if open.get_end_lsn() > lsn {
|
||||
println!("explore segentry.open {:?}", open.filename());
|
||||
|
||||
if open.get_end_lsn() > lsn && open.get_start_lsn() <= lsn {
|
||||
rels.insert(reltag);
|
||||
println!("found rel {} at lsn {} in open layer start_lsn {} end lsn {}",
|
||||
reltag, lsn, open.get_start_lsn(), open.get_end_lsn());
|
||||
println!("{}", open.dump());
|
||||
|
||||
}
|
||||
} else if let Some((_k, _v)) = segentry
|
||||
else
|
||||
{
|
||||
println!("found DROPPED rel {} at lsn {} in open layer start_lsn {} end lsn {}",
|
||||
reltag, lsn, open.get_start_lsn(), open.get_end_lsn());
|
||||
println!("{}", open.dump());
|
||||
}
|
||||
} else if let Some((l_start_lsn, layer)) = segentry
|
||||
.historic
|
||||
.range((Included(Lsn(0)), Included(lsn)))
|
||||
.next_back()
|
||||
{
|
||||
rels.insert(reltag);
|
||||
println!("explore historic segment {:?}", layer.filename());
|
||||
|
||||
if !layer.is_dropped() && l_start_lsn <= &lsn {
|
||||
rels.insert(reltag);
|
||||
println!("found rel {} in historic layer", reltag);
|
||||
layer.dump()?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -248,15 +274,17 @@ impl LayerMap {
|
||||
} else {
|
||||
// Add only if it exists at the requested LSN.
|
||||
if let Some(open) = &segentry.open {
|
||||
if open.get_end_lsn() > lsn {
|
||||
if open.get_end_lsn() > lsn && open.get_start_lsn() <= lsn {
|
||||
rels.insert(seg.rel);
|
||||
}
|
||||
} else if let Some((_k, _v)) = segentry
|
||||
} else if let Some((l_start_lsn, layer)) = segentry
|
||||
.historic
|
||||
.range((Included(Lsn(0)), Included(lsn)))
|
||||
.next_back()
|
||||
{
|
||||
rels.insert(seg.rel);
|
||||
if !layer.is_dropped() && l_start_lsn <= &lsn {
|
||||
rels.insert(seg.rel);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -321,23 +349,6 @@ impl LayerMap {
|
||||
iter: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
#[allow(unused)]
|
||||
pub fn dump(&self) -> Result<()> {
|
||||
println!("Begin dump LayerMap");
|
||||
for (seg, segentry) in self.segs.iter() {
|
||||
if let Some(open) = &segentry.open {
|
||||
open.dump()?;
|
||||
}
|
||||
|
||||
for (_, layer) in segentry.historic.iter() {
|
||||
layer.dump()?;
|
||||
}
|
||||
}
|
||||
println!("End dump LayerMap");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LayerMap {
|
||||
|
||||
@@ -24,7 +24,7 @@ pub fn init_logging(
|
||||
if record.level().is_at_least(slog::Level::Info) {
|
||||
return true;
|
||||
}
|
||||
false
|
||||
true
|
||||
});
|
||||
let drain = std::sync::Mutex::new(drain).fuse();
|
||||
let logger = slog::Logger::root(
|
||||
|
||||
@@ -372,7 +372,7 @@ impl PageServerHandler {
|
||||
.claims
|
||||
.as_ref()
|
||||
.expect("claims presence already checked");
|
||||
auth::check_permission(claims, tenantid)
|
||||
Ok(auth::check_permission(claims, tenantid)?)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -389,7 +389,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
.as_ref()
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.decode(str::from_utf8(jwt_response)?)?;
|
||||
.decode(&str::from_utf8(jwt_response)?)?;
|
||||
|
||||
if matches!(data.claims.scope, Scope::Tenant) {
|
||||
ensure!(
|
||||
@@ -425,7 +425,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
self.handle_controlfile(pgb)?;
|
||||
} else if query_string.starts_with("pagestream ") {
|
||||
let (_, params_raw) = query_string.split_at("pagestream ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
let params = params_raw.split(" ").collect::<Vec<_>>();
|
||||
ensure!(
|
||||
params.len() == 2,
|
||||
"invalid param number for pagestream command"
|
||||
@@ -484,7 +484,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
.get_timeline(timelineid)
|
||||
.context(format!("error fetching timeline {}", timelineid))?;
|
||||
|
||||
walreceiver::launch_wal_receiver(self.conf, timelineid, &connstr, tenantid.to_owned());
|
||||
walreceiver::launch_wal_receiver(&self.conf, timelineid, &connstr, tenantid.to_owned());
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("branch_create ") {
|
||||
@@ -492,10 +492,10 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
// branch_create <tenantid> <branchname> <startpoint>
|
||||
// TODO lazy static
|
||||
// TODO: escaping, to allow branch names with spaces
|
||||
// TOOD: escaping, to allow branch names with spaces
|
||||
let re = Regex::new(r"^branch_create ([[:xdigit:]]+) (\S+) ([^\r\n\s;]+)[\r\n\s;]*;?$")
|
||||
.unwrap();
|
||||
let caps = re.captures(query_string).ok_or_else(err)?;
|
||||
let caps = re.captures(&query_string).ok_or_else(err)?;
|
||||
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let branchname = caps.get(2).ok_or_else(err)?.as_str().to_owned();
|
||||
@@ -504,7 +504,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
self.check_permission(Some(tenantid))?;
|
||||
|
||||
let branch =
|
||||
branches::create_branch(self.conf, &branchname, &startpoint_str, &tenantid)?;
|
||||
branches::create_branch(&self.conf, &branchname, &startpoint_str, &tenantid)?;
|
||||
let branch = serde_json::to_vec(&branch)?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
@@ -519,14 +519,14 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
|
||||
let branches = crate::branches::get_branches(self.conf, &tenantid)?;
|
||||
let branches = crate::branches::get_branches(&self.conf, &tenantid)?;
|
||||
let branches_buf = serde_json::to_vec(&branches)?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(&branches_buf)]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("tenant_list") {
|
||||
let tenants = crate::branches::get_tenants(self.conf)?;
|
||||
let tenants = crate::branches::get_tenants(&self.conf)?;
|
||||
let tenants_buf = serde_json::to_vec(&tenants)?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
@@ -537,13 +537,13 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
// tenant_create <tenantid>
|
||||
let re = Regex::new(r"^tenant_create ([[:xdigit:]]+)$").unwrap();
|
||||
let caps = re.captures(query_string).ok_or_else(err)?;
|
||||
let caps = re.captures(&query_string).ok_or_else(err)?;
|
||||
|
||||
self.check_permission(None)?;
|
||||
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
|
||||
tenant_mgr::create_repository_for_tenant(self.conf, tenantid)?;
|
||||
tenant_mgr::create_repository_for_tenant(&self.conf, tenantid)?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
@@ -597,39 +597,39 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
RowDescriptor::int8_col(b"elapsed"),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(result.ondisk_relfiles_total.to_string().as_bytes()),
|
||||
Some(&result.ondisk_relfiles_total.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
&result
|
||||
.ondisk_relfiles_needed_by_cutoff
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(
|
||||
result
|
||||
&result
|
||||
.ondisk_relfiles_needed_by_branches
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
|
||||
Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
|
||||
Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
|
||||
Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
|
||||
Some(&result.ondisk_relfiles_not_updated.to_string().as_bytes()),
|
||||
Some(&result.ondisk_relfiles_removed.to_string().as_bytes()),
|
||||
Some(&result.ondisk_relfiles_dropped.to_string().as_bytes()),
|
||||
Some(&result.ondisk_nonrelfiles_total.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
&result
|
||||
.ondisk_nonrelfiles_needed_by_cutoff
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(
|
||||
result
|
||||
&result
|
||||
.ondisk_nonrelfiles_needed_by_branches
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
|
||||
Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
|
||||
Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
|
||||
Some(result.elapsed.as_millis().to_string().as_bytes()),
|
||||
Some(&result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
|
||||
Some(&result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
|
||||
Some(&result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
|
||||
Some(&result.elapsed.as_millis().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else {
|
||||
|
||||
@@ -125,7 +125,11 @@ impl RelishTag {
|
||||
|
||||
// convenience function to check if this relish is a normal relation.
|
||||
pub const fn is_relation(&self) -> bool {
|
||||
matches!(self, RelishTag::Relation(_))
|
||||
if let RelishTag::Relation(_) = self {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -113,7 +113,7 @@ pub trait Timeline: Send + Sync {
|
||||
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>>;
|
||||
|
||||
/// Get a list of non-relational objects
|
||||
fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
|
||||
fn list_nonrels<'a>(&'a self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Public PUT functions, to update the repository with new page versions.
|
||||
@@ -146,7 +146,6 @@ pub trait Timeline: Send + Sync {
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
fn get_last_record_lsn(&self) -> Lsn;
|
||||
fn get_prev_record_lsn(&self) -> Lsn;
|
||||
fn get_start_lsn(&self) -> Lsn;
|
||||
|
||||
///
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
@@ -201,7 +200,6 @@ impl WALRecord {
|
||||
///
|
||||
/// Tests that should work the same with any Repository/Timeline implementation.
|
||||
///
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -230,6 +228,8 @@ mod tests {
|
||||
forknum: 0,
|
||||
});
|
||||
|
||||
const TESTDB: u32 = 111;
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
fn TEST_IMG(s: &str) -> Bytes {
|
||||
@@ -358,37 +358,6 @@ mod tests {
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
// Truncate to zero length
|
||||
tline.put_truncation(TESTREL_A, Lsn(0x60), 0)?;
|
||||
tline.advance_last_record_lsn(Lsn(0x60));
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 0);
|
||||
|
||||
// Extend from 0 to 2 blocks, leaving a gap
|
||||
tline.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
|
||||
tline.advance_last_record_lsn(Lsn(0x70));
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x70))?,
|
||||
TEST_IMG("foo blk 1")
|
||||
);
|
||||
|
||||
// Extend a lot more, leaving a big gap that spans across segments
|
||||
// FIXME: This is currently broken, see https://github.com/zenithdb/zenith/issues/500
|
||||
/*
|
||||
tline.put_page_image(TESTREL_A, 1500, Lsn(0x80), TEST_IMG("foo blk 1500"))?;
|
||||
tline.advance_last_record_lsn(Lsn(0x80));
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x80))?.unwrap(), 1501);
|
||||
for blk in 2..1500 {
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TESTREL_A, blk, Lsn(0x80))?,
|
||||
ZERO_PAGE);
|
||||
}
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TESTREL_A, 1500, Lsn(0x80))?,
|
||||
TEST_IMG("foo blk 1500"));
|
||||
*/
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -449,6 +418,53 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Test list_rels() function, with branches and unlinking
|
||||
///
|
||||
#[test]
|
||||
fn test_list_rels_and_unlink() -> Result<()> {
|
||||
let repo = get_test_repo("test_unlink")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0x00))?;
|
||||
|
||||
// Import initial dummy checkpoint record, otherwise the get_timeline() call
|
||||
// after branching fails below
|
||||
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_PAGE.clone())?;
|
||||
|
||||
// Create a relation on the timeline
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
|
||||
// Check that list_rels() lists it after LSN 2, but no before it.
|
||||
let reltag = match TESTREL_A {
|
||||
RelishTag::Relation(reltag) => reltag,
|
||||
_ => panic!("unexpected relish")
|
||||
};
|
||||
assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&reltag));
|
||||
assert!(tline.list_rels(0, TESTDB, Lsn(0x20))?.contains(&reltag));
|
||||
assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&reltag));
|
||||
|
||||
// Create a branch, check that the relation is visible there
|
||||
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
|
||||
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(newtimelineid)?;
|
||||
|
||||
assert!(newtline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&reltag));
|
||||
|
||||
// Unlink it on the branch
|
||||
newtline.drop_relish(TESTREL_A, Lsn(0x40))?;
|
||||
|
||||
// Check that it's no longer listed on the branch after the point where it was unlinked
|
||||
assert!(newtline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&reltag));
|
||||
assert!(!newtline.list_rels(0, TESTDB, Lsn(0x40))?.contains(&reltag));
|
||||
|
||||
// Run checkpoint and garbage collection and check that it's still not visible
|
||||
newtline.checkpoint()?;
|
||||
repo.gc_iteration(Some(newtimelineid), 0, true)?;
|
||||
assert!(!newtline.list_rels(0, TESTDB, Lsn(0x40))?.contains(&reltag));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Test branch creation
|
||||
///
|
||||
@@ -507,7 +523,6 @@ mod tests {
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
|
||||
@@ -29,7 +29,7 @@ use zenith_utils::lsn::Lsn;
|
||||
const MAX_MBR_BLKNO: u32 =
|
||||
pg_constants::MAX_MULTIXACT_ID / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
const ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
///
|
||||
/// Import all relation data pages from local disk into the repository.
|
||||
@@ -128,7 +128,7 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_twophase"))? {
|
||||
let entry = entry?;
|
||||
let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
|
||||
let xid = u32::from_str_radix(&entry.path().to_str().unwrap(), 16)?;
|
||||
import_nonrel_file(timeline, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
|
||||
}
|
||||
// TODO: Scan pg_tblspc
|
||||
@@ -427,7 +427,7 @@ pub fn save_decoded_record(
|
||||
},
|
||||
rpageno,
|
||||
lsn,
|
||||
ZERO_PAGE.clone(),
|
||||
ZERO_PAGE,
|
||||
)?;
|
||||
} else {
|
||||
assert!(info == pg_constants::CLOG_TRUNCATE);
|
||||
@@ -484,7 +484,7 @@ pub fn save_decoded_record(
|
||||
},
|
||||
rpageno,
|
||||
lsn,
|
||||
ZERO_PAGE.clone(),
|
||||
ZERO_PAGE,
|
||||
)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
|
||||
let pageno = buf.get_u32_le();
|
||||
@@ -497,7 +497,7 @@ pub fn save_decoded_record(
|
||||
},
|
||||
rpageno,
|
||||
lsn,
|
||||
ZERO_PAGE.clone(),
|
||||
ZERO_PAGE,
|
||||
)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
@@ -595,16 +595,19 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
|
||||
// TODO This implementation is very inefficient -
|
||||
// it scans all non-rels only to find FileNodeMaps
|
||||
for tag in timeline.list_nonrels(req_lsn)? {
|
||||
if let RelishTag::FileNodeMap { spcnode, dbnode } = tag {
|
||||
if spcnode == src_tablespace_id && dbnode == src_db_id {
|
||||
let img = timeline.get_page_at_lsn_nowait(tag, 0, req_lsn)?;
|
||||
let new_tag = RelishTag::FileNodeMap {
|
||||
spcnode: tablespace_id,
|
||||
dbnode: db_id,
|
||||
};
|
||||
timeline.put_page_image(new_tag, 0, lsn, img)?;
|
||||
break;
|
||||
match tag {
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
if spcnode == src_tablespace_id && dbnode == src_db_id {
|
||||
let img = timeline.get_page_at_lsn_nowait(tag, 0, req_lsn)?;
|
||||
let new_tag = RelishTag::FileNodeMap {
|
||||
spcnode: tablespace_id,
|
||||
dbnode: db_id,
|
||||
};
|
||||
timeline.put_page_image(new_tag, 0, lsn, img)?;
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ => {} // do nothing
|
||||
}
|
||||
}
|
||||
info!(
|
||||
@@ -780,14 +783,17 @@ fn save_clog_truncate_record(
|
||||
// instead.
|
||||
let req_lsn = min(timeline.get_last_record_lsn(), lsn);
|
||||
for obj in timeline.list_nonrels(req_lsn)? {
|
||||
if let RelishTag::Slru { slru, segno } = obj {
|
||||
if slru == SlruKind::Clog {
|
||||
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
|
||||
timeline.drop_relish(RelishTag::Slru { slru, segno }, lsn)?;
|
||||
trace!("Drop CLOG segment {:>04X} at lsn {}", segno, lsn);
|
||||
match obj {
|
||||
RelishTag::Slru { slru, segno } => {
|
||||
if slru == SlruKind::Clog {
|
||||
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
|
||||
timeline.drop_relish(RelishTag::Slru { slru, segno }, lsn)?;
|
||||
trace!("Drop CLOG segment {:>04X} at lsn {}", segno, lsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -197,7 +197,6 @@ impl WalStreamDecoder {
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Default)]
|
||||
pub struct DecodedBkpBlock {
|
||||
/* Is this block ref in use? */
|
||||
//in_use: bool,
|
||||
@@ -230,7 +229,25 @@ pub struct DecodedBkpBlock {
|
||||
|
||||
impl DecodedBkpBlock {
|
||||
pub fn new() -> DecodedBkpBlock {
|
||||
Default::default()
|
||||
DecodedBkpBlock {
|
||||
rnode_spcnode: 0,
|
||||
rnode_dbnode: 0,
|
||||
rnode_relnode: 0,
|
||||
forknum: 0,
|
||||
blkno: 0,
|
||||
|
||||
flags: 0,
|
||||
has_image: false,
|
||||
apply_image: false,
|
||||
will_init: false,
|
||||
hole_offset: 0,
|
||||
hole_length: 0,
|
||||
bimg_len: 0,
|
||||
bimg_info: 0,
|
||||
|
||||
has_data: false,
|
||||
data_len: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -164,7 +164,7 @@ fn walreceiver_main(
|
||||
// There might be some padding after the last full record, skip it.
|
||||
startpoint += startpoint.calc_padding(8u32);
|
||||
|
||||
info!(
|
||||
debug!(
|
||||
"last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
|
||||
last_rec_lsn, startpoint, timelineid, end_of_wal
|
||||
);
|
||||
@@ -457,7 +457,7 @@ fn write_wal_file(
|
||||
{
|
||||
Ok(mut file) => {
|
||||
for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
|
||||
file.write_all(ZERO_BLOCK)?;
|
||||
file.write_all(&ZERO_BLOCK)?;
|
||||
}
|
||||
wal_file = file;
|
||||
}
|
||||
|
||||
@@ -43,7 +43,7 @@ use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::relish::*;
|
||||
use crate::repository::{Timeline, WALRecord};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::waldecoder::XlMultiXactCreate;
|
||||
use crate::waldecoder::XlXactParsedRecord;
|
||||
use crate::PageServerConf;
|
||||
@@ -79,7 +79,6 @@ pub trait WalRedoManager: Send + Sync {
|
||||
/// the reords.
|
||||
fn request_redo(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
@@ -97,7 +96,6 @@ pub struct DummyRedoManager {}
|
||||
impl crate::walredo::WalRedoManager for DummyRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
_timeline: &dyn Timeline,
|
||||
_rel: RelishTag,
|
||||
_blknum: u32,
|
||||
_lsn: Lsn,
|
||||
@@ -178,7 +176,6 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
///
|
||||
fn request_redo(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
@@ -212,20 +209,13 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
let process = (*process_guard).as_ref().unwrap();
|
||||
|
||||
self.runtime
|
||||
.block_on(self.handle_apply_request(process, &request))
|
||||
.block_on(self.handle_apply_request(&process, &request))
|
||||
};
|
||||
end_time = Instant::now();
|
||||
|
||||
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
|
||||
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
|
||||
|
||||
if let Ok(page) = result {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&page);
|
||||
self.set_hint_bits(timeline, &mut buf, lsn, &request.records);
|
||||
return Ok(buf.freeze());
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
@@ -252,117 +242,6 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
fn xid_status(&self, timeline: &dyn Timeline, xid: u32, lsn: Lsn) -> u8 {
|
||||
let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if let Ok(clog_page) = timeline.get_page_at_lsn_nowait(
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
},
|
||||
rpageno,
|
||||
lsn,
|
||||
) {
|
||||
postgres_ffi::nonrelfile_utils::transaction_id_get_status(xid, &clog_page[..])
|
||||
} else {
|
||||
pg_constants::TRANSACTION_STATUS_IN_PROGRESS
|
||||
}
|
||||
}
|
||||
|
||||
fn set_hint_bits(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
page: &mut BytesMut,
|
||||
lsn: Lsn,
|
||||
records: &Vec<WALRecord>,
|
||||
) {
|
||||
let mut flags = LittleEndian::read_u16(
|
||||
&page[pg_constants::PD_FLAGS_OFFSET..pg_constants::PD_FLAGS_OFFSET + 2],
|
||||
);
|
||||
if (flags & (pg_constants::PD_HEAP_RELATION | pg_constants::PD_NONHEAP_RELATION)) == 0 {
|
||||
// If type of relation was not determined yet,
|
||||
// then do it now
|
||||
for r in records {
|
||||
let xl_rmid = r.rec[pg_constants::XL_RMID_OFFS];
|
||||
if xl_rmid == pg_constants::RM_HEAP_ID || xl_rmid == pg_constants::RM_HEAP2_ID {
|
||||
flags |= pg_constants::PD_HEAP_RELATION;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (flags & pg_constants::PD_HEAP_RELATION) == 0 {
|
||||
flags |= pg_constants::PD_NONHEAP_RELATION;
|
||||
}
|
||||
LittleEndian::write_u16(
|
||||
&mut page[pg_constants::PD_FLAGS_OFFSET..pg_constants::PD_FLAGS_OFFSET + 2],
|
||||
flags,
|
||||
);
|
||||
}
|
||||
if (flags & pg_constants::PD_HEAP_RELATION) != 0 {
|
||||
// Set hint bits for heap relation page
|
||||
let pd_lower = LittleEndian::read_u16(
|
||||
&page[pg_constants::PD_LOWER_OFFSET..pg_constants::PD_LOWER_OFFSET + 2],
|
||||
) as usize;
|
||||
let mut tid_offs = pg_constants::SIZE_OF_PAGE_HEADER_DATA;
|
||||
while tid_offs < pd_lower {
|
||||
let tid = LittleEndian::read_u32(&page[tid_offs..tid_offs + 4]);
|
||||
let lp_off = (tid & 0x7FFF) as usize;
|
||||
if ((tid >> 15) & 3) == pg_constants::LP_NORMAL {
|
||||
// normal item pointer
|
||||
let t_xmin = LittleEndian::read_u32(
|
||||
&page[lp_off + pg_constants::T_XMIN_OFFS
|
||||
..lp_off + pg_constants::T_XMIN_OFFS + 4],
|
||||
);
|
||||
let t_xmax = LittleEndian::read_u32(
|
||||
&page[lp_off + pg_constants::T_XMAX_OFFS
|
||||
..lp_off + pg_constants::T_XMAX_OFFS + 4],
|
||||
);
|
||||
let mut t_infomask = LittleEndian::read_u16(
|
||||
&page[lp_off + pg_constants::T_INFOMASK_OFFS
|
||||
..lp_off + pg_constants::T_INFOMASK_OFFS + 2],
|
||||
);
|
||||
if (t_infomask
|
||||
& (pg_constants::HEAP_XMIN_COMMITTED | pg_constants::HEAP_XMIN_INVALID))
|
||||
== 0
|
||||
&& t_xmin != 0
|
||||
{
|
||||
let status = self.xid_status(timeline, t_xmin, lsn);
|
||||
if status == pg_constants::TRANSACTION_STATUS_COMMITTED {
|
||||
t_infomask |= pg_constants::HEAP_XMIN_COMMITTED;
|
||||
} else if status == pg_constants::TRANSACTION_STATUS_ABORTED {
|
||||
t_infomask |= pg_constants::HEAP_XMIN_INVALID;
|
||||
}
|
||||
LittleEndian::write_u16(
|
||||
&mut page[lp_off + pg_constants::T_INFOMASK_OFFS
|
||||
..lp_off + pg_constants::T_INFOMASK_OFFS + 2],
|
||||
t_infomask,
|
||||
);
|
||||
}
|
||||
if (t_infomask
|
||||
& (pg_constants::HEAP_XMAX_COMMITTED
|
||||
| pg_constants::HEAP_XMAX_INVALID
|
||||
| pg_constants::HEAP_XMAX_IS_MULTI))
|
||||
== 0
|
||||
&& t_xmax != 0
|
||||
{
|
||||
let status = self.xid_status(timeline, t_xmax, lsn);
|
||||
if status == pg_constants::TRANSACTION_STATUS_COMMITTED {
|
||||
t_infomask |= pg_constants::HEAP_XMAX_COMMITTED;
|
||||
} else if status == pg_constants::TRANSACTION_STATUS_ABORTED {
|
||||
t_infomask |= pg_constants::HEAP_XMAX_INVALID;
|
||||
}
|
||||
LittleEndian::write_u16(
|
||||
&mut page[lp_off + pg_constants::T_INFOMASK_OFFS
|
||||
..lp_off + pg_constants::T_INFOMASK_OFFS + 2],
|
||||
t_infomask,
|
||||
);
|
||||
}
|
||||
}
|
||||
tid_offs += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Process one request for WAL redo.
|
||||
///
|
||||
@@ -445,7 +324,7 @@ impl PostgresRedoManager {
|
||||
if rec_segno == segno && blknum == rpageno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_COMMITTED,
|
||||
pg_constants::TRANSACTION_STATUS_SUB_COMMITTED,
|
||||
&mut page,
|
||||
);
|
||||
}
|
||||
@@ -574,7 +453,7 @@ impl PostgresRedoProcess {
|
||||
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
|
||||
// just create one with constant name. That fails if you try to launch more than
|
||||
// one WAL redo manager concurrently.
|
||||
let datadir = conf.tenant_path(tenantid).join("wal-redo-datadir");
|
||||
let datadir = conf.tenant_path(&tenantid).join("wal-redo-datadir");
|
||||
|
||||
// Create empty data directory for wal-redo postgres, deleting old one first.
|
||||
if datadir.exists() {
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
//! data directory is compatible with a postgres binary. That includes
|
||||
//! a version number, configuration options that can be set at
|
||||
//! compilation time like the block size, and the platform's alignment
|
||||
//! and endianness information. (The PostgreSQL on-disk file format is
|
||||
//! and endianess information. (The PostgreSQL on-disk file format is
|
||||
//! not portable across platforms.)
|
||||
//!
|
||||
//! The control file is stored in the PostgreSQL data directory, as
|
||||
|
||||
@@ -46,7 +46,6 @@ pub const SIZE_OF_PAGE_HEADER: u16 = 24;
|
||||
pub const BITS_PER_HEAPBLOCK: u16 = 2;
|
||||
pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;
|
||||
|
||||
pub const TRANSACTION_STATUS_IN_PROGRESS: u8 = 0x00;
|
||||
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
|
||||
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
|
||||
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
|
||||
@@ -190,36 +189,11 @@ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
|
||||
pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
|
||||
pub const XLP_LONG_HEADER: u16 = 0x0002;
|
||||
|
||||
pub const PG_MAJORVERSION: &str = "14";
|
||||
|
||||
// Zenith specific page flags used to distinguish heap and non-heap relations
|
||||
pub const PD_HEAP_RELATION: u16 = 0x10;
|
||||
pub const PD_NONHEAP_RELATION: u16 = 0x20;
|
||||
|
||||
// bufpage.h
|
||||
pub const PD_FLAGS_OFFSET: usize = 10; // PageHeaderData.pd_flags
|
||||
pub const PD_LOWER_OFFSET: usize = 12; // PageHeaderData.pd_lower
|
||||
|
||||
// itemid.h
|
||||
pub const LP_NORMAL: u32 = 1;
|
||||
|
||||
// htup_details.h
|
||||
pub const T_XMIN_OFFS: usize = 0;
|
||||
pub const T_XMAX_OFFS: usize = 4;
|
||||
pub const T_INFOMASK_OFFS: usize = 4 * 3 + 2 * 3 + 2;
|
||||
pub const HEAP_XMIN_COMMITTED: u16 = 0x0100; /* t_xmin committed */
|
||||
pub const HEAP_XMIN_INVALID: u16 = 0x0200; /* t_xmin invalid/aborted */
|
||||
pub const HEAP_XMAX_COMMITTED: u16 = 0x0400; /* t_xmax committed */
|
||||
pub const HEAP_XMAX_INVALID: u16 = 0x0800; /* t_xmax invalid/aborted */
|
||||
pub const HEAP_XMAX_IS_MULTI: u16 = 0x1000; /* t_xmax is a MultiXactId */
|
||||
pub const SIZE_OF_PAGE_HEADER_DATA: usize = 24;
|
||||
|
||||
// xlogrecord.h
|
||||
pub const XL_RMID_OFFS: usize = 17;
|
||||
pub const PG_MAJORVERSION: &'static str = "14";
|
||||
|
||||
// List of subdirectories inside pgdata.
|
||||
// Copied from src/bin/initdb/initdb.c
|
||||
pub const PGDATA_SUBDIRS: [&str; 22] = [
|
||||
pub const PGDATA_SUBDIRS: [&'static str; 22] = [
|
||||
"global",
|
||||
"pg_wal/archive_status",
|
||||
"pg_commit_ts",
|
||||
@@ -244,11 +218,11 @@ pub const PGDATA_SUBDIRS: [&str; 22] = [
|
||||
"pg_logical/mappings",
|
||||
];
|
||||
|
||||
// Don't include postgresql.conf as it is inconvenient on node start:
|
||||
// we need postgresql.conf before basebackup to synchronize safekeepers
|
||||
// so no point in overwriting it during backup restore. Rest of the files
|
||||
// here are not needed before backup so it is okay to edit them after.
|
||||
pub const PGDATA_SPECIAL_FILES: [&str; 3] =
|
||||
["pg_hba.conf", "pg_ident.conf", "postgresql.auto.conf"];
|
||||
pub const PGDATA_SPECIAL_FILES: [&'static str; 4] = [
|
||||
"pg_hba.conf",
|
||||
"pg_ident.conf",
|
||||
"postgresql.conf",
|
||||
"postgresql.auto.conf",
|
||||
];
|
||||
|
||||
pub static PG_HBA: &str = include_str!("../samples/pg_hba.conf");
|
||||
pub static PG_HBA: &'static str = include_str!("../samples/pg_hba.conf");
|
||||
|
||||
@@ -26,7 +26,6 @@ use std::fs::{self, File};
|
||||
use std::io::prelude::*;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::SystemTime;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
pub const XLOG_FNAME_LEN: usize = 24;
|
||||
pub const XLOG_BLCKSZ: usize = 8192;
|
||||
@@ -38,7 +37,6 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
|
||||
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::<XLogPageHeaderData>();
|
||||
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::<XLogLongPageHeaderData>();
|
||||
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
|
||||
#[allow(clippy::identity_op)]
|
||||
pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
|
||||
|
||||
pub type XLogRecPtr = u64;
|
||||
@@ -90,21 +88,6 @@ pub fn IsPartialXLogFileName(fname: &str) -> bool {
|
||||
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
|
||||
}
|
||||
|
||||
/// If LSN points to the beginning of the page, then shift it to first record,
|
||||
/// otherwise align on 8-bytes boundary (required for WAL records)
|
||||
pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn {
|
||||
if lsn.0 % XLOG_BLCKSZ as u64 == 0 {
|
||||
let hdr_size = if lsn.0 % seg_sz as u64 == 0 {
|
||||
XLOG_SIZE_OF_XLOG_LONG_PHD
|
||||
} else {
|
||||
XLOG_SIZE_OF_XLOG_SHORT_PHD
|
||||
};
|
||||
lsn + hdr_size as u64
|
||||
} else {
|
||||
lsn.align()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_current_timestamp() -> TimestampTz {
|
||||
const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
|
||||
const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
|
||||
@@ -190,11 +173,12 @@ fn find_end_of_wal_segment(
|
||||
let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS;
|
||||
wal_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]);
|
||||
crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]);
|
||||
crc = !crc;
|
||||
} else {
|
||||
crc ^= 0xFFFFFFFFu32;
|
||||
crc = crc32c_append(crc, &buf[page_offs..page_offs + n]);
|
||||
crc = !crc;
|
||||
}
|
||||
crc = !crc;
|
||||
rec_offs += n;
|
||||
offs += n;
|
||||
contlen -= n;
|
||||
@@ -432,6 +416,7 @@ mod tests {
|
||||
use super::*;
|
||||
use regex::Regex;
|
||||
use std::{env, process::Command, str::FromStr};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// Run find_end_of_wal against file in test_wal dir
|
||||
// Ensure that it finds last record correctly
|
||||
@@ -480,7 +465,7 @@ mod tests {
|
||||
let waldump_output = std::str::from_utf8(&waldump_output.stderr).unwrap();
|
||||
println!("waldump_output = '{}'", &waldump_output);
|
||||
let re = Regex::new(r"invalid record length at (.+):").unwrap();
|
||||
let caps = re.captures(waldump_output).unwrap();
|
||||
let caps = re.captures(&waldump_output).unwrap();
|
||||
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
|
||||
|
||||
// 5. Rename file to partial to actually find last valid lsn
|
||||
|
||||
@@ -56,7 +56,7 @@ impl CPlaneApi {
|
||||
md5::compute([stored_hash.as_bytes(), salt].concat())
|
||||
);
|
||||
|
||||
let received_hash = std::str::from_utf8(md5_response)?;
|
||||
let received_hash = std::str::from_utf8(&md5_response)?;
|
||||
|
||||
println!(
|
||||
"auth: {} rh={} sh={} ssh={} {:?}",
|
||||
|
||||
@@ -143,10 +143,10 @@ fn main() -> anyhow::Result<()> {
|
||||
// for each connection.
|
||||
thread::Builder::new()
|
||||
.name("Proxy thread".into())
|
||||
.spawn(move || proxy::thread_main(state, pageserver_listener))?,
|
||||
.spawn(move || proxy::thread_main(&state, pageserver_listener))?,
|
||||
thread::Builder::new()
|
||||
.name("Mgmt thread".into())
|
||||
.spawn(move || mgmt::thread_main(state, mgmt_listener))?,
|
||||
.spawn(move || mgmt::thread_main(&state, mgmt_listener))?,
|
||||
];
|
||||
|
||||
for t in threads.into_iter() {
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import subprocess
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
|
||||
@@ -75,18 +74,3 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
# All the rows are visible on the main branch
|
||||
main_cur.execute('SELECT count(*) FROM foo')
|
||||
assert main_cur.fetchone() == (200100, )
|
||||
|
||||
# Check bad lsn's for branching
|
||||
|
||||
# branch at segment boundary
|
||||
zenith_cli.run(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
|
||||
pg = postgres.create_start("test_branch_segment_boundary")
|
||||
cur = pg.connect().cursor()
|
||||
cur.execute('SELECT 1')
|
||||
assert cur.fetchone() == (1, )
|
||||
|
||||
# branch at pre-initdb lsn
|
||||
try:
|
||||
zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
|
||||
except subprocess.CalledProcessError:
|
||||
print("Branch creation with pre-initdb LSN failed (as expected)")
|
||||
|
||||
@@ -3,7 +3,7 @@ import os
|
||||
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ import os
|
||||
import pathlib
|
||||
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content
|
||||
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -69,8 +69,6 @@ def test_dropdb(
|
||||
with conn.cursor() as cur:
|
||||
cur.execute('DROP DATABASE foodb')
|
||||
|
||||
cur.execute('CHECKPOINT')
|
||||
|
||||
cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_after_drop = cur.fetchone()[0]
|
||||
|
||||
@@ -96,6 +94,3 @@ def test_dropdb(
|
||||
print(dbpath)
|
||||
|
||||
assert os.path.isdir(dbpath) == False
|
||||
|
||||
# Check that we restore the content of the datadir correctly
|
||||
check_restored_datadir_content(zenith_cli, pg, lsn_after_drop, postgres)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -63,6 +63,3 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_b
|
||||
|
||||
# Check that we restored pg_controlfile correctly
|
||||
assert next_multixact_id_new == next_multixact_id
|
||||
|
||||
# Check that we restore the content of the datadir correctly
|
||||
check_restored_datadir_content(zenith_cli, pg, lsn, postgres)
|
||||
|
||||
@@ -6,19 +6,19 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
def print_gc_result(row):
|
||||
print("GC duration {elapsed} ms".format_map(row));
|
||||
print(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row))
|
||||
print(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row))
|
||||
print(" REL total: {snapshot_relfiles_total}, needed_by_cutoff {snapshot_relfiles_needed_by_cutoff}, needed_by_branches: {snapshot_relfiles_needed_by_branches}, not_updated: {snapshot_relfiles_not_updated}, removed: {snapshot_relfiles_removed}, dropped: {snapshot_relfiles_dropped}".format_map(row))
|
||||
print(" NONREL total: {snapshot_nonrelfiles_total}, needed_by_cutoff {snapshot_nonrelfiles_needed_by_cutoff}, needed_by_branches: {snapshot_nonrelfiles_needed_by_branches}, not_updated: {snapshot_nonrelfiles_not_updated}, removed: {snapshot_nonrelfiles_removed}, dropped: {snapshot_nonrelfiles_dropped}".format_map(row))
|
||||
|
||||
|
||||
#
|
||||
# Test Garbage Collection of old layer files
|
||||
# Test Garbage Collection of old snapshot files
|
||||
#
|
||||
# This test is pretty tightly coupled with the current implementation of layered
|
||||
# storage, in layered_repository.rs.
|
||||
#
|
||||
def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
zenith_cli.run(["branch", "test_layerfiles_gc", "empty"])
|
||||
pg = postgres.create_start('test_layerfiles_gc')
|
||||
def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
zenith_cli.run(["branch", "test_snapfiles_gc", "empty"])
|
||||
pg = postgres.create_start('test_snapfiles_gc')
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -55,8 +55,8 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
# remember the number of files
|
||||
layer_relfiles_remain = row['layer_relfiles_total'] - row['layer_relfiles_removed']
|
||||
assert layer_relfiles_remain > 0
|
||||
snapshot_relfiles_remain = row['snapshot_relfiles_total'] - row['snapshot_relfiles_removed']
|
||||
assert snapshot_relfiles_remain > 0
|
||||
|
||||
# Insert a row.
|
||||
print("Inserting one row and running GC")
|
||||
@@ -64,12 +64,12 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
|
||||
assert row['layer_relfiles_removed'] == 1
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
|
||||
assert row['snapshot_relfiles_removed'] == 1
|
||||
assert row['snapshot_relfiles_dropped'] == 0
|
||||
|
||||
# Insert two more rows and run GC.
|
||||
# This should create a new layer file with the new contents, and
|
||||
# This should create a new snapshot file with the new contents, and
|
||||
# remove the old one.
|
||||
print("Inserting two more rows and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (2)")
|
||||
@@ -78,11 +78,11 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
|
||||
assert row['layer_relfiles_removed'] == 1
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
|
||||
assert row['snapshot_relfiles_removed'] == 1
|
||||
assert row['snapshot_relfiles_dropped'] == 0
|
||||
|
||||
# Do it again. Should again create a new layer file and remove old one.
|
||||
# Do it again. Should again create a new snapshot file and remove old one.
|
||||
print("Inserting two more rows and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (2)")
|
||||
cur.execute("INSERT INTO foo VALUES (3)")
|
||||
@@ -90,18 +90,18 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
|
||||
assert row['layer_relfiles_removed'] == 1
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
|
||||
assert row['snapshot_relfiles_removed'] == 1
|
||||
assert row['snapshot_relfiles_dropped'] == 0
|
||||
|
||||
# Run GC again, with no changes in the database. Should not remove anything.
|
||||
print("Run GC again, with nothing to do")
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain
|
||||
assert row['layer_relfiles_removed'] == 0
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain
|
||||
assert row['snapshot_relfiles_removed'] == 0
|
||||
assert row['snapshot_relfiles_dropped'] == 0
|
||||
|
||||
#
|
||||
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
|
||||
@@ -114,11 +114,11 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
print_gc_result(row);
|
||||
|
||||
# Each relation fork is counted separately, hence 3.
|
||||
assert row['layer_relfiles_dropped'] == 3
|
||||
assert row['snapshot_relfiles_dropped'] == 3
|
||||
|
||||
# The catalog updates also create new layer files of the catalogs, which
|
||||
# The catalog updates also create new snapshot files of the catalogs, which
|
||||
# are counted as 'removed'
|
||||
assert row['layer_relfiles_removed'] > 0
|
||||
assert row['snapshot_relfiles_removed'] > 0
|
||||
|
||||
# TODO: perhaps we should count catalog and user relations separately,
|
||||
# to make this kind of testing more robust
|
||||
|
||||
@@ -59,8 +59,9 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
# Create a branch with the transaction in prepared state
|
||||
zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase"])
|
||||
|
||||
# Start compute on the new branch
|
||||
pg2 = postgres.create_start(
|
||||
# Create compute node, but don't start.
|
||||
# We want to observe pgdata before postgres starts
|
||||
pg2 = postgres.create(
|
||||
'test_twophase_prepared',
|
||||
config_lines=['max_prepared_transactions=5'],
|
||||
)
|
||||
@@ -70,6 +71,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
print(twophase_files2)
|
||||
assert twophase_files2.sort() == twophase_files.sort()
|
||||
|
||||
pg2 = pg2.start()
|
||||
conn2 = pg2.connect()
|
||||
cur2 = conn2.cursor()
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
|
||||
from fixtures.utils import mkdir_if_needed
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -49,10 +49,3 @@ def test_pg_regress(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_
|
||||
# logs the exact same data to `regression.out` anyway.
|
||||
with capsys.disabled():
|
||||
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
|
||||
|
||||
# checkpoint one more time to ensure that the lsn we get is the latest one
|
||||
pg.safe_psql('CHECKPOINT')
|
||||
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
|
||||
|
||||
# Check that we restore the content of the datadir correctly
|
||||
check_restored_datadir_content(zenith_cli, pg, lsn, postgres)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
|
||||
from fixtures.utils import mkdir_if_needed
|
||||
from fixtures.zenith_fixtures import PostgresFactory, check_restored_datadir_content
|
||||
from fixtures.zenith_fixtures import PostgresFactory
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -50,10 +50,3 @@ def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_outp
|
||||
# logs the exact same data to `regression.out` anyway.
|
||||
with capsys.disabled():
|
||||
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
|
||||
|
||||
# checkpoint one more time to ensure that the lsn we get is the latest one
|
||||
pg.safe_psql('CHECKPOINT')
|
||||
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
|
||||
|
||||
# Check that we restore the content of the datadir correctly
|
||||
check_restored_datadir_content(zenith_cli, pg, lsn, postgres)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from pprint import pprint
|
||||
|
||||
import os
|
||||
import re
|
||||
import timeit
|
||||
import pathlib
|
||||
import uuid
|
||||
@@ -79,7 +78,7 @@ class ZenithBenchmarkResults:
|
||||
|
||||
self.results.append((test_name, metric_name, metric_value, unit))
|
||||
|
||||
# Session scope fixture that initializes the results object
|
||||
# Sesssion scope fixture that initializes the results object
|
||||
@pytest.fixture(autouse=True, scope='session')
|
||||
def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]:
|
||||
"""
|
||||
@@ -121,35 +120,6 @@ class ZenithBenchmarker:
|
||||
|
||||
self.results.record(self.request.node.name, metric_name, end - start, 's')
|
||||
|
||||
def get_io_writes(self, pageserver) -> int:
|
||||
"""
|
||||
Fetch the "cumulative # of bytes written" metric from the pageserver
|
||||
"""
|
||||
# Fetch all the exposed prometheus metrics from page server
|
||||
all_metrics = pageserver.http_client().get_metrics()
|
||||
# Use a regular expression to extract the one we're interested in
|
||||
#
|
||||
# TODO: If we start to collect more of the prometheus metrics in the
|
||||
# performance test suite like this, we should refactor this to load and
|
||||
# parse all the metrics into a more convenient structure in one go.
|
||||
#
|
||||
# The metric should be an integer, as it's a number of bytes. But in general
|
||||
# all prometheus metrics are floats. So to be pedantic, read it as a float
|
||||
# and round to integer.
|
||||
matches = re.search(r'pageserver_disk_io_bytes{io_operation="write"} (\S+)', all_metrics)
|
||||
return int(round(float(matches.group(1))))
|
||||
|
||||
@contextmanager
|
||||
def record_pageserver_writes(self, pageserver, metric_name):
|
||||
"""
|
||||
Record bytes written by the pageserver during a test.
|
||||
"""
|
||||
before = self.get_io_writes(pageserver)
|
||||
yield
|
||||
after = self.get_io_writes(pageserver)
|
||||
|
||||
self.results.record(self.request.node.name, metric_name, round((after - before) / (1024 * 1024)), 'MB')
|
||||
|
||||
@pytest.fixture(scope='function')
|
||||
def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]:
|
||||
"""
|
||||
|
||||
@@ -10,8 +10,6 @@ import shutil
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
import filecmp
|
||||
import difflib
|
||||
|
||||
from contextlib import closing
|
||||
from pathlib import Path
|
||||
@@ -171,23 +169,12 @@ class ZenithCli:
|
||||
|
||||
args = [self.bin_zenith] + arguments
|
||||
print('Running command "{}"'.format(' '.join(args)))
|
||||
|
||||
# Interceipt CalledProcessError and print more info
|
||||
try:
|
||||
res = subprocess.run(args,
|
||||
env=self.env,
|
||||
check=True,
|
||||
universal_newlines=True,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
except subprocess.CalledProcessError as err:
|
||||
print(f"Run failed: {err}")
|
||||
print(f" stdout: {err.stdout}")
|
||||
print(f" stderr: {err.stderr}")
|
||||
|
||||
raise err
|
||||
|
||||
return res
|
||||
return subprocess.run(args,
|
||||
env=self.env,
|
||||
check=True,
|
||||
universal_newlines=True,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
|
||||
|
||||
@zenfixture
|
||||
@@ -239,11 +226,6 @@ class ZenithPageserverHttpClient(requests.Session):
|
||||
res.raise_for_status()
|
||||
return res.json()
|
||||
|
||||
def get_metrics(self) -> str:
|
||||
res = self.get(f"http://localhost:{self.port}/metrics")
|
||||
res.raise_for_status()
|
||||
return res.text
|
||||
|
||||
|
||||
@dataclass
|
||||
class AuthKeys:
|
||||
@@ -452,6 +434,7 @@ class Postgres(PgProtocol):
|
||||
branch: str,
|
||||
wal_acceptors: Optional[str] = None,
|
||||
config_lines: Optional[List[str]] = None,
|
||||
config_only: bool = False,
|
||||
) -> 'Postgres':
|
||||
"""
|
||||
Create the pg data directory.
|
||||
@@ -463,7 +446,10 @@ class Postgres(PgProtocol):
|
||||
if not config_lines:
|
||||
config_lines = []
|
||||
|
||||
self.zenith_cli.run(['pg', 'create', branch, f'--tenantid={self.tenant_id}'])
|
||||
if config_only:
|
||||
self.zenith_cli.run(['pg', 'create', '--config-only', branch, f'--tenantid={self.tenant_id}'])
|
||||
else:
|
||||
self.zenith_cli.run(['pg', 'create', branch, f'--tenantid={self.tenant_id}'])
|
||||
self.branch = branch
|
||||
path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.branch
|
||||
self.pgdata_dir = os.path.join(self.repo_dir, path)
|
||||
@@ -484,13 +470,11 @@ class Postgres(PgProtocol):
|
||||
|
||||
assert self.branch is not None
|
||||
|
||||
print(f"Starting postgres on branch {self.branch}")
|
||||
print(f"Starting postgres on brach {self.branch}")
|
||||
|
||||
run_result = self.zenith_cli.run(['pg', 'start', self.branch, f'--tenantid={self.tenant_id}'])
|
||||
self.zenith_cli.run(['pg', 'start', self.branch, f'--tenantid={self.tenant_id}'])
|
||||
self.running = True
|
||||
|
||||
print(f"stdout: {run_result.stdout}")
|
||||
|
||||
self.pg_bin.run(['pg_controldata', self.pg_data_dir_path()])
|
||||
|
||||
return self
|
||||
@@ -588,6 +572,7 @@ class Postgres(PgProtocol):
|
||||
branch=branch,
|
||||
wal_acceptors=wal_acceptors,
|
||||
config_lines=config_lines,
|
||||
config_only=True,
|
||||
).start()
|
||||
|
||||
return self
|
||||
@@ -599,23 +584,6 @@ class Postgres(PgProtocol):
|
||||
self.stop()
|
||||
|
||||
|
||||
def list_files_to_compare(self):
|
||||
pgdata_files = []
|
||||
for root, _file, filenames in os.walk(self.pgdata_dir):
|
||||
for filename in filenames:
|
||||
rel_dir = os.path.relpath(root, self.pgdata_dir)
|
||||
# Skip some dirs and files we don't want to compare
|
||||
skip_dirs = ['pg_wal', 'pg_stat', 'pg_stat_tmp', 'pg_subtrans', 'pg_logical']
|
||||
skip_files = ['pg_internal.init', 'pg.log', 'zenith.signal', 'postgresql.conf',
|
||||
'postmaster.opts', 'postmaster.pid', 'pg_control']
|
||||
if rel_dir not in skip_dirs and filename not in skip_files:
|
||||
rel_file = os.path.join(rel_dir, filename)
|
||||
pgdata_files.append(rel_file)
|
||||
|
||||
pgdata_files.sort()
|
||||
print(pgdata_files)
|
||||
return pgdata_files
|
||||
|
||||
class PostgresFactory:
|
||||
""" An object representing multiple running postgres daemons. """
|
||||
def __init__(self, zenith_cli: ZenithCli, repo_dir: str, pg_bin: PgBin, initial_tenant: str, base_port: int = 55431):
|
||||
@@ -943,55 +911,3 @@ class TenantFactory:
|
||||
@zenfixture
|
||||
def tenant_factory(zenith_cli: ZenithCli):
|
||||
return TenantFactory(zenith_cli)
|
||||
|
||||
|
||||
# pg is the existing comute node we want to compare our basebackup to
|
||||
# lsn is the latest lsn of this node
|
||||
def check_restored_datadir_content(zenith_cli, pg, lsn, postgres: PostgresFactory):
|
||||
# stop postgres to ensure that files won't change
|
||||
pg.stop()
|
||||
|
||||
# list files we're going to compare
|
||||
pgdata_files = pg.list_files_to_compare()
|
||||
|
||||
# create new branch, but don't start postgres
|
||||
# We only need 'basebackup' result here.
|
||||
zenith_cli.run(
|
||||
["branch", "check_restored_datadir", pg.branch + "@" + lsn])
|
||||
|
||||
pg2 = postgres.create('check_restored_datadir')
|
||||
print('postgres is created on check_restored_datadir branch')
|
||||
|
||||
print('files in a basebackup')
|
||||
# list files we're going to compare
|
||||
pgdata_files2 = pg2.list_files_to_compare()
|
||||
|
||||
# check that file sets are equal
|
||||
assert pgdata_files == pgdata_files2
|
||||
|
||||
# compare content of the files
|
||||
# filecmp returns (match, mismatch, error) lists
|
||||
# We've already filtered all mismatching files in list_files_to_compare(),
|
||||
# so here expect that the content is identical
|
||||
(match, mismatch, error) = filecmp.cmpfiles(pg.pgdata_dir,
|
||||
pg2.pgdata_dir,
|
||||
pgdata_files,
|
||||
shallow=False)
|
||||
print('filecmp result mismatch and error lists:')
|
||||
print(mismatch)
|
||||
print(error)
|
||||
|
||||
for f in mismatch:
|
||||
|
||||
f1 = os.path.join(pg.pgdata_dir, f)
|
||||
f2 = os.path.join(pg2.pgdata_dir, f)
|
||||
stdout_filename = "{}.diff".format(f2)
|
||||
|
||||
with open(stdout_filename, 'w') as stdout_f:
|
||||
subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True)
|
||||
subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True)
|
||||
|
||||
cmd = ['diff {}.hex {}.hex'.format(f1, f2)]
|
||||
subprocess.run(cmd, stdout=stdout_f, shell=True)
|
||||
|
||||
assert (mismatch, error) == ([], [])
|
||||
|
||||
@@ -46,14 +46,13 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin
|
||||
|
||||
connstr = pg.connstr()
|
||||
|
||||
# Initialize pgbench database, recording the time and I/O it takes
|
||||
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
|
||||
with zenbenchmark.record_duration('init'):
|
||||
pg_bin.run_capture(['pgbench', '-s5', '-i', connstr])
|
||||
# Initialize pgbench database
|
||||
with zenbenchmark.record_duration('init'):
|
||||
pg_bin.run_capture(['pgbench', '-s5', '-i', connstr])
|
||||
|
||||
# Flush the layers from memory to disk. This is included in the reported
|
||||
# time and I/O
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
# Flush the layers from memory to disk. The time to do that is included in the
|
||||
# reported init time.
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
|
||||
# Run pgbench for 5000 transactions
|
||||
with zenbenchmark.record_duration('5000_xacts'):
|
||||
|
||||
2
vendor/postgres
vendored
2
vendor/postgres
vendored
Submodule vendor/postgres updated: 084b228b89...909c606355
@@ -148,7 +148,7 @@ impl ReplicationConn {
|
||||
}
|
||||
});
|
||||
|
||||
let (mut start_pos, mut stop_pos) = Self::parse_start_stop(cmd)?;
|
||||
let (mut start_pos, mut stop_pos) = Self::parse_start_stop(&cmd)?;
|
||||
|
||||
let mut wal_seg_size: usize;
|
||||
loop {
|
||||
@@ -229,7 +229,7 @@ impl ReplicationConn {
|
||||
|
||||
start_pos += send_size as u64;
|
||||
|
||||
debug!("sent WAL up to {}", end_pos);
|
||||
debug!("Sent WAL to page server up to {}", end_pos);
|
||||
|
||||
// Decide whether to reuse this file. If we don't set wal_file here
|
||||
// a new file will be opened next time.
|
||||
|
||||
@@ -10,7 +10,6 @@ use log::*;
|
||||
use postgres_ffi::xlog_utils::TimeLineID;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cmp::max;
|
||||
use std::cmp::min;
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
|
||||
@@ -48,7 +47,6 @@ pub struct ServerInfo {
|
||||
/// Postgres server version
|
||||
pub pg_version: u32,
|
||||
pub system_id: SystemId,
|
||||
pub tenant_id: ZTenantId,
|
||||
/// Zenith timelineid
|
||||
pub ztli: ZTimelineId,
|
||||
pub tli: TimeLineID,
|
||||
@@ -67,9 +65,10 @@ pub struct SafeKeeperState {
|
||||
/// information about server
|
||||
pub server: ServerInfo,
|
||||
/// Unique id of the last *elected* proposer we dealed with. Not needed
|
||||
/// for correctness, exists for monitoring purposes.
|
||||
/// correctness, exists for monitoring purposes.
|
||||
pub proposer_uuid: PgUuid,
|
||||
/// part of WAL acknowledged by quorum and available locally
|
||||
/// part of WAL acknowledged by quorum (note that we might not have wal to
|
||||
/// up this point locally)
|
||||
pub commit_lsn: Lsn,
|
||||
/// minimal LSN which may be needed for recovery of some safekeeper (end lsn
|
||||
/// + 1 of last record streamed to everyone)
|
||||
@@ -85,7 +84,6 @@ impl SafeKeeperState {
|
||||
server: ServerInfo {
|
||||
pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */
|
||||
system_id: 0, /* Postgres system identifier */
|
||||
tenant_id: ZTenantId::from([0u8; 16]),
|
||||
ztli: ZTimelineId::from([0u8; 16]),
|
||||
tli: 0,
|
||||
wal_seg_size: 0,
|
||||
@@ -97,12 +95,6 @@ impl SafeKeeperState {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SafeKeeperState {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
// protocol messages
|
||||
|
||||
/// Initial Proposer -> Acceptor message
|
||||
@@ -163,7 +155,7 @@ pub struct AppendRequestHeader {
|
||||
end_lsn: Lsn,
|
||||
/// LSN committed by quorum of safekeepers
|
||||
commit_lsn: Lsn,
|
||||
/// restart LSN position (minimal LSN which may be needed by proposer to perform recovery)
|
||||
/// restart LSN position (minimal LSN which may be needed by proposer to perform recovery)
|
||||
restart_lsn: Lsn,
|
||||
// only for logging/debugging
|
||||
proposer_uuid: PgUuid,
|
||||
@@ -180,9 +172,6 @@ pub struct AppendResponse {
|
||||
// make much sense without taking epoch into account, as history can be
|
||||
// diverged.
|
||||
pub flush_lsn: Lsn,
|
||||
// We report back our awareness about which WAL is committed, as this is
|
||||
// a criterion for walproposer --sync mode exit
|
||||
pub commit_lsn: Lsn,
|
||||
pub hs_feedback: HotStandbyFeedback,
|
||||
}
|
||||
|
||||
@@ -216,7 +205,7 @@ impl ProposerAcceptorMessage {
|
||||
let rec_size = hdr
|
||||
.end_lsn
|
||||
.checked_sub(hdr.begin_lsn)
|
||||
.ok_or_else(|| anyhow!("begin_lsn > end_lsn in AppendRequest"))?
|
||||
.ok_or(anyhow!("begin_lsn > end_lsn in AppendRequest"))?
|
||||
.0 as usize;
|
||||
if rec_size > MAX_SEND_SIZE {
|
||||
bail!(
|
||||
@@ -228,7 +217,10 @@ impl ProposerAcceptorMessage {
|
||||
let mut wal_data_vec: Vec<u8> = vec![0; rec_size];
|
||||
stream.read_exact(&mut wal_data_vec)?;
|
||||
let wal_data = Bytes::from(wal_data_vec);
|
||||
let msg = AppendRequest { h: hdr, wal_data };
|
||||
let msg = AppendRequest {
|
||||
h: hdr,
|
||||
wal_data: wal_data,
|
||||
};
|
||||
|
||||
Ok(ProposerAcceptorMessage::AppendRequest(msg))
|
||||
}
|
||||
@@ -282,9 +274,7 @@ pub struct SafeKeeper<ST: Storage> {
|
||||
/// reading wal.
|
||||
pub flush_lsn: Lsn,
|
||||
pub tli: u32,
|
||||
/// not-yet-flushed pairs of same named fields in s.*
|
||||
pub commit_lsn: Lsn,
|
||||
pub truncate_lsn: Lsn,
|
||||
pub flushed_truncate_lsn: Lsn,
|
||||
pub storage: ST,
|
||||
pub s: SafeKeeperState, // persistent part
|
||||
pub elected_proposer_term: Term, // for monitoring/debugging
|
||||
@@ -299,8 +289,7 @@ where
|
||||
SafeKeeper {
|
||||
flush_lsn,
|
||||
tli,
|
||||
commit_lsn: state.commit_lsn,
|
||||
truncate_lsn: state.truncate_lsn,
|
||||
flushed_truncate_lsn: Lsn(0),
|
||||
storage,
|
||||
s: state,
|
||||
elected_proposer_term: 0,
|
||||
@@ -331,6 +320,13 @@ where
|
||||
SK_PROTOCOL_VERSION
|
||||
);
|
||||
}
|
||||
if self.s.server.system_id != 0 && self.s.server.system_id != msg.system_id {
|
||||
bail!(
|
||||
"system identifier changed: got {}, expected {}",
|
||||
msg.system_id,
|
||||
self.s.server.system_id,
|
||||
);
|
||||
}
|
||||
/* Postgres upgrade is not treated as fatal error */
|
||||
if msg.pg_version != self.s.server.pg_version
|
||||
&& self.s.server.pg_version != UNKNOWN_SERVER_VERSION
|
||||
@@ -343,7 +339,6 @@ where
|
||||
|
||||
// set basic info about server, if not yet
|
||||
self.s.server.system_id = msg.system_id;
|
||||
self.s.server.tenant_id = msg.tenant_id;
|
||||
self.s.server.ztli = msg.ztli;
|
||||
self.s.server.tli = msg.tli;
|
||||
self.s.server.wal_seg_size = msg.wal_seg_size;
|
||||
@@ -383,13 +378,12 @@ where
|
||||
}
|
||||
|
||||
/// Handle request to append WAL.
|
||||
#[allow(clippy::comparison_chain)]
|
||||
fn handle_append_request(&mut self, msg: &AppendRequest) -> Result<AcceptorProposerMessage> {
|
||||
// log first AppendRequest from this proposer
|
||||
if self.elected_proposer_term < msg.h.term {
|
||||
info!(
|
||||
"start accepting WAL from timeline {}, tenant {}, term {}, epochStartLsn {:?}",
|
||||
self.s.server.ztli, self.s.server.tenant_id, msg.h.term, msg.h.epoch_start_lsn,
|
||||
"start receiving WAL from timeline {} term {}",
|
||||
self.s.server.ztli, msg.h.term,
|
||||
);
|
||||
self.elected_proposer_term = msg.h.term;
|
||||
}
|
||||
@@ -404,7 +398,6 @@ where
|
||||
let resp = AppendResponse {
|
||||
term: self.s.acceptor_state.term,
|
||||
epoch: self.s.acceptor_state.epoch,
|
||||
commit_lsn: Lsn(0),
|
||||
flush_lsn: Lsn(0),
|
||||
hs_feedback: HotStandbyFeedback::empty(),
|
||||
};
|
||||
@@ -421,13 +414,9 @@ where
|
||||
* maximum (vcl) determined by WAL proposer during handshake.
|
||||
* Switching epoch means that node completes recovery and start writing in the WAL new data.
|
||||
* XXX: this is wrong, we must actively truncate not matching part of log.
|
||||
*
|
||||
* The non-strict inequality is important for us, as proposer in --sync mode doesn't
|
||||
* generate new records, but to advance commit_lsn epoch switch must happen on majority.
|
||||
* We can regard this as commit of empty entry in new epoch, this should be safe.
|
||||
*/
|
||||
if self.s.acceptor_state.epoch < msg.h.term
|
||||
&& msg.h.end_lsn >= max(self.flush_lsn, msg.h.epoch_start_lsn)
|
||||
&& msg.h.end_lsn > max(self.flush_lsn, msg.h.epoch_start_lsn)
|
||||
{
|
||||
info!("switched to new epoch {}", msg.h.term);
|
||||
self.s.acceptor_state.epoch = msg.h.term; /* bump epoch */
|
||||
@@ -438,20 +427,8 @@ where
|
||||
}
|
||||
|
||||
self.s.proposer_uuid = msg.h.proposer_uuid;
|
||||
// Advance commit_lsn taking into account what we have locally.
|
||||
// xxx this is wrapped into epoch check because we overwrite wal
|
||||
// instead of truncating it, so without it commit_lsn might include
|
||||
// wrong part. Anyway, nobody is much interested in our commit_lsn while
|
||||
// epoch switch hasn't happened, right?
|
||||
if self.s.acceptor_state.epoch == msg.h.term {
|
||||
let commit_lsn = min(msg.h.commit_lsn, self.flush_lsn);
|
||||
// If new commit_lsn reached epoch switch, force sync of control file:
|
||||
// walproposer in sync mode is very interested when this happens.
|
||||
sync_control_file |=
|
||||
commit_lsn >= msg.h.epoch_start_lsn && self.s.commit_lsn < msg.h.epoch_start_lsn;
|
||||
self.commit_lsn = commit_lsn;
|
||||
}
|
||||
self.truncate_lsn = msg.h.restart_lsn;
|
||||
self.s.commit_lsn = msg.h.commit_lsn;
|
||||
self.s.truncate_lsn = msg.h.restart_lsn;
|
||||
|
||||
/*
|
||||
* Update restart LSN in control file.
|
||||
@@ -459,26 +436,24 @@ where
|
||||
* when restart_lsn delta exceeds WAL segment size.
|
||||
*/
|
||||
sync_control_file |=
|
||||
self.s.truncate_lsn + (self.s.server.wal_seg_size as u64) < self.truncate_lsn;
|
||||
if sync_control_file {
|
||||
self.s.commit_lsn = self.commit_lsn;
|
||||
self.s.truncate_lsn = self.truncate_lsn;
|
||||
}
|
||||
self.flushed_truncate_lsn + (self.s.server.wal_seg_size as u64) < self.s.truncate_lsn;
|
||||
self.storage.persist(&self.s, sync_control_file)?;
|
||||
if sync_control_file {
|
||||
self.flushed_truncate_lsn = self.s.truncate_lsn;
|
||||
}
|
||||
|
||||
let resp = AppendResponse {
|
||||
term: self.s.acceptor_state.term,
|
||||
epoch: self.s.acceptor_state.epoch,
|
||||
flush_lsn: self.flush_lsn,
|
||||
commit_lsn: self.s.commit_lsn,
|
||||
// will be filled by caller code to avoid bothering safekeeper
|
||||
hs_feedback: HotStandbyFeedback::empty(),
|
||||
};
|
||||
debug!(
|
||||
"processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, resp {:?}",
|
||||
trace!(
|
||||
"processed AppendRequest of len {}, flush_lsn={:X}/{:>08X}, resp {:?}",
|
||||
msg.wal_data.len(),
|
||||
msg.h.end_lsn,
|
||||
msg.h.commit_lsn,
|
||||
(self.flush_lsn.0 >> 32) as u32,
|
||||
self.flush_lsn.0 as u32,
|
||||
&resp,
|
||||
);
|
||||
Ok(AcceptorProposerMessage::AppendResponse(resp))
|
||||
@@ -517,7 +492,7 @@ mod tests {
|
||||
let mut vote_resp = sk.process_msg(&vote_request);
|
||||
match vote_resp.unwrap() {
|
||||
AcceptorProposerMessage::VoteResponse(resp) => assert!(resp.vote_given != 0),
|
||||
r => panic!("unexpected response: {:?}", r),
|
||||
_ => assert!(false),
|
||||
}
|
||||
|
||||
// reboot...
|
||||
@@ -531,7 +506,7 @@ mod tests {
|
||||
vote_resp = sk.process_msg(&vote_request);
|
||||
match vote_resp.unwrap() {
|
||||
AcceptorProposerMessage::VoteResponse(resp) => assert!(resp.vote_given == 0),
|
||||
r => panic!("unexpected response: {:?}", r),
|
||||
_ => assert!(false),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -544,7 +519,7 @@ mod tests {
|
||||
|
||||
let mut ar_hdr = AppendRequestHeader {
|
||||
term: 1,
|
||||
epoch_start_lsn: Lsn(3),
|
||||
epoch_start_lsn: Lsn(2),
|
||||
begin_lsn: Lsn(1),
|
||||
end_lsn: Lsn(2),
|
||||
commit_lsn: Lsn(0),
|
||||
@@ -556,20 +531,20 @@ mod tests {
|
||||
wal_data: Bytes::from_static(b"b"),
|
||||
};
|
||||
|
||||
// check that AppendRequest before epochStartLsn doesn't switch epoch
|
||||
// check that AppendRequest before VCL doesn't switch epoch
|
||||
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
|
||||
assert!(resp.is_ok());
|
||||
assert_eq!(sk.storage.persisted_state.acceptor_state.epoch, 0);
|
||||
assert!(sk.storage.persisted_state.acceptor_state.epoch == 0);
|
||||
|
||||
// but record at epochStartLsn does the switch
|
||||
// but record after VCL does the switch
|
||||
ar_hdr.begin_lsn = Lsn(2);
|
||||
ar_hdr.end_lsn = Lsn(3);
|
||||
append_request = AppendRequest {
|
||||
h: ar_hdr,
|
||||
h: ar_hdr.clone(),
|
||||
wal_data: Bytes::from_static(b"b"),
|
||||
};
|
||||
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
|
||||
assert!(resp.is_ok());
|
||||
assert_eq!(sk.storage.persisted_state.acceptor_state.epoch, 1);
|
||||
assert!(sk.storage.persisted_state.acceptor_state.epoch == 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -127,7 +127,7 @@ impl SharedState {
|
||||
if let CreateControlFile::False = create {
|
||||
bail!("control file is empty");
|
||||
}
|
||||
Ok((file, SafeKeeperState::new()))
|
||||
return Ok((file, SafeKeeperState::new()));
|
||||
} else {
|
||||
match SafeKeeperState::des_from(&mut file) {
|
||||
Err(e) => {
|
||||
@@ -144,7 +144,7 @@ impl SharedState {
|
||||
SK_FORMAT_VERSION
|
||||
);
|
||||
}
|
||||
Ok((file, s))
|
||||
return Ok((file, s));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -217,11 +217,14 @@ impl Timeline {
|
||||
rmsg = shared_state.sk.process_msg(msg)?;
|
||||
// locally available commit lsn. flush_lsn can be smaller than
|
||||
// commit_lsn if we are catching up safekeeper.
|
||||
commit_lsn = shared_state.sk.commit_lsn;
|
||||
commit_lsn = min(shared_state.sk.flush_lsn, shared_state.sk.s.commit_lsn);
|
||||
|
||||
// if this is AppendResponse, fill in proper hot standby feedback
|
||||
if let AcceptorProposerMessage::AppendResponse(ref mut resp) = rmsg {
|
||||
resp.hs_feedback = shared_state.hs_feedback.clone();
|
||||
match rmsg {
|
||||
AcceptorProposerMessage::AppendResponse(ref mut resp) => {
|
||||
resp.hs_feedback = shared_state.hs_feedback.clone();
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
// Ping wal sender that new data might be available.
|
||||
@@ -398,7 +401,7 @@ impl Storage for FileStorage {
|
||||
{
|
||||
Ok(mut file) => {
|
||||
for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
|
||||
file.write_all(ZERO_BLOCK)?;
|
||||
file.write_all(&ZERO_BLOCK)?;
|
||||
}
|
||||
wal_file = file;
|
||||
}
|
||||
|
||||
@@ -95,7 +95,7 @@ fn main() -> Result<()> {
|
||||
.required(false)
|
||||
))
|
||||
.subcommand(SubCommand::with_name("start")
|
||||
.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
|
||||
.about("Start a postrges compute node.\n This command actually creates new node from scrath, but preserves existing config files")
|
||||
.arg(timeline_arg.clone()).arg(tenantid_arg.clone()))
|
||||
.subcommand(
|
||||
SubCommand::with_name("stop")
|
||||
@@ -359,7 +359,7 @@ fn get_branch_infos(
|
||||
}
|
||||
|
||||
fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let pageserver = PageServerNode::from_env(env);
|
||||
let pageserver = PageServerNode::from_env(&env);
|
||||
match tenant_match.subcommand() {
|
||||
("list", Some(_)) => {
|
||||
for tenant in pageserver.tenant_list()? {
|
||||
@@ -381,12 +381,12 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result
|
||||
}
|
||||
|
||||
fn handle_branch(branch_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let pageserver = PageServerNode::from_env(env);
|
||||
let pageserver = PageServerNode::from_env(&env);
|
||||
|
||||
if let Some(branchname) = branch_match.value_of("branchname") {
|
||||
let startpoint_str = branch_match
|
||||
.value_of("start-point")
|
||||
.ok_or_else(|| anyhow!("Missing start-point"))?;
|
||||
.ok_or(anyhow!("Missing start-point"))?;
|
||||
let tenantid: ZTenantId = branch_match
|
||||
.value_of("tenantid")
|
||||
.map_or(Ok(env.tenantid), |value| value.parse())?;
|
||||
@@ -447,8 +447,9 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
.value_of("tenantid")
|
||||
.map_or(Ok(env.tenantid), |value| value.parse())?;
|
||||
let timeline_name = create_match.value_of("timeline").unwrap_or("main");
|
||||
let config_only = create_match.is_present("config-only");
|
||||
|
||||
cplane.new_node(tenantid, timeline_name)?;
|
||||
cplane.new_node(tenantid, timeline_name, config_only)?;
|
||||
}
|
||||
("start", Some(start_match)) => {
|
||||
let tenantid: ZTenantId = start_match
|
||||
@@ -465,15 +466,11 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
None
|
||||
};
|
||||
|
||||
println!(
|
||||
"Starting {} postgres on timeline {}...",
|
||||
if node.is_some() { "existing" } else { "new" },
|
||||
timeline_name
|
||||
);
|
||||
println!("Starting postgres on timeline {}...", timeline_name);
|
||||
if let Some(node) = node {
|
||||
node.start(&auth_token)?;
|
||||
} else {
|
||||
let node = cplane.new_node(tenantid, timeline_name)?;
|
||||
let node = cplane.new_node(tenantid, timeline_name, false)?;
|
||||
node.start(&auth_token)?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
|
||||
pub use prometheus::{Encoder, TextEncoder};
|
||||
|
||||
mod wrappers;
|
||||
use libc::{c_long, getrusage, rusage, suseconds_t, time_t, timeval, RUSAGE_SELF};
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
|
||||
/// Gathers all Prometheus metrics and records the I/O stats just before that.
|
||||
@@ -41,26 +42,40 @@ lazy_static! {
|
||||
// performed by the process.
|
||||
// We know the the size of the block, so we can determine the I/O bytes out of it.
|
||||
// The value might be not 100% exact, but should be fine for Prometheus metrics in this case.
|
||||
#[allow(clippy::unnecessary_cast)]
|
||||
fn update_io_metrics() {
|
||||
let rusage_stats = get_rusage_stats();
|
||||
let mut usage = rusage {
|
||||
ru_utime: timeval {
|
||||
tv_sec: 0 as time_t,
|
||||
tv_usec: 0 as suseconds_t,
|
||||
},
|
||||
ru_stime: timeval {
|
||||
tv_sec: 0 as time_t,
|
||||
tv_usec: 0 as suseconds_t,
|
||||
},
|
||||
ru_maxrss: 0 as c_long,
|
||||
ru_ixrss: 0 as c_long,
|
||||
ru_idrss: 0 as c_long,
|
||||
ru_isrss: 0 as c_long,
|
||||
ru_minflt: 0 as c_long,
|
||||
ru_majflt: 0 as c_long,
|
||||
ru_nswap: 0 as c_long,
|
||||
ru_inblock: 0 as c_long,
|
||||
ru_oublock: 0 as c_long,
|
||||
ru_msgsnd: 0 as c_long,
|
||||
ru_msgrcv: 0 as c_long,
|
||||
ru_nsignals: 0 as c_long,
|
||||
ru_nvcsw: 0 as c_long,
|
||||
ru_nivcsw: 0 as c_long,
|
||||
};
|
||||
unsafe {
|
||||
getrusage(RUSAGE_SELF, (&mut usage) as *mut rusage);
|
||||
}
|
||||
|
||||
const BYTES_IN_BLOCK: i64 = 512;
|
||||
DISK_IO_BYTES
|
||||
.with_label_values(&["read"])
|
||||
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
|
||||
.set(usage.ru_inblock * BYTES_IN_BLOCK);
|
||||
DISK_IO_BYTES
|
||||
.with_label_values(&["write"])
|
||||
.set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
|
||||
}
|
||||
|
||||
fn get_rusage_stats() -> libc::rusage {
|
||||
let mut rusage = std::mem::MaybeUninit::uninit();
|
||||
|
||||
// SAFETY: kernel will initialize the struct for us
|
||||
unsafe {
|
||||
let ret = libc::getrusage(libc::RUSAGE_SELF, rusage.as_mut_ptr());
|
||||
assert!(ret == 0, "getrusage failed: bad args");
|
||||
rusage.assume_init()
|
||||
}
|
||||
.set(usage.ru_oublock * BYTES_IN_BLOCK);
|
||||
}
|
||||
|
||||
@@ -186,7 +186,7 @@ mod tests {
|
||||
assert_eq!(total, stream.len());
|
||||
}
|
||||
|
||||
// This mimics the constraints of std::thread::spawn
|
||||
// This mimicks the constraints of std::thread::spawn
|
||||
fn assert_send_sync(_x: impl Sync + Send + 'static) {}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// For details about authentication see docs/authentication.md
|
||||
// TODO there are two issues for our use case in jsonwebtoken library which will be resolved in next release
|
||||
// The first one is that there is no way to disable expiration claim, but it can be excluded from validation, so use this as a workaround for now.
|
||||
// The fisrt one is that there is no way to disable expiration claim, but it can be excluded from validation, so use this as a workaround for now.
|
||||
// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/190
|
||||
// The second one is that we wanted to use ed25519 keys, but they are also not supported until next version. So we go with RSA keys for now.
|
||||
// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162
|
||||
@@ -8,8 +8,7 @@
|
||||
use hex::{self, FromHex};
|
||||
use serde::de::Error;
|
||||
use serde::{self, Deserializer, Serializer};
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use jsonwebtoken::{
|
||||
@@ -44,8 +43,8 @@ where
|
||||
{
|
||||
let opt: Option<String> = Option::deserialize(deserializer)?;
|
||||
match opt {
|
||||
Some(tid) => Ok(Some(ZTenantId::from_hex(tid).map_err(Error::custom)?)),
|
||||
None => Ok(None),
|
||||
Some(tid) => return Ok(Some(ZTenantId::from_hex(tid).map_err(Error::custom)?)),
|
||||
None => return Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,7 +91,7 @@ pub struct JwtAuth {
|
||||
}
|
||||
|
||||
impl JwtAuth {
|
||||
pub fn new(decoding_key: DecodingKey<'_>) -> Self {
|
||||
pub fn new<'a>(decoding_key: DecodingKey<'a>) -> Self {
|
||||
Self {
|
||||
decoding_key: decoding_key.into_static(),
|
||||
validation: Validation {
|
||||
@@ -103,7 +102,7 @@ impl JwtAuth {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_key_path(key_path: &Path) -> Result<Self> {
|
||||
pub fn from_key_path(key_path: &PathBuf) -> Result<Self> {
|
||||
let public_key = fs::read_to_string(key_path)?;
|
||||
Ok(Self::new(DecodingKey::from_rsa_pem(public_key.as_bytes())?))
|
||||
}
|
||||
@@ -114,8 +113,8 @@ impl JwtAuth {
|
||||
}
|
||||
|
||||
// this function is used only for testing purposes in CLI e g generate tokens during init
|
||||
pub fn encode_from_key_path(claims: &Claims, key_path: &Path) -> Result<String> {
|
||||
pub fn encode_from_key_path(claims: &Claims, key_path: &PathBuf) -> Result<String> {
|
||||
let key_data = fs::read_to_string(key_path)?;
|
||||
let key = EncodingKey::from_rsa_pem(key_data.as_bytes())?;
|
||||
let key = EncodingKey::from_rsa_pem(&key_data.as_bytes())?;
|
||||
Ok(encode(&Header::new(JWT_ALGORITHM), claims, &key)?)
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
//!
|
||||
//! The [`LeSer`] trait does the same thing, in little-endian form.
|
||||
//!
|
||||
//! Note: you will get a compile error if you try to `use` both traits
|
||||
//! Note: you will get a compile error if you try to `use` both trais
|
||||
//! in the same module or scope. This is intended to be a safety
|
||||
//! mechanism: mixing big-endian and little-endian encoding in the same file
|
||||
//! is error-prone.
|
||||
|
||||
@@ -95,13 +95,13 @@ pub fn attach_openapi_ui(
|
||||
|
||||
fn parse_token(header_value: &str) -> Result<&str, ApiError> {
|
||||
// header must be in form Bearer <token>
|
||||
let (prefix, token) = header_value
|
||||
.split_once(' ')
|
||||
.ok_or_else(|| ApiError::Unauthorized("malformed authorization header".to_string()))?;
|
||||
let (prefix, token) = header_value.split_once(' ').ok_or(ApiError::Unauthorized(
|
||||
"malformed authorization header".to_string(),
|
||||
))?;
|
||||
if prefix != "Bearer" {
|
||||
return Err(ApiError::Unauthorized(
|
||||
Err(ApiError::Unauthorized(
|
||||
"malformed authorization header".to_string(),
|
||||
));
|
||||
))?
|
||||
}
|
||||
Ok(token)
|
||||
}
|
||||
@@ -123,11 +123,9 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
.map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
|
||||
req.set_context(data.claims);
|
||||
}
|
||||
None => {
|
||||
return Err(ApiError::Unauthorized(
|
||||
"missing authorization header".to_string(),
|
||||
))
|
||||
}
|
||||
None => Err(ApiError::Unauthorized(
|
||||
"missing authorization header".to_string(),
|
||||
))?,
|
||||
}
|
||||
}
|
||||
Ok(req)
|
||||
@@ -147,7 +145,7 @@ pub fn serve_thread_main(
|
||||
addr: String,
|
||||
) -> anyhow::Result<()> {
|
||||
let addr = addr.parse()?;
|
||||
log::info!("Starting an http endpoint at {}", addr);
|
||||
log::info!("Starting a http endoint at {}", addr);
|
||||
|
||||
// Create a Service from the router above to handle incoming requests.
|
||||
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
//! zenith_utils is intended to be a place to put code that is shared
|
||||
//! between other crates in this repository.
|
||||
|
||||
#![allow(clippy::manual_range_contains)]
|
||||
|
||||
/// `Lsn` type implements common tasks on Log Sequence Numbers
|
||||
pub mod lsn;
|
||||
/// SeqWait allows waiting for a future sequence number to arrive
|
||||
|
||||
@@ -107,21 +107,12 @@ impl io::Write for WriteStream {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TlsBoxed {
|
||||
stream: BufStream,
|
||||
session: rustls::ServerSession,
|
||||
}
|
||||
|
||||
impl TlsBoxed {
|
||||
fn rustls_stream(&mut self) -> rustls::Stream<rustls::ServerSession, BufStream> {
|
||||
rustls::Stream::new(&mut self.session, &mut self.stream)
|
||||
}
|
||||
}
|
||||
|
||||
pub enum BidiStream {
|
||||
Tcp(BufStream),
|
||||
/// This variant is boxed, because [`rustls::ServerSession`] is quite larger than [`BufStream`].
|
||||
Tls(Box<TlsBoxed>),
|
||||
Tls {
|
||||
stream: BufStream,
|
||||
session: rustls::ServerSession,
|
||||
},
|
||||
}
|
||||
|
||||
impl BidiStream {
|
||||
@@ -132,13 +123,17 @@ impl BidiStream {
|
||||
pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
|
||||
match self {
|
||||
Self::Tcp(stream) => stream.get_ref().shutdown(how),
|
||||
Self::Tls(tls_boxed) => {
|
||||
Self::Tls {
|
||||
stream: reader,
|
||||
session,
|
||||
} => {
|
||||
if how == Shutdown::Read {
|
||||
tls_boxed.stream.get_ref().shutdown(how)
|
||||
reader.get_ref().shutdown(how)
|
||||
} else {
|
||||
tls_boxed.session.send_close_notify();
|
||||
let res = tls_boxed.rustls_stream().flush();
|
||||
tls_boxed.stream.get_ref().shutdown(how)?;
|
||||
session.send_close_notify();
|
||||
let mut stream = rustls::Stream::new(session, reader);
|
||||
let res = stream.flush();
|
||||
reader.get_ref().shutdown(how)?;
|
||||
res
|
||||
}
|
||||
}
|
||||
@@ -154,8 +149,8 @@ impl BidiStream {
|
||||
|
||||
(ReadStream::Tcp(reader), WriteStream::Tcp(stream))
|
||||
}
|
||||
Self::Tls(tls_boxed) => {
|
||||
let reader = tls_boxed.stream.into_reader();
|
||||
Self::Tls { stream, session } => {
|
||||
let reader = stream.into_reader();
|
||||
let buffer_data = reader.buffer().to_owned();
|
||||
let read_buf_cfg = rustls_split::BufCfg::with_data(buffer_data, 8192);
|
||||
let write_buf_cfg = rustls_split::BufCfg::with_capacity(8192);
|
||||
@@ -164,7 +159,7 @@ impl BidiStream {
|
||||
let socket = Arc::try_unwrap(reader.into_inner().0).unwrap();
|
||||
|
||||
let (read_half, write_half) =
|
||||
rustls_split::split(socket, tls_boxed.session, read_buf_cfg, write_buf_cfg);
|
||||
rustls_split::split(socket, session, read_buf_cfg, write_buf_cfg);
|
||||
(ReadStream::Tls(read_half), WriteStream::Tls(write_half))
|
||||
}
|
||||
}
|
||||
@@ -175,7 +170,7 @@ impl BidiStream {
|
||||
Self::Tcp(mut stream) => {
|
||||
session.complete_io(&mut stream)?;
|
||||
assert!(!session.is_handshaking());
|
||||
Ok(Self::Tls(Box::new(TlsBoxed { stream, session })))
|
||||
Ok(Self::Tls { stream, session })
|
||||
}
|
||||
Self::Tls { .. } => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
@@ -189,7 +184,7 @@ impl io::Read for BidiStream {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
match self {
|
||||
Self::Tcp(stream) => stream.read(buf),
|
||||
Self::Tls(tls_boxed) => tls_boxed.rustls_stream().read(buf),
|
||||
Self::Tls { stream, session } => rustls::Stream::new(session, stream).read(buf),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -198,14 +193,14 @@ impl io::Write for BidiStream {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
match self {
|
||||
Self::Tcp(stream) => stream.write(buf),
|
||||
Self::Tls(tls_boxed) => tls_boxed.rustls_stream().write(buf),
|
||||
Self::Tls { stream, session } => rustls::Stream::new(session, stream).write(buf),
|
||||
}
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
match self {
|
||||
Self::Tcp(stream) => stream.flush(),
|
||||
Self::Tls(tls_boxed) => tls_boxed.rustls_stream().flush(),
|
||||
Self::Tls { stream, session } => rustls::Stream::new(session, stream).flush(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user