mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-06 13:02:55 +00:00
Introduce a new "layered" repository implementation.
This replaces the RocksDB based implementation with an approach using "snapshot files" on disk, and in-memory btreemaps to hold the recent changes. This make the repository implementation a configuration option. You can choose 'layered' or 'rocksdb' with "zenith init --repository-format=<format>" The unit tests have been refactored to exercise both implementations. 'layered' is now the default. Push/pull is not implemented. The 'test_history_inmemory' test has been commented out accordingly. It's not clear how we will implement that functionality; probably by copying the snapshot files directly.
This commit is contained in:
99
Cargo.lock
generated
99
Cargo.lock
generated
@@ -1,7 +1,5 @@
|
|||||||
# This file is automatically @generated by Cargo.
|
# This file is automatically @generated by Cargo.
|
||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
version = 3
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ahash"
|
name = "ahash"
|
||||||
version = "0.4.7"
|
version = "0.4.7"
|
||||||
@@ -82,6 +80,30 @@ version = "1.0.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
|
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aversion"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "41992ab8cfcc3026ef9abceffe0c2b0479c043183fc23825e30d22baab6df334"
|
||||||
|
dependencies = [
|
||||||
|
"aversion-macros",
|
||||||
|
"byteorder",
|
||||||
|
"serde",
|
||||||
|
"serde_cbor",
|
||||||
|
"thiserror",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aversion-macros"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5ba5785f953985aa0caca927ba4005880f3b4f53de87f134e810ae3549f744d2"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-creds"
|
name = "aws-creds"
|
||||||
version = "0.26.0"
|
version = "0.26.0"
|
||||||
@@ -166,6 +188,18 @@ dependencies = [
|
|||||||
"generic-array",
|
"generic-array",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bookfile"
|
||||||
|
version = "0.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "efa3e2086414e1bbecbc10730f265e5b079ab4ea0b830e7219a70dab6471e753"
|
||||||
|
dependencies = [
|
||||||
|
"aversion",
|
||||||
|
"byteorder",
|
||||||
|
"serde",
|
||||||
|
"thiserror",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "boxfnonce"
|
name = "boxfnonce"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
@@ -646,6 +680,12 @@ dependencies = [
|
|||||||
"tracing",
|
"tracing",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "half"
|
||||||
|
version = "1.7.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "hashbrown"
|
name = "hashbrown"
|
||||||
version = "0.9.1"
|
version = "0.9.1"
|
||||||
@@ -1139,6 +1179,7 @@ name = "pageserver"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"bookfile",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"bytes",
|
"bytes",
|
||||||
"chrono",
|
"chrono",
|
||||||
@@ -1276,24 +1317,6 @@ dependencies = [
|
|||||||
"tokio-postgres 0.7.1",
|
"tokio-postgres 0.7.1",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "postgres-protocol"
|
|
||||||
version = "0.6.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "ff3e0f70d32e20923cabf2df02913be7c1842d4c772db8065c00fcfdd1d1bff3"
|
|
||||||
dependencies = [
|
|
||||||
"base64 0.13.0",
|
|
||||||
"byteorder",
|
|
||||||
"bytes",
|
|
||||||
"fallible-iterator",
|
|
||||||
"hmac",
|
|
||||||
"md-5",
|
|
||||||
"memchr",
|
|
||||||
"rand",
|
|
||||||
"sha2",
|
|
||||||
"stringprep",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "postgres-protocol"
|
name = "postgres-protocol"
|
||||||
version = "0.6.1"
|
version = "0.6.1"
|
||||||
@@ -1313,14 +1336,21 @@ dependencies = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "postgres-types"
|
name = "postgres-protocol"
|
||||||
version = "0.2.1"
|
version = "0.6.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "430f4131e1b7657b0cd9a2b0c3408d77c9a43a042d300b8c77f981dffcc43a2f"
|
checksum = "ff3e0f70d32e20923cabf2df02913be7c1842d4c772db8065c00fcfdd1d1bff3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"base64 0.13.0",
|
||||||
|
"byteorder",
|
||||||
"bytes",
|
"bytes",
|
||||||
"fallible-iterator",
|
"fallible-iterator",
|
||||||
"postgres-protocol 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"hmac",
|
||||||
|
"md-5",
|
||||||
|
"memchr",
|
||||||
|
"rand",
|
||||||
|
"sha2",
|
||||||
|
"stringprep",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1333,6 +1363,17 @@ dependencies = [
|
|||||||
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
|
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "postgres-types"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "430f4131e1b7657b0cd9a2b0c3408d77c9a43a042d300b8c77f981dffcc43a2f"
|
||||||
|
dependencies = [
|
||||||
|
"bytes",
|
||||||
|
"fallible-iterator",
|
||||||
|
"postgres-protocol 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "postgres_ffi"
|
name = "postgres_ffi"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
@@ -1735,6 +1776,16 @@ dependencies = [
|
|||||||
"xml-rs",
|
"xml-rs",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_cbor"
|
||||||
|
version = "0.11.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1e18acfa2f90e8b735b2836ab8d538de304cbb6729a7360729ea5a895d15a622"
|
||||||
|
dependencies = [
|
||||||
|
"half",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_derive"
|
name = "serde_derive"
|
||||||
version = "1.0.126"
|
version = "1.0.126"
|
||||||
|
|||||||
@@ -42,6 +42,9 @@ pub struct LocalEnv {
|
|||||||
#[serde(with = "hex")]
|
#[serde(with = "hex")]
|
||||||
pub tenantid: ZTenantId,
|
pub tenantid: ZTenantId,
|
||||||
|
|
||||||
|
// Repository format, 'rocksdb' or 'layered' or None for default
|
||||||
|
pub repository_format: Option<String>,
|
||||||
|
|
||||||
// jwt auth token used for communication with pageserver
|
// jwt auth token used for communication with pageserver
|
||||||
pub auth_token: String,
|
pub auth_token: String,
|
||||||
|
|
||||||
@@ -101,6 +104,7 @@ pub fn init(
|
|||||||
remote_pageserver: Option<&str>,
|
remote_pageserver: Option<&str>,
|
||||||
tenantid: ZTenantId,
|
tenantid: ZTenantId,
|
||||||
auth_type: AuthType,
|
auth_type: AuthType,
|
||||||
|
repository_format: Option<&str>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
// check if config already exists
|
// check if config already exists
|
||||||
let base_path = base_path();
|
let base_path = base_path();
|
||||||
@@ -176,6 +180,7 @@ pub fn init(
|
|||||||
base_data_dir: base_path,
|
base_data_dir: base_path,
|
||||||
remotes: BTreeMap::default(),
|
remotes: BTreeMap::default(),
|
||||||
tenantid,
|
tenantid,
|
||||||
|
repository_format: repository_format.map(|x| x.into()),
|
||||||
auth_token,
|
auth_token,
|
||||||
auth_type,
|
auth_type,
|
||||||
private_key_path,
|
private_key_path,
|
||||||
@@ -194,6 +199,7 @@ pub fn init(
|
|||||||
base_data_dir: base_path,
|
base_data_dir: base_path,
|
||||||
remotes: BTreeMap::default(),
|
remotes: BTreeMap::default(),
|
||||||
tenantid,
|
tenantid,
|
||||||
|
repository_format: repository_format.map(|x| x.into()),
|
||||||
auth_token,
|
auth_token,
|
||||||
auth_type,
|
auth_type,
|
||||||
private_key_path,
|
private_key_path,
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ impl PageServerNode {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> Result<()> {
|
pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool, repository_format: Option<&str>) -> Result<()> {
|
||||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||||
let mut args = vec![
|
let mut args = vec![
|
||||||
"--init",
|
"--init",
|
||||||
@@ -65,6 +65,10 @@ impl PageServerNode {
|
|||||||
args.extend(&["--auth-type", "ZenithJWT"]);
|
args.extend(&["--auth-type", "ZenithJWT"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(repo_format) = repository_format {
|
||||||
|
args.extend(&["--repository-format", repo_format]);
|
||||||
|
}
|
||||||
|
|
||||||
create_tenant.map(|tenantid| args.extend(&["--create-tenant", tenantid]));
|
create_tenant.map(|tenantid| args.extend(&["--create-tenant", tenantid]));
|
||||||
let status = cmd
|
let status = cmd
|
||||||
.args(args)
|
.args(args)
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ edition = "2018"
|
|||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
bookfile = "^0.3"
|
||||||
chrono = "0.4.19"
|
chrono = "0.4.19"
|
||||||
rand = "0.8.3"
|
rand = "0.8.3"
|
||||||
regex = "1.4.5"
|
regex = "1.4.5"
|
||||||
|
|||||||
@@ -20,14 +20,14 @@ use anyhow::{ensure, Result};
|
|||||||
use clap::{App, Arg, ArgMatches};
|
use clap::{App, Arg, ArgMatches};
|
||||||
use daemonize::Daemonize;
|
use daemonize::Daemonize;
|
||||||
|
|
||||||
use pageserver::{branches, logger, page_cache, page_service, PageServerConf};
|
use pageserver::{branches, logger, page_cache, page_service, PageServerConf, RepositoryFormat};
|
||||||
use zenith_utils::http_endpoint;
|
use zenith_utils::http_endpoint;
|
||||||
|
|
||||||
const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:64000";
|
const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:64000";
|
||||||
const DEFAULT_HTTP_ENDPOINT_ADDR: &str = "127.0.0.1:9898";
|
const DEFAULT_HTTP_ENDPOINT_ADDR: &str = "127.0.0.1:9898";
|
||||||
|
|
||||||
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||||
const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
|
const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
|
||||||
|
|
||||||
const DEFAULT_SUPERUSER: &str = "zenith_admin";
|
const DEFAULT_SUPERUSER: &str = "zenith_admin";
|
||||||
|
|
||||||
@@ -41,6 +41,7 @@ struct CfgFileParams {
|
|||||||
pg_distrib_dir: Option<String>,
|
pg_distrib_dir: Option<String>,
|
||||||
auth_validation_public_key_path: Option<String>,
|
auth_validation_public_key_path: Option<String>,
|
||||||
auth_type: Option<String>,
|
auth_type: Option<String>,
|
||||||
|
repository_format: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CfgFileParams {
|
impl CfgFileParams {
|
||||||
@@ -58,6 +59,7 @@ impl CfgFileParams {
|
|||||||
pg_distrib_dir: get_arg("postgres-distrib"),
|
pg_distrib_dir: get_arg("postgres-distrib"),
|
||||||
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
|
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
|
||||||
auth_type: get_arg("auth-type"),
|
auth_type: get_arg("auth-type"),
|
||||||
|
repository_format: get_arg("repository-format"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -74,6 +76,7 @@ impl CfgFileParams {
|
|||||||
.auth_validation_public_key_path
|
.auth_validation_public_key_path
|
||||||
.or(other.auth_validation_public_key_path),
|
.or(other.auth_validation_public_key_path),
|
||||||
auth_type: self.auth_type.or(other.auth_type),
|
auth_type: self.auth_type.or(other.auth_type),
|
||||||
|
repository_format: self.repository_format.or(other.repository_format),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -133,6 +136,16 @@ impl CfgFileParams {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let repository_format = match self.repository_format.as_ref() {
|
||||||
|
Some(repo_format_str) if repo_format_str == "rocksdb" => RepositoryFormat::RocksDb,
|
||||||
|
Some(repo_format_str) if repo_format_str == "layered" => RepositoryFormat::Layered,
|
||||||
|
Some(repo_format_str) => anyhow::bail!(
|
||||||
|
"invalid --repository-format '{}', must be 'rocksdb' or 'layered'",
|
||||||
|
repo_format_str
|
||||||
|
),
|
||||||
|
None => RepositoryFormat::Layered, // default
|
||||||
|
};
|
||||||
|
|
||||||
Ok(PageServerConf {
|
Ok(PageServerConf {
|
||||||
daemonize: false,
|
daemonize: false,
|
||||||
|
|
||||||
@@ -148,8 +161,9 @@ impl CfgFileParams {
|
|||||||
pg_distrib_dir,
|
pg_distrib_dir,
|
||||||
|
|
||||||
auth_validation_public_key_path,
|
auth_validation_public_key_path,
|
||||||
|
|
||||||
auth_type,
|
auth_type,
|
||||||
|
|
||||||
|
repository_format,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -221,6 +235,12 @@ fn main() -> Result<()> {
|
|||||||
.takes_value(true)
|
.takes_value(true)
|
||||||
.help("Authentication scheme type. One of: Trust, MD5, ZenithJWT"),
|
.help("Authentication scheme type. One of: Trust, MD5, ZenithJWT"),
|
||||||
)
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::with_name("repository-format")
|
||||||
|
.long("repository-format")
|
||||||
|
.takes_value(true)
|
||||||
|
.help("Which repository implementation to use, 'rocksdb' or 'layered'"),
|
||||||
|
)
|
||||||
.get_matches();
|
.get_matches();
|
||||||
|
|
||||||
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
|
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ use crate::object_repository::ObjectRepository;
|
|||||||
use crate::page_cache;
|
use crate::page_cache;
|
||||||
use crate::restore_local_repo;
|
use crate::restore_local_repo;
|
||||||
use crate::walredo::WalRedoManager;
|
use crate::walredo::WalRedoManager;
|
||||||
use crate::{repository::Repository, PageServerConf};
|
use crate::{repository::Repository, PageServerConf, RepositoryFormat};
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Clone)]
|
#[derive(Serialize, Deserialize, Clone)]
|
||||||
pub struct BranchInfo {
|
pub struct BranchInfo {
|
||||||
@@ -65,8 +65,8 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str
|
|||||||
pub fn create_repo(
|
pub fn create_repo(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenantid: ZTenantId,
|
tenantid: ZTenantId,
|
||||||
wal_redo_manager: Arc<dyn WalRedoManager>,
|
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
|
||||||
) -> Result<ObjectRepository> {
|
) -> Result<Arc<dyn Repository>> {
|
||||||
let repo_dir = conf.tenant_path(&tenantid);
|
let repo_dir = conf.tenant_path(&tenantid);
|
||||||
if repo_dir.exists() {
|
if repo_dir.exists() {
|
||||||
bail!("repo for {} already exists", tenantid)
|
bail!("repo for {} already exists", tenantid)
|
||||||
@@ -96,19 +96,27 @@ pub fn create_repo(
|
|||||||
// and we failed to run initdb again in the same directory. This has been solved for the
|
// and we failed to run initdb again in the same directory. This has been solved for the
|
||||||
// rapid init+start case now, but the general race condition remains if you restart the
|
// rapid init+start case now, but the general race condition remains if you restart the
|
||||||
// server quickly.
|
// server quickly.
|
||||||
let storage = crate::rocksdb_storage::RocksObjectStore::create(conf, &tenantid)?;
|
let repo: Arc<dyn Repository + Sync + Send> =
|
||||||
|
match conf.repository_format {
|
||||||
|
RepositoryFormat::Layered => Arc::new(
|
||||||
|
crate::layered_repository::LayeredRepository::new(conf, wal_redo_manager, tenantid),
|
||||||
|
),
|
||||||
|
RepositoryFormat::RocksDb => {
|
||||||
|
let obj_store = crate::rocksdb_storage::RocksObjectStore::create(conf, &tenantid)?;
|
||||||
|
|
||||||
let repo = crate::object_repository::ObjectRepository::new(
|
Arc::new(ObjectRepository::new(
|
||||||
conf,
|
conf,
|
||||||
std::sync::Arc::new(storage),
|
Arc::new(obj_store),
|
||||||
wal_redo_manager,
|
wal_redo_manager,
|
||||||
tenantid,
|
tenantid,
|
||||||
);
|
))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// Load data into pageserver
|
// Load data into pageserver
|
||||||
// TODO To implement zenith import we need to
|
// TODO To implement zenith import we need to
|
||||||
// move data loading out of create_repo()
|
// move data loading out of create_repo()
|
||||||
bootstrap_timeline(conf, tenantid, tli, &repo)?;
|
bootstrap_timeline(conf, tenantid, tli, &*repo)?;
|
||||||
|
|
||||||
Ok(repo)
|
Ok(repo)
|
||||||
}
|
}
|
||||||
|
|||||||
1212
pageserver/src/layered_repository.rs
Normal file
1212
pageserver/src/layered_repository.rs
Normal file
File diff suppressed because it is too large
Load Diff
298
pageserver/src/layered_repository/README.md
Normal file
298
pageserver/src/layered_repository/README.md
Normal file
@@ -0,0 +1,298 @@
|
|||||||
|
# Overview
|
||||||
|
|
||||||
|
The on-disk format is based on immutable files. The page server
|
||||||
|
receives a stream of incoming WAL, parses the WAL records to determine
|
||||||
|
which pages they apply to, and accumulates the incoming changes in
|
||||||
|
memory. Every now and then, the accumulated changes are written out to
|
||||||
|
new files.
|
||||||
|
|
||||||
|
The files are called "snapshot files". Each snapshot file corresponds
|
||||||
|
to one PostgreSQL relation fork. The snapshot files for each timeline
|
||||||
|
are stored in the timeline's subdirectory under
|
||||||
|
.zenith/tenants/<tenantid>/timelines.
|
||||||
|
|
||||||
|
The files are named like this:
|
||||||
|
|
||||||
|
rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<start LSN>_<end LSN>
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
rel_1663_13990_2609_0_000000000169C348_0000000001702000
|
||||||
|
|
||||||
|
Some non-relation files are also stored in repository. For example,
|
||||||
|
a CLOG segment would be named like this:
|
||||||
|
|
||||||
|
pg_xact_0000_00000000198B06B0_00000000198C2550
|
||||||
|
|
||||||
|
There is no difference in how the relation and non-relation files are
|
||||||
|
managed, except that the first part of file names is different.
|
||||||
|
Internally, the relations and non-relation files that are managed in
|
||||||
|
the versioned store are together called "relishes".
|
||||||
|
|
||||||
|
Each snapshot file contains a full snapshot, that is, full copy of all
|
||||||
|
pages in the relation, as of the "start LSN". It also contains all WAL
|
||||||
|
records applicable to the relation between the start and end
|
||||||
|
LSNs. With this information, the page server can reconstruct any page
|
||||||
|
version of the relation in the LSN range.
|
||||||
|
|
||||||
|
If a file has been dropped, the last snapshot file for it is created
|
||||||
|
with the _DROPPED suffix, e.g.
|
||||||
|
|
||||||
|
rel_1663_13990_2609_0_000000000169C348_0000000001702000_DROPPED
|
||||||
|
|
||||||
|
In addition to the relations, with "rel_*" prefix, we use the same
|
||||||
|
format for storing various smaller files from the PostgreSQL data
|
||||||
|
directory. They will use different suffixes and the naming scheme
|
||||||
|
up to the LSN range varies. The Zenith source code uses the term
|
||||||
|
"relish" to mean "a relation, or other file that's treated like a
|
||||||
|
relation in the storage"
|
||||||
|
|
||||||
|
## Notation used in this document
|
||||||
|
|
||||||
|
The full path of a snapshot file looks like this:
|
||||||
|
|
||||||
|
.zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_000000000169C348_0000000001702000
|
||||||
|
|
||||||
|
For simplicity, the examples below use a simplified notation for the
|
||||||
|
paths. The tenant ID is left out, the timeline ID is replaced with
|
||||||
|
the human-readable branch name, and spcnode+dbnode+relnode+forkum with
|
||||||
|
a human-readable table name. The LSNs are also shorter. For example, a
|
||||||
|
snapshot file for 'orders' table on 'main' branch, with LSN range
|
||||||
|
100-200 would be:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
|
||||||
|
|
||||||
|
# Creating snapshot files
|
||||||
|
|
||||||
|
Let's start with a simple example with a system that contains one
|
||||||
|
branch called 'main' and two tables, 'orders' and 'customers'. The end
|
||||||
|
of WAL is currently at LSN 250. In this starting situation, you would
|
||||||
|
have two files on disk:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
main/customers_100_200
|
||||||
|
|
||||||
|
In addition to those files, the recent changes between LSN 200 and the
|
||||||
|
end of WAL at 250 are kept in memory. If the page server crashes, the
|
||||||
|
latest records between 200-250 need to be re-read from the WAL.
|
||||||
|
|
||||||
|
Whenever enough WAL has been accumulated in memory, the page server
|
||||||
|
writes out the changes in memory into new snapshot files. This process
|
||||||
|
is called "checkpointing" (not to be confused with the PostgreSQL
|
||||||
|
checkpoints, that's a different thing). The page server only creates
|
||||||
|
snapshot files for relations that have been modified since the last
|
||||||
|
checkpoint. For example, if the current end of WAL is at LSN 450, and
|
||||||
|
the last checkpoint happened at LSN 400 but there hasn't been any
|
||||||
|
recent changes to 'customers' table, you would have these files on
|
||||||
|
disk:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
main/orders_200_300
|
||||||
|
main/orders_300_400
|
||||||
|
main/customers_100_200
|
||||||
|
|
||||||
|
If the customers table is modified later, a new file is created for it
|
||||||
|
at the next checkpoint. The new file will cover the "gap" from the
|
||||||
|
last snapshot file, so the LSN ranges are always contiguous:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
main/orders_200_300
|
||||||
|
main/orders_300_400
|
||||||
|
main/customers_100_200
|
||||||
|
main/customers_200_500
|
||||||
|
|
||||||
|
## Reading page versions
|
||||||
|
|
||||||
|
Whenever a GetPage@LSN request comes in from the compute node, the
|
||||||
|
page server needs to reconstruct the requested page, as it was at the
|
||||||
|
requested LSN. To do that, the page server first checks the recent
|
||||||
|
in-memory layer; if the requested page version is found there, it can
|
||||||
|
be returned immediatedly without looking at the files on
|
||||||
|
disk. Otherwise the page server needs to locate the snapshot file that
|
||||||
|
contains the requested page version.
|
||||||
|
|
||||||
|
For example, if a request comes in for table 'orders' at LSN 250, the
|
||||||
|
page server would load the 'main/orders_200_300' file into memory, and
|
||||||
|
reconstruct and return the requested page from it, as it was at
|
||||||
|
LSN 250. Because the snapshot file consists of a full image of the
|
||||||
|
relation at the start LSN and the WAL, reconstructing the page
|
||||||
|
involves replaying any WAL records applicable to the page between LSNs
|
||||||
|
200-250, starting from the base image at LSN 200.
|
||||||
|
|
||||||
|
A request at a file boundary can be satisfied using either file. For
|
||||||
|
example, if there are two files on disk:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
main/orders_200_300
|
||||||
|
|
||||||
|
And a request comes with LSN 200, either file can be used for it. It
|
||||||
|
is better to use the later file, however, because it contains an
|
||||||
|
already materialized version of all the pages at LSN 200. Using the
|
||||||
|
first file, you would need to apply any WAL records between 100 and
|
||||||
|
200 to reconstruct the requested page.
|
||||||
|
|
||||||
|
# Multiple branches
|
||||||
|
|
||||||
|
Imagine that a child branch is created at LSN 250:
|
||||||
|
|
||||||
|
@250
|
||||||
|
----main--+-------------------------->
|
||||||
|
\
|
||||||
|
+---child-------------->
|
||||||
|
|
||||||
|
|
||||||
|
Then, the 'orders' table is updated differently on the 'main' and
|
||||||
|
'child' branches. You now have this situation on disk:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
main/orders_200_300
|
||||||
|
main/orders_300_400
|
||||||
|
main/customers_100_200
|
||||||
|
child/orders_250_300
|
||||||
|
child/orders_300_400
|
||||||
|
|
||||||
|
Because the 'customers' table hasn't been modified on the child
|
||||||
|
branch, there is no file for it there. If you request a page for it on
|
||||||
|
the 'child' branch, the page server will not find any snapshot file
|
||||||
|
for it in the 'child' directory, so it will recurse to look into the
|
||||||
|
parent 'main' branch instead.
|
||||||
|
|
||||||
|
From the 'child' branch's point of view, the history for each relation
|
||||||
|
is linear, and the request's LSN identifies unambiguously which file
|
||||||
|
you need to look at. For example, the history for the 'orders' table
|
||||||
|
on the 'main' branch consists of these files:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
main/orders_200_300
|
||||||
|
main/orders_300_400
|
||||||
|
|
||||||
|
And from the 'child' branch's point of view, it consists of these
|
||||||
|
files:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
main/orders_200_300
|
||||||
|
child/orders_250_300
|
||||||
|
child/orders_300_400
|
||||||
|
|
||||||
|
The branch metadata includes the point where the child branch was
|
||||||
|
created, LSN 250. If a page request comes with LSN 275, we read the
|
||||||
|
page version from the 'child/orders_250_300' file. If the request LSN
|
||||||
|
is 225, we read it from the 'main/orders_200_300' file instead. The
|
||||||
|
page versions between 250-300 in the 'main/orders_200_300' file are
|
||||||
|
ignored when operating on the child branch.
|
||||||
|
|
||||||
|
Note: It doesn't make any difference if the child branch is created
|
||||||
|
when the end of the main branch was at LSN 250, or later when the tip of
|
||||||
|
the main branch had already moved on. The latter case, creating a
|
||||||
|
branch at a historic LSN, is how we support PITR in Zenith.
|
||||||
|
|
||||||
|
|
||||||
|
# Garbage collection
|
||||||
|
|
||||||
|
In this scheme, we keep creating new snapshot files over time. We also
|
||||||
|
need a mechanism to remove old files that are no longer needed,
|
||||||
|
because disk space isn't infinite.
|
||||||
|
|
||||||
|
What files are still needed? Currently, the page server supports PITR
|
||||||
|
and branching from any branch at any LSN that is "recent enough" from
|
||||||
|
the tip of the branch. "Recent enough" is defined as an LSN horizon,
|
||||||
|
which by default is 64 MB. (See DEFAULT_GC_HORIZON). For this
|
||||||
|
example, let's assume that the LSN horizon is 150 units.
|
||||||
|
|
||||||
|
Let's look at the single branch scenario again. Imagine that the end
|
||||||
|
of the branch is LSN 525, so that the GC horizon is currently at
|
||||||
|
525-150 = 375
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
main/orders_200_300
|
||||||
|
main/orders_300_400
|
||||||
|
main/orders_400_500
|
||||||
|
main/customers_100_200
|
||||||
|
|
||||||
|
We can remove files 'main/orders_100_200' and 'main/orders_200_300',
|
||||||
|
because the end LSNs of those files are older than GC horizon 375, and
|
||||||
|
there are more recent snapshot files for the table. 'main/orders_300_400'
|
||||||
|
and 'main/orders_400_500' are still within the horizon, so they must be
|
||||||
|
retained. 'main/customers_100_200' is old enough, but it cannot be
|
||||||
|
removed because there is no newer snapshot file for the table.
|
||||||
|
|
||||||
|
Things get slightly more complicated with multiple branches. All of
|
||||||
|
the above still holds, but in addition to recent files we must also
|
||||||
|
retain older shapshot files that are still needed by child branches.
|
||||||
|
For example, if child branch is created at LSN 150, and the 'customers'
|
||||||
|
table is updated on the branch, you would have these files:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
main/orders_200_300
|
||||||
|
main/orders_300_400
|
||||||
|
main/orders_400_500
|
||||||
|
main/customers_100_200
|
||||||
|
child/customers_150_300
|
||||||
|
|
||||||
|
In this situation, the 'main/orders_100_200' file cannot be removed,
|
||||||
|
even though it is older than the GC horizon, because it is still
|
||||||
|
needed by the child branch. 'main/orders_200_300' can still be
|
||||||
|
removed. So after garbage collection, these files would remain:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
|
||||||
|
main/orders_300_400
|
||||||
|
main/orders_400_500
|
||||||
|
main/customers_100_200
|
||||||
|
child/customers_150_300
|
||||||
|
|
||||||
|
If 'orders' is modified later on the 'child' branch, we will create a
|
||||||
|
snapshot file for it on the child:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
|
||||||
|
main/orders_300_400
|
||||||
|
main/orders_400_500
|
||||||
|
main/customers_100_200
|
||||||
|
child/customers_150_300
|
||||||
|
child/orders_150_400
|
||||||
|
|
||||||
|
After this, the 'main/orders_100_200' file can be removed. It is no
|
||||||
|
longer needed by the child branch, because there is a newer snapshot
|
||||||
|
file there. TODO: This optimization hasn't been implemented! The GC
|
||||||
|
algorithm will currently keep the file on the 'main' branch anyway, for
|
||||||
|
as long as the child branch exists.
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: On LSN ranges
|
||||||
|
|
||||||
|
In principle, each relation can be checkpointed separately, i.e. the
|
||||||
|
LSN ranges of the files don't need to line up. So this would be legal:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
main/orders_200_300
|
||||||
|
main/orders_300_400
|
||||||
|
main/customers_150_250
|
||||||
|
main/customers_250_500
|
||||||
|
|
||||||
|
However, the code currently always checkpoints all relations together.
|
||||||
|
So that situation doesn't arise in practice.
|
||||||
|
|
||||||
|
It would also be OK to have overlapping LSN ranges for the same relation:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
main/orders_200_300
|
||||||
|
main/orders_250_350
|
||||||
|
main/orders_300_400
|
||||||
|
|
||||||
|
The code that reads the snapshot files should cope with this, but this
|
||||||
|
situation doesn't arise either, because the checkpointing code never
|
||||||
|
does that. It could be useful, however, as a transient state when
|
||||||
|
garbage collecting around branch points, or explicit recovery
|
||||||
|
points. For example, if we start with this:
|
||||||
|
|
||||||
|
main/orders_100_200
|
||||||
|
main/orders_200_300
|
||||||
|
main/orders_300_400
|
||||||
|
|
||||||
|
And there is a branch or explicit recovery point at LSN 150, we could
|
||||||
|
replace 'main/orders_100_200' with 'main/orders_150_150' to keep a
|
||||||
|
snapshot only at that exact point that's still needed, removing the
|
||||||
|
other page versions around it. But such compaction has not been
|
||||||
|
implemented yet.
|
||||||
534
pageserver/src/layered_repository/inmemory_layer.rs
Normal file
534
pageserver/src/layered_repository/inmemory_layer.rs
Normal file
@@ -0,0 +1,534 @@
|
|||||||
|
//!
|
||||||
|
//! An in-memory layer stores recently received page versions in memory. The page versions
|
||||||
|
//! are held in a BTreeMap, and there's another BTreeMap to track the size of the relation.
|
||||||
|
//!
|
||||||
|
|
||||||
|
use crate::layered_repository::storage_layer::Layer;
|
||||||
|
use crate::layered_repository::storage_layer::PageVersion;
|
||||||
|
use crate::layered_repository::SnapshotLayer;
|
||||||
|
use crate::relish::*;
|
||||||
|
use crate::repository::WALRecord;
|
||||||
|
use crate::walredo::WalRedoManager;
|
||||||
|
use crate::PageServerConf;
|
||||||
|
use crate::{ZTenantId, ZTimelineId};
|
||||||
|
use anyhow::{bail, Result};
|
||||||
|
use bytes::Bytes;
|
||||||
|
use log::*;
|
||||||
|
use std::collections::BTreeMap;
|
||||||
|
use std::ops::Bound::Included;
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
|
use zenith_utils::lsn::Lsn;
|
||||||
|
|
||||||
|
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||||
|
|
||||||
|
///
/// An in-memory layer holds the recently received page versions of one
/// relish, before they are frozen into an on-disk SnapshotLayer.
///
pub struct InMemoryLayer {
    // Page server configuration; lives for the whole process.
    conf: &'static PageServerConf,
    // Tenant and timeline this layer belongs to.
    tenantid: ZTenantId,
    timelineid: ZTimelineId,
    // The relish whose page versions this layer holds.
    rel: RelishTag,

    ///
    /// This layer contains all the changes from 'start_lsn'. The
    /// start is inclusive. There is no end LSN; we only use in-memory
    /// layer at the end of a timeline.
    ///
    start_lsn: Lsn,

    /// The above fields never change. The parts that do change are in 'inner',
    /// and protected by mutex.
    inner: Mutex<InMemoryLayerInner>,
}
|
||||||
|
|
||||||
|
/// The mutable part of an InMemoryLayer, protected by the mutex in the
/// outer struct.
pub struct InMemoryLayerInner {
    /// If this relation was dropped, remember when that happened.
    drop_lsn: Option<Lsn>,

    ///
    /// All versions of all pages in the layer are kept here.
    /// Indexed by block number and LSN.
    ///
    page_versions: BTreeMap<(u32, Lsn), PageVersion>,

    ///
    /// `relsizes` tracks the size of the relation at different points in time.
    ///
    relsizes: BTreeMap<Lsn, u32>,
}
|
||||||
|
|
||||||
|
impl Layer for InMemoryLayer {
    /// In-memory layers are always open for new writes, never frozen.
    fn is_frozen(&self) -> bool {
        return false;
    }

    fn get_timeline_id(&self) -> ZTimelineId {
        return self.timelineid;
    }

    fn get_relish_tag(&self) -> RelishTag {
        return self.rel;
    }

    fn get_start_lsn(&self) -> Lsn {
        return self.start_lsn;
    }

    /// An in-memory layer has no end; report the maximum possible LSN.
    fn get_end_lsn(&self) -> Lsn {
        return Lsn(u64::MAX);
    }

    fn is_dropped(&self) -> bool {
        let inner = self.inner.lock().unwrap();
        inner.drop_lsn.is_some()
    }

    /// Look up given page in the cache.
    ///
    /// Collects the newest page image at or before 'lsn' plus any WAL
    /// records on top of it, and performs WAL redo if needed.
    fn get_page_at_lsn(
        &self,
        walredo_mgr: &dyn WalRedoManager,
        blknum: u32,
        lsn: Lsn,
    ) -> Result<Bytes> {
        // Scan the BTreeMap backwards, starting from the given entry.
        let mut records: Vec<WALRecord> = Vec::new();
        let mut page_img: Option<Bytes> = None;
        // LSN at which we still need a base image; None once we have found
        // a page image or a will_init record.
        let mut need_base_image_lsn: Option<Lsn> = Some(lsn);

        {
            let inner = self.inner.lock().unwrap();
            let minkey = (blknum, Lsn(0));
            let maxkey = (blknum, lsn);
            let mut iter = inner
                .page_versions
                .range((Included(&minkey), Included(&maxkey)));
            // Walk from newest to oldest, accumulating WAL records until we
            // hit a full page image or a record that initializes the page.
            while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() {
                if let Some(img) = &entry.page_image {
                    page_img = Some(img.clone());
                    need_base_image_lsn = None;
                    break;
                } else if let Some(rec) = &entry.record {
                    records.push(rec.clone());
                    if rec.will_init {
                        // This WAL record initializes the page, so no need to go further back
                        need_base_image_lsn = None;
                        break;
                    } else {
                        need_base_image_lsn = Some(*entry_lsn);
                    }
                } else {
                    // No base image, and no WAL record. Huh?
                    bail!("no page image or WAL record for requested page");
                }
            }

            // release lock on 'page_versions'
        }
        // We collected the records newest-first; redo needs them oldest-first.
        records.reverse();

        // If we needed a base image to apply the WAL records against, we should have found it in memory.
        if let Some(lsn) = need_base_image_lsn {
            if records.is_empty() {
                // no records, and no base image. This can happen if PostgreSQL extends a relation
                // but never writes the page.
                //
                // Would be nice to detect that situation better.
                warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn);
                return Ok(ZERO_PAGE.clone());
            }
            bail!(
                "No base image found for page {} blk {} at {}/{}",
                self.rel,
                blknum,
                self.timelineid,
                lsn
            );
        }

        // If we have a page image, and no WAL, we're all set
        if records.is_empty() {
            if let Some(img) = page_img {
                trace!(
                    "found page image for blk {} in {} at {}/{}, no WAL redo required",
                    blknum,
                    self.rel,
                    self.timelineid,
                    lsn
                );
                Ok(img)
            } else {
                // FIXME: this ought to be an error?
                warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn);
                Ok(ZERO_PAGE.clone())
            }
        } else {
            // We need to do WAL redo.
            //
            // If we don't have a base image, then the oldest WAL record better initialize
            // the page
            if page_img.is_none() && !records.first().unwrap().will_init {
                // FIXME: this ought to be an error?
                warn!(
                    "Base image for page {}/{} at {} not found, but got {} WAL records",
                    self.rel,
                    blknum,
                    lsn,
                    records.len()
                );
                Ok(ZERO_PAGE.clone())
            } else {
                if page_img.is_some() {
                    trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn);
                } else {
                    trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn);
                }
                let img = walredo_mgr.request_redo(self.rel, blknum, lsn, page_img, records)?;

                // Cache the materialized page so the next lookup is cheap.
                self.put_page_image(blknum, lsn, img.clone())?;

                Ok(img)
            }
        }
    }

    /// Get size of the relation at given LSN
    fn get_relish_size(&self, lsn: Lsn) -> Result<Option<u32>> {
        // Scan the BTreeMap backwards, starting from the given entry.
        let inner = self.inner.lock().unwrap();
        let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));

        if let Some((_entry_lsn, entry)) = iter.next_back() {
            let result = *entry;
            drop(inner);
            trace!("get_relish_size: {} at {} -> {}", self.rel, lsn, result);
            Ok(Some(result))
        } else {
            Ok(None)
        }
    }

    /// Does this relation exist at given LSN?
    fn get_rel_exists(&self, lsn: Lsn) -> Result<bool> {
        let inner = self.inner.lock().unwrap();

        // Is the requested LSN after the rel was dropped?
        if let Some(drop_lsn) = inner.drop_lsn {
            if lsn >= drop_lsn {
                return Ok(false);
            }
        }

        // Otherwise, it exists
        Ok(true)
    }

    // Write operations

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
    fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<()> {
        trace!(
            "put_page_version blk {} of {} at {}/{}",
            blknum,
            self.rel,
            self.timelineid,
            lsn
        );
        let mut inner = self.inner.lock().unwrap();

        let old = inner.page_versions.insert((blknum, lsn), pv);

        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
            warn!(
                "Page version of rel {:?} blk {} at {} already exists",
                self.rel, blknum, lsn
            );
        }

        // Also update the relation size, if this extended the relation.
        if self.rel.is_blocky() {
            let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));

            let oldsize;
            if let Some((_entry_lsn, entry)) = iter.next_back() {
                oldsize = *entry;
            } else {
                // No size recorded yet; treat the relation as empty.
                oldsize = 0;
                //bail!("No old size found for {} at {}", self.tag, lsn);
            }
            if blknum >= oldsize {
                trace!(
                    "enlarging relation {} from {} to {} blocks at {}",
                    self.rel,
                    oldsize,
                    blknum + 1,
                    lsn
                );
                inner.relsizes.insert(lsn, blknum + 1);
            }
        }

        Ok(())
    }

    /// Remember that the relation was truncated at given LSN
    fn put_truncation(&self, lsn: Lsn, relsize: u32) -> anyhow::Result<()> {
        let mut inner = self.inner.lock().unwrap();
        let old = inner.relsizes.insert(lsn, relsize);

        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
            warn!("Inserting truncation, but had an entry for the LSN already");
        }

        Ok(())
    }

    /// Remember that the relation was dropped at given LSN
    fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()> {
        let mut inner = self.inner.lock().unwrap();

        // A relation can only be dropped once.
        assert!(inner.drop_lsn.is_none());
        inner.drop_lsn = Some(lsn);

        info!("dropped relation {} at {}", self.rel, lsn);

        Ok(())
    }

    ///
    /// Write this in-memory layer to disk, as a snapshot layer.
    ///
    /// The cutoff point for the layer that's written to disk is 'end_lsn'.
    ///
    /// Returns new layers that replace this one. Always returns a
    /// SnapshotLayer containing the page versions that were written to disk,
    /// but if there were page versions newer than 'end_lsn', also return a new
    /// in-memory layer containing those page versions. The caller replaces
    /// this layer with the returned layers in the layer map.
    ///
    fn freeze(
        &self,
        cutoff_lsn: Lsn,
        walredo_mgr: &dyn WalRedoManager,
    ) -> Result<Vec<Arc<dyn Layer>>> {
        info!(
            "freezing in memory layer for {} on timeline {} at {}",
            self.rel, self.timelineid, cutoff_lsn
        );

        let inner = self.inner.lock().unwrap();

        // Normally, use the cutoff LSN as the end of the frozen layer.
        // But if the relation was dropped, we know that there are no
        // more changes coming in for it, and in particular we know that
        // there are no changes "in flight" for the LSN anymore, so we use
        // the drop LSN instead. The drop-LSN could be ahead of the
        // caller-specified LSN!
        let dropped = inner.drop_lsn.is_some();
        let end_lsn = if dropped {
            inner.drop_lsn.unwrap()
        } else {
            cutoff_lsn
        };

        // Divide all the page versions into old and new at the 'end_lsn' cutoff point.
        let mut before_page_versions;
        let mut before_relsizes;
        let mut after_page_versions;
        let mut after_relsizes;
        if !dropped {
            before_relsizes = BTreeMap::new();
            after_relsizes = BTreeMap::new();
            for (lsn, size) in inner.relsizes.iter() {
                if *lsn > end_lsn {
                    after_relsizes.insert(*lsn, *size);
                } else {
                    before_relsizes.insert(*lsn, *size);
                }
            }

            before_page_versions = BTreeMap::new();
            after_page_versions = BTreeMap::new();
            for ((blknum, lsn), pv) in inner.page_versions.iter() {
                if *lsn > end_lsn {
                    after_page_versions.insert((*blknum, *lsn), pv.clone());
                } else {
                    before_page_versions.insert((*blknum, *lsn), pv.clone());
                }
            }
        } else {
            // Dropped relation: everything goes into the frozen layer.
            before_page_versions = inner.page_versions.clone();
            before_relsizes = inner.relsizes.clone();
            after_relsizes = BTreeMap::new();
            after_page_versions = BTreeMap::new();
        }

        // we can release the lock now.
        drop(inner);

        // Write the page versions before the cutoff to disk.
        let snapfile = SnapshotLayer::create(
            self.conf,
            self.timelineid,
            self.tenantid,
            self.rel,
            self.start_lsn,
            end_lsn,
            dropped,
            before_page_versions,
            before_relsizes,
        )?;
        let mut result: Vec<Arc<dyn Layer>> = Vec::new();

        // If there were any page versions after the cutoff, initialize a new in-memory layer
        // to hold them
        if !after_relsizes.is_empty() || !after_page_versions.is_empty() {
            info!("created new in-mem layer for {} {}-", self.rel, end_lsn);

            let new_layer = Self::copy_snapshot(
                self.conf,
                walredo_mgr,
                &snapfile,
                self.timelineid,
                self.tenantid,
                end_lsn,
            )?;
            let mut new_inner = new_layer.inner.lock().unwrap();
            new_inner.page_versions.append(&mut after_page_versions);
            new_inner.relsizes.append(&mut after_relsizes);
            drop(new_inner);

            result.push(Arc::new(new_layer));
        }
        result.push(Arc::new(snapfile));

        Ok(result)
    }

    fn delete(&self) -> Result<()> {
        // Nothing to do. When the reference is dropped, the memory is released.
        Ok(())
    }

    fn unload(&self) -> Result<()> {
        // cannot unload in-memory layer. Freeze instead
        Ok(())
    }
}
|
||||||
|
|
||||||
|
impl InMemoryLayer {
|
||||||
|
///
|
||||||
|
/// Create a new, empty, in-memory layer
|
||||||
|
///
|
||||||
|
pub fn create(
|
||||||
|
conf: &'static PageServerConf,
|
||||||
|
timelineid: ZTimelineId,
|
||||||
|
tenantid: ZTenantId,
|
||||||
|
rel: RelishTag,
|
||||||
|
start_lsn: Lsn,
|
||||||
|
) -> Result<InMemoryLayer> {
|
||||||
|
trace!(
|
||||||
|
"initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
|
||||||
|
rel,
|
||||||
|
timelineid,
|
||||||
|
start_lsn
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(InMemoryLayer {
|
||||||
|
conf,
|
||||||
|
timelineid,
|
||||||
|
tenantid,
|
||||||
|
rel,
|
||||||
|
start_lsn,
|
||||||
|
inner: Mutex::new(InMemoryLayerInner {
|
||||||
|
drop_lsn: None,
|
||||||
|
page_versions: BTreeMap::new(),
|
||||||
|
relsizes: BTreeMap::new(),
|
||||||
|
}),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// Initialize a new InMemoryLayer for, by copying the state at the given
|
||||||
|
/// point in time from given existing layer.
|
||||||
|
///
|
||||||
|
pub fn copy_snapshot(
|
||||||
|
conf: &'static PageServerConf,
|
||||||
|
walredo_mgr: &dyn WalRedoManager,
|
||||||
|
src: &dyn Layer,
|
||||||
|
timelineid: ZTimelineId,
|
||||||
|
tenantid: ZTenantId,
|
||||||
|
lsn: Lsn,
|
||||||
|
) -> Result<InMemoryLayer> {
|
||||||
|
trace!(
|
||||||
|
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
|
||||||
|
src.get_relish_tag(),
|
||||||
|
timelineid,
|
||||||
|
lsn
|
||||||
|
);
|
||||||
|
let mut page_versions = BTreeMap::new();
|
||||||
|
let mut relsizes = BTreeMap::new();
|
||||||
|
|
||||||
|
let size;
|
||||||
|
if src.get_relish_tag().is_blocky() {
|
||||||
|
if let Some(sz) = src.get_relish_size(lsn)? {
|
||||||
|
relsizes.insert(lsn, sz);
|
||||||
|
size = sz;
|
||||||
|
} else {
|
||||||
|
bail!("no size found or {} at {}", src.get_relish_tag(), lsn);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
size = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for blknum in 0..size {
|
||||||
|
let img = src.get_page_at_lsn(walredo_mgr, blknum, lsn)?;
|
||||||
|
let pv = PageVersion {
|
||||||
|
page_image: Some(img),
|
||||||
|
record: None,
|
||||||
|
};
|
||||||
|
page_versions.insert((blknum, lsn), pv);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(InMemoryLayer {
|
||||||
|
conf,
|
||||||
|
timelineid,
|
||||||
|
tenantid,
|
||||||
|
rel: src.get_relish_tag(),
|
||||||
|
start_lsn: lsn,
|
||||||
|
inner: Mutex::new(InMemoryLayerInner {
|
||||||
|
drop_lsn: None,
|
||||||
|
page_versions: page_versions,
|
||||||
|
relsizes: relsizes,
|
||||||
|
}),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// debugging function to print out the contents of the layer
|
||||||
|
#[allow(unused)]
|
||||||
|
pub fn dump(&self) -> String {
|
||||||
|
let mut result = format!(
|
||||||
|
"----- inmemory layer for {} {}-> ----\n",
|
||||||
|
self.rel, self.start_lsn
|
||||||
|
);
|
||||||
|
|
||||||
|
let inner = self.inner.lock().unwrap();
|
||||||
|
|
||||||
|
for (k, v) in inner.relsizes.iter() {
|
||||||
|
result += &format!("{}: {}\n", k, v);
|
||||||
|
}
|
||||||
|
for (k, v) in inner.page_versions.iter() {
|
||||||
|
result += &format!(
|
||||||
|
"blk {} at {}: {}/{}\n",
|
||||||
|
k.0,
|
||||||
|
k.1,
|
||||||
|
v.page_image.is_some(),
|
||||||
|
v.record.is_some()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
result
|
||||||
|
}
|
||||||
|
}
|
||||||
132
pageserver/src/layered_repository/layer_map.rs
Normal file
132
pageserver/src/layered_repository/layer_map.rs
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
//!
|
||||||
|
//! The layer map tracks what layers exist for all the relations in a timeline.
|
||||||
|
//!
|
||||||
|
//! When the timeline is first accessed, the server lists of all snapshot files
|
||||||
|
//! in the timelines/<timelineid> directory, and populates this map with
|
||||||
|
//! SnapshotLayers corresponding to each file. When new WAL is received,
|
||||||
|
//! we create InMemoryLayers to hold the incoming records. Now and then,
|
||||||
|
//! in the checkpoint() function, the in-memory layers are frozen, forming
|
||||||
|
//! new snapshot layers and corresponding files are written to disk.
|
||||||
|
//!
|
||||||
|
|
||||||
|
use crate::layered_repository::storage_layer::Layer;
|
||||||
|
use crate::relish::*;
|
||||||
|
use anyhow::Result;
|
||||||
|
use log::*;
|
||||||
|
use std::collections::BTreeMap;
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use std::ops::Bound::Included;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use zenith_utils::lsn::Lsn;
|
||||||
|
|
||||||
|
/// LayerMap is a BTreeMap keyed by RelishTag and the layer's start LSN.
|
||||||
|
/// It provides a couple of convenience functions over a plain BTreeMap
|
||||||
|
/// LayerMap is a BTreeMap keyed by RelishTag and the layer's start LSN.
/// It provides a couple of convenience functions over a plain BTreeMap
pub struct LayerMap {
    // Maps (relish, start LSN) to the layer covering that range. Kept
    // public so callers can iterate over all layers directly.
    pub inner: BTreeMap<(RelishTag, Lsn), Arc<dyn Layer>>,
}
|
||||||
|
|
||||||
|
impl LayerMap {
|
||||||
|
///
|
||||||
|
/// Look up using the given rel tag and LSN. This differs from a plain
|
||||||
|
/// key-value lookup in that if there is any layer that covers the
|
||||||
|
/// given LSN, or precedes the given LSN, it is returned. In other words,
|
||||||
|
/// you don't need to know the exact start LSN of the layer.
|
||||||
|
///
|
||||||
|
pub fn get(&self, tag: RelishTag, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||||
|
let startkey = (tag, Lsn(0));
|
||||||
|
let endkey = (tag, lsn);
|
||||||
|
|
||||||
|
if let Some((_k, v)) = self
|
||||||
|
.inner
|
||||||
|
.range((Included(startkey), Included(endkey)))
|
||||||
|
.next_back()
|
||||||
|
{
|
||||||
|
Some(Arc::clone(v))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn insert(&mut self, layer: Arc<dyn Layer>) {
|
||||||
|
let rel = layer.get_relish_tag();
|
||||||
|
let start_lsn = layer.get_start_lsn();
|
||||||
|
|
||||||
|
self.inner.insert((rel, start_lsn), Arc::clone(&layer));
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn remove(&mut self, layer: &dyn Layer) {
|
||||||
|
let rel = layer.get_relish_tag();
|
||||||
|
let start_lsn = layer.get_start_lsn();
|
||||||
|
|
||||||
|
self.inner.remove(&(rel, start_lsn));
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn list_rels(&self, spcnode: u32, dbnode: u32) -> Result<HashSet<RelTag>> {
|
||||||
|
let mut rels: HashSet<RelTag> = HashSet::new();
|
||||||
|
|
||||||
|
// Scan the timeline directory to get all rels in this timeline.
|
||||||
|
for ((rel, _lsn), _l) in self.inner.iter() {
|
||||||
|
if let RelishTag::Relation(reltag) = rel {
|
||||||
|
// FIXME: skip if it was dropped before the requested LSN. But there is no
|
||||||
|
// LSN argument
|
||||||
|
|
||||||
|
if (spcnode == 0 || reltag.spcnode == spcnode)
|
||||||
|
&& (dbnode == 0 || reltag.dbnode == dbnode)
|
||||||
|
{
|
||||||
|
rels.insert(*reltag);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(rels)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn list_nonrels(&self, _lsn: Lsn) -> Result<HashSet<RelishTag>> {
|
||||||
|
let mut rels: HashSet<RelishTag> = HashSet::new();
|
||||||
|
|
||||||
|
// Scan the timeline directory to get all rels in this timeline.
|
||||||
|
for ((rel, _lsn), _l) in self.inner.iter() {
|
||||||
|
// FIXME: skip if it was dropped before the requested LSN.
|
||||||
|
|
||||||
|
if let RelishTag::Relation(_) = rel {
|
||||||
|
} else {
|
||||||
|
rels.insert(*rel);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(rels)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Is there a newer layer for given relation?
|
||||||
|
pub fn newer_layer_exists(&self, rel: RelishTag, lsn: Lsn) -> bool {
|
||||||
|
let startkey = (rel, lsn);
|
||||||
|
let endkey = (rel, Lsn(u64::MAX));
|
||||||
|
|
||||||
|
for ((_rel, newer_lsn), layer) in self.inner.range((Included(startkey), Included(endkey))) {
|
||||||
|
if layer.get_end_lsn() > lsn {
|
||||||
|
trace!(
|
||||||
|
"found later layer for rel {}, {} {}-{}",
|
||||||
|
rel,
|
||||||
|
lsn,
|
||||||
|
newer_lsn,
|
||||||
|
layer.get_end_lsn()
|
||||||
|
);
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
trace!(
|
||||||
|
"found singleton layer for rel {}, {} {}",
|
||||||
|
rel, lsn, newer_lsn
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
trace!("no later layer found for rel {}, {}", rel, lsn);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for LayerMap {
|
||||||
|
fn default() -> Self {
|
||||||
|
LayerMap {
|
||||||
|
inner: BTreeMap::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
631
pageserver/src/layered_repository/snapshot_layer.rs
Normal file
631
pageserver/src/layered_repository/snapshot_layer.rs
Normal file
@@ -0,0 +1,631 @@
|
|||||||
|
//!
|
||||||
|
//! A SnapshotLayer represents one snapshot file on disk. One file holds all page
|
||||||
|
//! version and size information of one relation, in a range of LSN.
|
||||||
|
//! The name "snapshot file" is a bit of a misnomer because a snapshot file doesn't
|
||||||
|
//! contain a snapshot at a specific LSN, but rather all the page versions in a range
|
||||||
|
//! of LSNs.
|
||||||
|
//!
|
||||||
|
//! Currently, a snapshot file contains full information needed to reconstruct any
|
||||||
|
//! page version in the LSN range, without consulting any other snapshot files. When
|
||||||
|
//! a new snapshot file is created for writing, the full contents of relation are
|
||||||
|
//! materialized as it is at the beginning of the LSN range. That can be very expensive,
|
||||||
|
//! we should find a way to store differential files. But this keeps the read-side
|
||||||
|
//! of things simple. You can find the correct snapshot file based on RelishTag and
|
||||||
|
//! timeline+LSN, and once you've located it, you have all the data you need to in that
|
||||||
|
//! file.
|
||||||
|
//!
|
||||||
|
//! When a snapshot file needs to be accessed, we slurp the whole file into memory, into
|
||||||
|
//! the SnapshotLayer struct. See load() and unload() functions.
|
||||||
|
//!
|
||||||
|
//! On disk, the snapshot files are stored in timelines/<timelineid> directory.
|
||||||
|
//! Currently, there are no subdirectories, and each snapshot file is named like this:
|
||||||
|
//!
|
||||||
|
//! <spcnode>_<dbnode>_<relnode>_<forknum>_<start LSN>_<end LSN>
|
||||||
|
//!
|
||||||
|
//! For example:
|
||||||
|
//!
|
||||||
|
//! 1663_13990_2609_0_000000000169C348_000000000169C349
|
||||||
|
//!
|
||||||
|
//! If a relation is dropped, we add a '_DROPPED' to the end of the filename to indicate that.
|
||||||
|
//! So the above example would become:
|
||||||
|
//!
|
||||||
|
//! 1663_13990_2609_0_000000000169C348_000000000169C349_DROPPED
|
||||||
|
//!
|
||||||
|
//! The end LSN indicates when it was dropped in that case, we don't store it in the
|
||||||
|
//! file contents in any way.
|
||||||
|
//!
|
||||||
|
//! A snapshot file is constructed using the 'bookfile' crate. Each file consists of two
|
||||||
|
//! parts: the page versions and the relation sizes. They are stored as separate chapters.
|
||||||
|
//!
|
||||||
|
use crate::layered_repository::storage_layer::Layer;
|
||||||
|
use crate::layered_repository::storage_layer::PageVersion;
|
||||||
|
use crate::layered_repository::storage_layer::ZERO_PAGE;
|
||||||
|
use crate::relish::*;
|
||||||
|
use crate::repository::WALRecord;
|
||||||
|
use crate::walredo::WalRedoManager;
|
||||||
|
use crate::PageServerConf;
|
||||||
|
use crate::{ZTenantId, ZTimelineId};
|
||||||
|
use anyhow::{bail, Result};
|
||||||
|
use bytes::Bytes;
|
||||||
|
use log::*;
|
||||||
|
use std::collections::BTreeMap;
|
||||||
|
use std::fmt;
|
||||||
|
use std::fs;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::Write;
|
||||||
|
use std::ops::Bound::Included;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::sync::{Arc, Mutex, MutexGuard};
|
||||||
|
|
||||||
|
use bookfile::{Book, BookWriter};
|
||||||
|
|
||||||
|
use zenith_utils::bin_ser::BeSer;
|
||||||
|
use zenith_utils::lsn::Lsn;
|
||||||
|
|
||||||
|
// Magic constant to identify a Zenith snapshot file
|
||||||
|
static SNAPSHOT_FILE_MAGIC: u32 = 0x5A616E01;
|
||||||
|
|
||||||
|
static PAGE_VERSIONS_CHAPTER: u64 = 1;
|
||||||
|
static REL_SIZES_CHAPTER: u64 = 2;
|
||||||
|
|
||||||
|
/// Parsed representation of a snapshot file name: which relish it holds,
/// the (inclusive start, exclusive end) LSN range, and whether the relish
/// was dropped at 'end_lsn'.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
struct SnapshotFileName {
    rel: RelishTag,
    start_lsn: Lsn,
    end_lsn: Lsn,
    // True if the file name carries the '_DROPPED' suffix.
    dropped: bool,
}
|
||||||
|
|
||||||
|
impl SnapshotFileName {
|
||||||
|
fn from_str(fname: &str) -> Option<Self> {
|
||||||
|
// Split the filename into parts
|
||||||
|
//
|
||||||
|
// <spcnode>_<dbnode>_<relnode>_<forknum>_<start LSN>_<end LSN>
|
||||||
|
//
|
||||||
|
// or if it was dropped:
|
||||||
|
//
|
||||||
|
// <spcnode>_<dbnode>_<relnode>_<forknum>_<start LSN>_<end LSN>_DROPPED
|
||||||
|
//
|
||||||
|
let rel;
|
||||||
|
let mut parts;
|
||||||
|
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||||
|
parts = rest.split('_');
|
||||||
|
rel = RelishTag::Relation(RelTag {
|
||||||
|
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||||
|
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||||
|
relnode: parts.next()?.parse::<u32>().ok()?,
|
||||||
|
forknum: parts.next()?.parse::<u8>().ok()?,
|
||||||
|
});
|
||||||
|
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
|
||||||
|
parts = rest.split('_');
|
||||||
|
rel = RelishTag::Slru {
|
||||||
|
slru: SlruKind::Clog,
|
||||||
|
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||||
|
};
|
||||||
|
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
|
||||||
|
parts = rest.split('_');
|
||||||
|
rel = RelishTag::Slru {
|
||||||
|
slru: SlruKind::MultiXactMembers,
|
||||||
|
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||||
|
};
|
||||||
|
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
|
||||||
|
parts = rest.split('_');
|
||||||
|
rel = RelishTag::Slru {
|
||||||
|
slru: SlruKind::MultiXactOffsets,
|
||||||
|
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||||
|
};
|
||||||
|
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
|
||||||
|
parts = rest.split('_');
|
||||||
|
rel = RelishTag::FileNodeMap {
|
||||||
|
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||||
|
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||||
|
};
|
||||||
|
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
|
||||||
|
parts = rest.split('_');
|
||||||
|
rel = RelishTag::TwoPhase {
|
||||||
|
xid: parts.next()?.parse::<u32>().ok()?,
|
||||||
|
};
|
||||||
|
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
|
||||||
|
parts = rest.split('_');
|
||||||
|
rel = RelishTag::Checkpoint;
|
||||||
|
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
|
||||||
|
parts = rest.split('_');
|
||||||
|
rel = RelishTag::ControlFile;
|
||||||
|
} else {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let start_lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||||
|
let end_lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||||
|
|
||||||
|
let mut dropped = false;
|
||||||
|
if let Some(suffix) = parts.next() {
|
||||||
|
if suffix == "DROPPED" {
|
||||||
|
dropped = true;
|
||||||
|
} else {
|
||||||
|
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if parts.next().is_some() {
|
||||||
|
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(SnapshotFileName {
|
||||||
|
rel,
|
||||||
|
start_lsn,
|
||||||
|
end_lsn,
|
||||||
|
dropped,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn to_string(&self) -> String {
|
||||||
|
let basename = match self.rel {
|
||||||
|
RelishTag::Relation(reltag) => format!(
|
||||||
|
"rel_{}_{}_{}_{}",
|
||||||
|
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
|
||||||
|
),
|
||||||
|
RelishTag::Slru {
|
||||||
|
slru: SlruKind::Clog,
|
||||||
|
segno,
|
||||||
|
} => format!("pg_xact_{:04X}", segno),
|
||||||
|
RelishTag::Slru {
|
||||||
|
slru: SlruKind::MultiXactMembers,
|
||||||
|
segno,
|
||||||
|
} => format!("pg_multixact_members_{:04X}", segno),
|
||||||
|
RelishTag::Slru {
|
||||||
|
slru: SlruKind::MultiXactOffsets,
|
||||||
|
segno,
|
||||||
|
} => format!("pg_multixact_offsets_{:04X}", segno),
|
||||||
|
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||||
|
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||||
|
}
|
||||||
|
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||||
|
RelishTag::Checkpoint => format!("pg_control_checkpoint"),
|
||||||
|
RelishTag::ControlFile => format!("pg_control"),
|
||||||
|
};
|
||||||
|
|
||||||
|
format!(
|
||||||
|
"{}_{:016X}_{:016X}{}",
|
||||||
|
basename,
|
||||||
|
u64::from(self.start_lsn),
|
||||||
|
u64::from(self.end_lsn),
|
||||||
|
if self.dropped { "_DROPPED" } else { "" }
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for SnapshotFileName {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
write!(f, "{}", self.to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// SnapshotLayer is the in-memory data structure associated with an
|
||||||
|
/// on-disk snapshot file. We keep a SnapshotLayer in memory for each
|
||||||
|
/// file, in the LayerMap. If a layer is in "loaded" state, we have a
|
||||||
|
/// copy of the file in memory, in 'inner'. Otherwise the struct is
|
||||||
|
/// just a placeholder for a file that exists on disk, and it needs to
|
||||||
|
/// be loaded before using it in queries.
|
||||||
|
///
|
||||||
|
pub struct SnapshotLayer {
|
||||||
|
conf: &'static PageServerConf,
|
||||||
|
pub tenantid: ZTenantId,
|
||||||
|
pub timelineid: ZTimelineId,
|
||||||
|
pub rel: RelishTag,
|
||||||
|
|
||||||
|
//
|
||||||
|
// This entry contains all the changes from 'start_lsn' to 'end_lsn'. The
|
||||||
|
// start is inclusive, and end is exclusive.
|
||||||
|
pub start_lsn: Lsn,
|
||||||
|
pub end_lsn: Lsn,
|
||||||
|
|
||||||
|
dropped: bool,
|
||||||
|
|
||||||
|
inner: Mutex<SnapshotLayerInner>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct SnapshotLayerInner {
|
||||||
|
/// If false, the 'page_versions' and 'relsizes' have not been
|
||||||
|
/// loaded into memory yet.
|
||||||
|
loaded: bool,
|
||||||
|
|
||||||
|
/// All versions of all pages in the file are are kept here.
|
||||||
|
/// Indexed by block number and LSN.
|
||||||
|
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
|
||||||
|
|
||||||
|
/// `relsizes` tracks the size of the relation at different points in time.
|
||||||
|
relsizes: BTreeMap<Lsn, u32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Layer for SnapshotLayer {
|
||||||
|
fn is_frozen(&self) -> bool {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_timeline_id(&self) -> ZTimelineId {
|
||||||
|
return self.timelineid;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_relish_tag(&self) -> RelishTag {
|
||||||
|
return self.rel;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_dropped(&self) -> bool {
|
||||||
|
return self.dropped;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_start_lsn(&self) -> Lsn {
|
||||||
|
return self.start_lsn;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_end_lsn(&self) -> Lsn {
|
||||||
|
return self.end_lsn;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Look up given page in the cache.
|
||||||
|
fn get_page_at_lsn(
|
||||||
|
&self,
|
||||||
|
walredo_mgr: &dyn WalRedoManager,
|
||||||
|
blknum: u32,
|
||||||
|
lsn: Lsn,
|
||||||
|
) -> Result<Bytes> {
|
||||||
|
// Scan the BTreeMap backwards, starting from the given entry.
|
||||||
|
let mut records: Vec<WALRecord> = Vec::new();
|
||||||
|
let mut page_img: Option<Bytes> = None;
|
||||||
|
let mut need_base_image_lsn: Option<Lsn> = Some(lsn);
|
||||||
|
{
|
||||||
|
let inner = self.load()?;
|
||||||
|
let minkey = (blknum, Lsn(0));
|
||||||
|
let maxkey = (blknum, lsn);
|
||||||
|
let mut iter = inner
|
||||||
|
.page_versions
|
||||||
|
.range((Included(&minkey), Included(&maxkey)));
|
||||||
|
while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() {
|
||||||
|
if let Some(img) = &entry.page_image {
|
||||||
|
page_img = Some(img.clone());
|
||||||
|
need_base_image_lsn = None;
|
||||||
|
break;
|
||||||
|
} else if let Some(rec) = &entry.record {
|
||||||
|
records.push(rec.clone());
|
||||||
|
if rec.will_init {
|
||||||
|
// This WAL record initializes the page, so no need to go further back
|
||||||
|
need_base_image_lsn = None;
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
need_base_image_lsn = Some(*entry_lsn);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// No base image, and no WAL record. Huh?
|
||||||
|
bail!("no page image or WAL record for requested page");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// release lock on 'inner'
|
||||||
|
}
|
||||||
|
records.reverse();
|
||||||
|
|
||||||
|
// If we needed a base image to apply the WAL records against, we should have found it in memory.
|
||||||
|
if let Some(lsn) = need_base_image_lsn {
|
||||||
|
if records.is_empty() {
|
||||||
|
// no records, and no base image. This can happen if PostgreSQL extends a relation
|
||||||
|
// but never writes the page.
|
||||||
|
//
|
||||||
|
// Would be nice to detect that situation better.
|
||||||
|
warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn);
|
||||||
|
return Ok(ZERO_PAGE.clone());
|
||||||
|
}
|
||||||
|
bail!(
|
||||||
|
"No base image found for page {} blk {} at {}/{}",
|
||||||
|
self.rel,
|
||||||
|
blknum,
|
||||||
|
self.timelineid,
|
||||||
|
lsn
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we have a page image, and no WAL, we're all set
|
||||||
|
if records.is_empty() {
|
||||||
|
if let Some(img) = page_img {
|
||||||
|
trace!(
|
||||||
|
"found page image for blk {} in {} at {}/{}, no WAL redo required",
|
||||||
|
blknum,
|
||||||
|
self.rel,
|
||||||
|
self.timelineid,
|
||||||
|
lsn
|
||||||
|
);
|
||||||
|
Ok(img)
|
||||||
|
} else {
|
||||||
|
// FIXME: this ought to be an error?
|
||||||
|
warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn);
|
||||||
|
Ok(ZERO_PAGE.clone())
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// We need to do WAL redo.
|
||||||
|
//
|
||||||
|
// If we don't have a base image, then the oldest WAL record better initialize
|
||||||
|
// the page
|
||||||
|
if page_img.is_none() && !records.first().unwrap().will_init {
|
||||||
|
// FIXME: this ought to be an error?
|
||||||
|
warn!(
|
||||||
|
"Base image for page {} blk {} at {} not found, but got {} WAL records",
|
||||||
|
self.rel,
|
||||||
|
blknum,
|
||||||
|
lsn,
|
||||||
|
records.len()
|
||||||
|
);
|
||||||
|
Ok(ZERO_PAGE.clone())
|
||||||
|
} else {
|
||||||
|
if page_img.is_some() {
|
||||||
|
trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn);
|
||||||
|
} else {
|
||||||
|
trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn);
|
||||||
|
}
|
||||||
|
let img = walredo_mgr.request_redo(self.rel, blknum, lsn, page_img, records)?;
|
||||||
|
|
||||||
|
// FIXME: Should we memoize the page image in memory, so that
|
||||||
|
// we wouldn't need to reconstruct it again, if it's requested again?
|
||||||
|
//self.put_page_image(blknum, lsn, img.clone())?;
|
||||||
|
|
||||||
|
Ok(img)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get size of the relation at given LSN
|
||||||
|
fn get_relish_size(&self, lsn: Lsn) -> Result<Option<u32>> {
|
||||||
|
// Scan the BTreeMap backwards, starting from the given entry.
|
||||||
|
let inner = self.load()?;
|
||||||
|
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
|
||||||
|
|
||||||
|
if let Some((_entry_lsn, entry)) = iter.next_back() {
|
||||||
|
let result = *entry;
|
||||||
|
drop(inner);
|
||||||
|
trace!("get_relsize: {} at {} -> {}", self.rel, lsn, result);
|
||||||
|
Ok(Some(result))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Does this relation exist at given LSN?
|
||||||
|
fn get_rel_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||||
|
// Is the requested LSN after the rel was dropped?
|
||||||
|
if self.dropped && lsn >= self.end_lsn {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Otherwise, it exists.
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unsupported write operations
|
||||||
|
fn put_page_version(&self, blknum: u32, lsn: Lsn, _pv: PageVersion) -> Result<()> {
|
||||||
|
panic!(
|
||||||
|
"cannot modify historical snapshot layer, rel {} blk {} at {}/{}, {}-{}",
|
||||||
|
self.rel, blknum, self.timelineid, lsn, self.start_lsn, self.end_lsn
|
||||||
|
);
|
||||||
|
}
|
||||||
|
fn put_truncation(&self, _lsn: Lsn, _relsize: u32) -> anyhow::Result<()> {
|
||||||
|
bail!("cannot modify historical snapshot layer");
|
||||||
|
}
|
||||||
|
|
||||||
|
fn put_unlink(&self, _lsn: Lsn) -> anyhow::Result<()> {
|
||||||
|
bail!("cannot modify historical snapshot layer");
|
||||||
|
}
|
||||||
|
|
||||||
|
fn freeze(
|
||||||
|
&self,
|
||||||
|
_end_lsn: Lsn,
|
||||||
|
_walredo_mgr: &dyn WalRedoManager,
|
||||||
|
) -> Result<Vec<Arc<dyn Layer>>> {
|
||||||
|
bail!("cannot freeze historical snapshot layer");
|
||||||
|
}
|
||||||
|
|
||||||
|
fn delete(&self) -> Result<()> {
|
||||||
|
// delete underlying file
|
||||||
|
fs::remove_file(self.path())?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// Release most of the memory used by this layer. If it's accessed again later,
|
||||||
|
/// it will need to be loaded back.
|
||||||
|
///
|
||||||
|
fn unload(&self) -> Result<()> {
|
||||||
|
let mut inner = self.inner.lock().unwrap();
|
||||||
|
inner.page_versions = BTreeMap::new();
|
||||||
|
inner.relsizes = BTreeMap::new();
|
||||||
|
inner.loaded = false;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SnapshotLayer {
|
||||||
|
fn path(&self) -> PathBuf {
|
||||||
|
Self::path_for(
|
||||||
|
self.conf,
|
||||||
|
self.timelineid,
|
||||||
|
self.tenantid,
|
||||||
|
&SnapshotFileName {
|
||||||
|
rel: self.rel,
|
||||||
|
start_lsn: self.start_lsn,
|
||||||
|
end_lsn: self.end_lsn,
|
||||||
|
dropped: self.dropped,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn path_for(
|
||||||
|
conf: &'static PageServerConf,
|
||||||
|
timelineid: ZTimelineId,
|
||||||
|
tenantid: ZTenantId,
|
||||||
|
fname: &SnapshotFileName,
|
||||||
|
) -> PathBuf {
|
||||||
|
conf.timeline_path(&timelineid, &tenantid)
|
||||||
|
.join(fname.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a new snapshot file, using the given btreemaps containing the page versions and
|
||||||
|
/// relsizes.
|
||||||
|
///
|
||||||
|
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
|
||||||
|
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
|
||||||
|
/// expedient.
|
||||||
|
pub fn create(
|
||||||
|
conf: &'static PageServerConf,
|
||||||
|
timelineid: ZTimelineId,
|
||||||
|
tenantid: ZTenantId,
|
||||||
|
rel: RelishTag,
|
||||||
|
start_lsn: Lsn,
|
||||||
|
end_lsn: Lsn,
|
||||||
|
dropped: bool,
|
||||||
|
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
|
||||||
|
relsizes: BTreeMap<Lsn, u32>,
|
||||||
|
) -> Result<SnapshotLayer> {
|
||||||
|
let snapfile = SnapshotLayer {
|
||||||
|
conf: conf,
|
||||||
|
timelineid: timelineid,
|
||||||
|
tenantid: tenantid,
|
||||||
|
rel: rel,
|
||||||
|
start_lsn: start_lsn,
|
||||||
|
end_lsn,
|
||||||
|
dropped,
|
||||||
|
inner: Mutex::new(SnapshotLayerInner {
|
||||||
|
loaded: true,
|
||||||
|
page_versions: page_versions,
|
||||||
|
relsizes: relsizes,
|
||||||
|
}),
|
||||||
|
};
|
||||||
|
let inner = snapfile.inner.lock().unwrap();
|
||||||
|
|
||||||
|
// Write the in-memory btreemaps into a file
|
||||||
|
let path = snapfile.path();
|
||||||
|
|
||||||
|
// Note: This overwrites any existing file. There shouldn't be any.
|
||||||
|
// FIXME: throw an error instead?
|
||||||
|
let file = File::create(&path)?;
|
||||||
|
let book = BookWriter::new(file, SNAPSHOT_FILE_MAGIC)?;
|
||||||
|
|
||||||
|
// Write out page versions
|
||||||
|
let mut chapter = book.new_chapter(PAGE_VERSIONS_CHAPTER);
|
||||||
|
let buf = BTreeMap::ser(&inner.page_versions)?;
|
||||||
|
chapter.write_all(&buf)?;
|
||||||
|
let book = chapter.close()?;
|
||||||
|
|
||||||
|
// and relsizes to separate chapter
|
||||||
|
let mut chapter = book.new_chapter(REL_SIZES_CHAPTER);
|
||||||
|
let buf = BTreeMap::ser(&inner.relsizes)?;
|
||||||
|
chapter.write_all(&buf)?;
|
||||||
|
let book = chapter.close()?;
|
||||||
|
|
||||||
|
book.close()?;
|
||||||
|
|
||||||
|
trace!("saved {}", &path.display());
|
||||||
|
|
||||||
|
drop(inner);
|
||||||
|
|
||||||
|
Ok(snapfile)
|
||||||
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// Load the contents of the file into memory
|
||||||
|
///
|
||||||
|
fn load(&self) -> Result<MutexGuard<SnapshotLayerInner>> {
|
||||||
|
// quick exit if already loaded
|
||||||
|
let mut inner = self.inner.lock().unwrap();
|
||||||
|
|
||||||
|
if inner.loaded {
|
||||||
|
return Ok(inner);
|
||||||
|
}
|
||||||
|
|
||||||
|
let path = Self::path_for(
|
||||||
|
self.conf,
|
||||||
|
self.timelineid,
|
||||||
|
self.tenantid,
|
||||||
|
&SnapshotFileName {
|
||||||
|
rel: self.rel,
|
||||||
|
start_lsn: self.start_lsn,
|
||||||
|
end_lsn: self.end_lsn,
|
||||||
|
dropped: self.dropped,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
let file = File::open(&path)?;
|
||||||
|
let book = Book::new(file)?;
|
||||||
|
|
||||||
|
let chapter = book.read_chapter(PAGE_VERSIONS_CHAPTER)?;
|
||||||
|
let page_versions = BTreeMap::des(&chapter)?;
|
||||||
|
|
||||||
|
let chapter = book.read_chapter(REL_SIZES_CHAPTER)?;
|
||||||
|
let relsizes = BTreeMap::des(&chapter)?;
|
||||||
|
|
||||||
|
debug!("loaded from {}", &path.display());
|
||||||
|
|
||||||
|
*inner = SnapshotLayerInner {
|
||||||
|
loaded: true,
|
||||||
|
page_versions,
|
||||||
|
relsizes,
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(inner)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create SnapshotLayers representing all files on dik
|
||||||
|
///
|
||||||
|
// TODO: returning an Iterator would be more idiomatic
|
||||||
|
pub fn list_snapshot_files(
|
||||||
|
conf: &'static PageServerConf,
|
||||||
|
timelineid: ZTimelineId,
|
||||||
|
tenantid: ZTenantId,
|
||||||
|
) -> Result<Vec<Arc<dyn Layer>>> {
|
||||||
|
let path = conf.timeline_path(&timelineid, &tenantid);
|
||||||
|
|
||||||
|
let mut snapfiles: Vec<Arc<dyn Layer>> = Vec::new();
|
||||||
|
for direntry in fs::read_dir(path)? {
|
||||||
|
let fname = direntry?.file_name();
|
||||||
|
let fname = fname.to_str().unwrap();
|
||||||
|
|
||||||
|
if let Some(snapfilename) = SnapshotFileName::from_str(fname) {
|
||||||
|
let snapfile = SnapshotLayer {
|
||||||
|
conf,
|
||||||
|
timelineid,
|
||||||
|
tenantid,
|
||||||
|
rel: snapfilename.rel,
|
||||||
|
start_lsn: snapfilename.start_lsn,
|
||||||
|
end_lsn: snapfilename.end_lsn,
|
||||||
|
dropped: snapfilename.dropped,
|
||||||
|
inner: Mutex::new(SnapshotLayerInner {
|
||||||
|
loaded: false,
|
||||||
|
page_versions: BTreeMap::new(),
|
||||||
|
relsizes: BTreeMap::new(),
|
||||||
|
}),
|
||||||
|
};
|
||||||
|
|
||||||
|
snapfiles.push(Arc::new(snapfile));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Ok(snapfiles);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// debugging function to print out the contents of the layer
|
||||||
|
#[allow(unused)]
|
||||||
|
pub fn dump(&self) -> String {
|
||||||
|
let mut result = format!(
|
||||||
|
"----- snapshot layer for {} {}-{} ----\n",
|
||||||
|
self.rel, self.start_lsn, self.end_lsn
|
||||||
|
);
|
||||||
|
|
||||||
|
let inner = self.inner.lock().unwrap();
|
||||||
|
for (k, v) in inner.relsizes.iter() {
|
||||||
|
result += &format!("{}: {}\n", k, v);
|
||||||
|
}
|
||||||
|
//for (k, v) in inner.page_versions.iter() {
|
||||||
|
// result += &format!("blk {} at {}: {}/{}\n", k.0, k.1, v.page_image.is_some(), v.record.is_some());
|
||||||
|
//}
|
||||||
|
|
||||||
|
result
|
||||||
|
}
|
||||||
|
}
|
||||||
123
pageserver/src/layered_repository/storage_layer.rs
Normal file
123
pageserver/src/layered_repository/storage_layer.rs
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
use crate::relish::RelishTag;
|
||||||
|
use crate::repository::WALRecord;
|
||||||
|
use crate::walredo::WalRedoManager;
|
||||||
|
use crate::ZTimelineId;
|
||||||
|
use anyhow::Result;
|
||||||
|
use bytes::Bytes;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use zenith_utils::lsn::Lsn;
|
||||||
|
|
||||||
|
pub static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||||
|
|
||||||
|
///
|
||||||
|
/// Represents a version of a page at a specific LSN. The LSN is the key of the
|
||||||
|
/// entry in the 'page_versions' hash, it is not duplicated here.
|
||||||
|
///
|
||||||
|
/// A page version can be stored as a full page image, or as WAL record that needs
|
||||||
|
/// to be applied over the previous page version to reconstruct this version.
|
||||||
|
///
|
||||||
|
/// It's also possible to have both a WAL record and a page image in the same
|
||||||
|
/// PageVersion. That happens if page version is originally stored as a WAL record
|
||||||
|
/// but it is later reconstructed by a GetPage@LSN request by performing WAL
|
||||||
|
/// redo. The get_page_at_lsn() code will store the reconstructed pag image next to
|
||||||
|
/// the WAL record in that case. TODO: That's pretty accidental, not the result
|
||||||
|
/// of any grand design. If we want to keep reconstructed page versions around, we
|
||||||
|
/// probably should have a separate buffer cache so that we could control the
|
||||||
|
/// replacement policy globally. Or if we keep a reconstructed page image, we
|
||||||
|
/// could throw away the WAL record.
|
||||||
|
///
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct PageVersion {
|
||||||
|
/// an 8kb page image
|
||||||
|
pub page_image: Option<Bytes>,
|
||||||
|
/// WAL record to get from previous page version to this one.
|
||||||
|
pub record: Option<WALRecord>,
|
||||||
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// A Layer holds all page versions for one relish, in a range of LSNs.
|
||||||
|
/// There are two kinds of layers, in-memory and snapshot layers. In-memory
|
||||||
|
/// layers are used to ingest incoming WAL, and provide fast access
|
||||||
|
/// to the recent page versions. Snaphot layers are stored on disk, and
|
||||||
|
/// are immutable.
|
||||||
|
///
|
||||||
|
/// Each layer contains a full snapshot of the relish at the start
|
||||||
|
/// LSN. In addition to that, it contains WAL (or more page images)
|
||||||
|
/// needed to recontruct any page version up to the end LSN.
|
||||||
|
///
|
||||||
|
pub trait Layer: Send + Sync {
|
||||||
|
// These functions identify the relish and the LSN range that this Layer
|
||||||
|
// holds.
|
||||||
|
fn get_timeline_id(&self) -> ZTimelineId;
|
||||||
|
fn get_relish_tag(&self) -> RelishTag;
|
||||||
|
fn get_start_lsn(&self) -> Lsn;
|
||||||
|
fn get_end_lsn(&self) -> Lsn;
|
||||||
|
fn is_dropped(&self) -> bool;
|
||||||
|
|
||||||
|
/// Frozen layers are stored on disk, an cannot accept cannot accept new WAL
|
||||||
|
/// records, whereas an unfrozen layer can still be modified, but is not
|
||||||
|
/// durable in case of a crash. Snapshot layers are always frozen, and
|
||||||
|
/// in-memory layers are always unfrozen.
|
||||||
|
fn is_frozen(&self) -> bool;
|
||||||
|
|
||||||
|
// Functions that correspond to the Timeline trait functions.
|
||||||
|
fn get_page_at_lsn(
|
||||||
|
&self,
|
||||||
|
walredo_mgr: &dyn WalRedoManager,
|
||||||
|
blknum: u32,
|
||||||
|
lsn: Lsn,
|
||||||
|
) -> Result<Bytes>;
|
||||||
|
|
||||||
|
fn get_relish_size(&self, lsn: Lsn) -> Result<Option<u32>>;
|
||||||
|
|
||||||
|
fn get_rel_exists(&self, lsn: Lsn) -> Result<bool>;
|
||||||
|
|
||||||
|
fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<()>;
|
||||||
|
|
||||||
|
fn put_truncation(&self, lsn: Lsn, relsize: u32) -> anyhow::Result<()>;
|
||||||
|
|
||||||
|
fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()>;
|
||||||
|
|
||||||
|
/// Remember new page version, as a WAL record over previous version
|
||||||
|
fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> Result<()> {
|
||||||
|
self.put_page_version(
|
||||||
|
blknum,
|
||||||
|
rec.lsn,
|
||||||
|
PageVersion {
|
||||||
|
page_image: None,
|
||||||
|
record: Some(rec),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Remember new page version, as a full page image
|
||||||
|
fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()> {
|
||||||
|
self.put_page_version(
|
||||||
|
blknum,
|
||||||
|
lsn,
|
||||||
|
PageVersion {
|
||||||
|
page_image: Some(img),
|
||||||
|
record: None,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// Split off an immutable layer from existing layer.
|
||||||
|
///
|
||||||
|
/// Returns new layers that replace this one.
|
||||||
|
///
|
||||||
|
fn freeze(&self, end_lsn: Lsn, walredo_mgr: &dyn WalRedoManager)
|
||||||
|
-> Result<Vec<Arc<dyn Layer>>>;
|
||||||
|
|
||||||
|
/// Permanently delete this layer
|
||||||
|
fn delete(&self) -> Result<()>;
|
||||||
|
|
||||||
|
/// Try to release memory used by this layer. This is currently
|
||||||
|
/// only used by snapshot layers, to free the copy of the file
|
||||||
|
/// from memory. (TODO: a smarter, more granular caching scheme
|
||||||
|
/// would be nice)
|
||||||
|
fn unload(&self) -> Result<()>;
|
||||||
|
}
|
||||||
@@ -9,6 +9,7 @@ use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
|
|||||||
|
|
||||||
pub mod basebackup;
|
pub mod basebackup;
|
||||||
pub mod branches;
|
pub mod branches;
|
||||||
|
pub mod layered_repository;
|
||||||
pub mod logger;
|
pub mod logger;
|
||||||
pub mod object_key;
|
pub mod object_key;
|
||||||
pub mod object_repository;
|
pub mod object_repository;
|
||||||
@@ -54,6 +55,14 @@ pub struct PageServerConf {
|
|||||||
pub auth_type: AuthType,
|
pub auth_type: AuthType,
|
||||||
|
|
||||||
pub auth_validation_public_key_path: Option<PathBuf>,
|
pub auth_validation_public_key_path: Option<PathBuf>,
|
||||||
|
|
||||||
|
pub repository_format: RepositoryFormat,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq)]
|
||||||
|
pub enum RepositoryFormat {
|
||||||
|
Layered,
|
||||||
|
RocksDb,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PageServerConf {
|
impl PageServerConf {
|
||||||
|
|||||||
@@ -2,11 +2,12 @@
|
|||||||
//! page server.
|
//! page server.
|
||||||
|
|
||||||
use crate::branches;
|
use crate::branches;
|
||||||
|
use crate::layered_repository::LayeredRepository;
|
||||||
use crate::object_repository::ObjectRepository;
|
use crate::object_repository::ObjectRepository;
|
||||||
use crate::repository::Repository;
|
use crate::repository::Repository;
|
||||||
use crate::rocksdb_storage::RocksObjectStore;
|
use crate::rocksdb_storage::RocksObjectStore;
|
||||||
use crate::walredo::PostgresRedoManager;
|
use crate::walredo::PostgresRedoManager;
|
||||||
use crate::PageServerConf;
|
use crate::{PageServerConf, RepositoryFormat};
|
||||||
use anyhow::{anyhow, bail, Result};
|
use anyhow::{anyhow, bail, Result};
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use log::info;
|
use log::info;
|
||||||
@@ -27,16 +28,35 @@ pub fn init(conf: &'static PageServerConf) {
|
|||||||
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
|
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
|
||||||
let tenantid =
|
let tenantid =
|
||||||
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
|
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
|
||||||
let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap();
|
|
||||||
|
|
||||||
// Set up a WAL redo manager, for applying WAL records.
|
// Set up a WAL redo manager, for applying WAL records.
|
||||||
let walredo_mgr = PostgresRedoManager::new(conf, tenantid);
|
let walredo_mgr = PostgresRedoManager::new(conf, tenantid);
|
||||||
|
|
||||||
// Set up an object repository, for actual data storage.
|
// Set up an object repository, for actual data storage.
|
||||||
let repo =
|
let repo: Arc<dyn Repository + Sync + Send> = match conf.repository_format {
|
||||||
ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr), tenantid);
|
RepositoryFormat::Layered => {
|
||||||
|
let repo = Arc::new(LayeredRepository::new(
|
||||||
|
conf,
|
||||||
|
Arc::new(walredo_mgr),
|
||||||
|
tenantid,
|
||||||
|
));
|
||||||
|
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
|
||||||
|
repo
|
||||||
|
}
|
||||||
|
RepositoryFormat::RocksDb => {
|
||||||
|
let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap();
|
||||||
|
|
||||||
|
Arc::new(ObjectRepository::new(
|
||||||
|
conf,
|
||||||
|
Arc::new(obj_store),
|
||||||
|
Arc::new(walredo_mgr),
|
||||||
|
tenantid,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
info!("initialized storage for tenant: {}", &tenantid);
|
info!("initialized storage for tenant: {}", &tenantid);
|
||||||
m.insert(tenantid, Arc::new(repo));
|
m.insert(tenantid, repo);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -53,7 +73,7 @@ pub fn create_repository_for_tenant(
|
|||||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
|
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
|
||||||
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
|
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
|
||||||
|
|
||||||
m.insert(tenantid, Arc::new(repo));
|
m.insert(tenantid, repo);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -693,6 +693,18 @@ impl postgres_backend::Handler for PageServerHandler {
|
|||||||
RowDescriptor::int8_col(b"control_deleted"),
|
RowDescriptor::int8_col(b"control_deleted"),
|
||||||
RowDescriptor::int8_col(b"filenodemap_deleted"),
|
RowDescriptor::int8_col(b"filenodemap_deleted"),
|
||||||
RowDescriptor::int8_col(b"dropped"),
|
RowDescriptor::int8_col(b"dropped"),
|
||||||
|
RowDescriptor::int8_col(b"snapshot_relfiles_total"),
|
||||||
|
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_cutoff"),
|
||||||
|
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_branches"),
|
||||||
|
RowDescriptor::int8_col(b"snapshot_relfiles_not_updated"),
|
||||||
|
RowDescriptor::int8_col(b"snapshot_relfiles_removed"),
|
||||||
|
RowDescriptor::int8_col(b"snapshot_relfiles_dropped"),
|
||||||
|
RowDescriptor::int8_col(b"snapshot_nonrelfiles_total"),
|
||||||
|
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_cutoff"),
|
||||||
|
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_branches"),
|
||||||
|
RowDescriptor::int8_col(b"snapshot_nonrelfiles_not_updated"),
|
||||||
|
RowDescriptor::int8_col(b"snapshot_nonrelfiles_removed"),
|
||||||
|
RowDescriptor::int8_col(b"snapshot_nonrelfiles_dropped"),
|
||||||
RowDescriptor::int8_col(b"elapsed"),
|
RowDescriptor::int8_col(b"elapsed"),
|
||||||
]))?
|
]))?
|
||||||
.write_message_noflush(&BeMessage::DataRow(&[
|
.write_message_noflush(&BeMessage::DataRow(&[
|
||||||
@@ -705,6 +717,43 @@ impl postgres_backend::Handler for PageServerHandler {
|
|||||||
Some(&result.control_deleted.to_string().as_bytes()),
|
Some(&result.control_deleted.to_string().as_bytes()),
|
||||||
Some(&result.filenodemap_deleted.to_string().as_bytes()),
|
Some(&result.filenodemap_deleted.to_string().as_bytes()),
|
||||||
Some(&result.dropped.to_string().as_bytes()),
|
Some(&result.dropped.to_string().as_bytes()),
|
||||||
|
Some(&result.snapshot_relfiles_total.to_string().as_bytes()),
|
||||||
|
Some(
|
||||||
|
&result
|
||||||
|
.snapshot_relfiles_needed_by_cutoff
|
||||||
|
.to_string()
|
||||||
|
.as_bytes(),
|
||||||
|
),
|
||||||
|
Some(
|
||||||
|
&result
|
||||||
|
.snapshot_relfiles_needed_by_branches
|
||||||
|
.to_string()
|
||||||
|
.as_bytes(),
|
||||||
|
),
|
||||||
|
Some(&result.snapshot_relfiles_not_updated.to_string().as_bytes()),
|
||||||
|
Some(&result.snapshot_relfiles_removed.to_string().as_bytes()),
|
||||||
|
Some(&result.snapshot_relfiles_dropped.to_string().as_bytes()),
|
||||||
|
Some(&result.snapshot_nonrelfiles_total.to_string().as_bytes()),
|
||||||
|
Some(
|
||||||
|
&result
|
||||||
|
.snapshot_nonrelfiles_needed_by_cutoff
|
||||||
|
.to_string()
|
||||||
|
.as_bytes(),
|
||||||
|
),
|
||||||
|
Some(
|
||||||
|
&result
|
||||||
|
.snapshot_nonrelfiles_needed_by_branches
|
||||||
|
.to_string()
|
||||||
|
.as_bytes(),
|
||||||
|
),
|
||||||
|
Some(
|
||||||
|
&result
|
||||||
|
.snapshot_nonrelfiles_not_updated
|
||||||
|
.to_string()
|
||||||
|
.as_bytes(),
|
||||||
|
),
|
||||||
|
Some(&result.snapshot_nonrelfiles_removed.to_string().as_bytes()),
|
||||||
|
Some(&result.snapshot_nonrelfiles_dropped.to_string().as_bytes()),
|
||||||
Some(&result.elapsed.as_millis().to_string().as_bytes()),
|
Some(&result.elapsed.as_millis().to_string().as_bytes()),
|
||||||
]))?
|
]))?
|
||||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||||
|
|||||||
@@ -120,7 +120,16 @@ impl RelishTag {
|
|||||||
|
|
||||||
// and these don't
|
// and these don't
|
||||||
| RelishTag::ControlFile
|
| RelishTag::ControlFile
|
||||||
| RelishTag::Checkpoint => false,
|
| RelishTag::Checkpoint => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// convenience function to check if this relish is a normal relation.
|
||||||
|
pub const fn is_relation(&self) -> bool {
|
||||||
|
if let RelishTag::Relation(_) = self {
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::iter::Iterator;
|
use std::iter::Iterator;
|
||||||
|
use std::ops::AddAssign;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use zenith_utils::lsn::Lsn;
|
use zenith_utils::lsn::Lsn;
|
||||||
@@ -56,6 +57,8 @@ pub trait Repository: Send + Sync {
|
|||||||
///
|
///
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct GcResult {
|
pub struct GcResult {
|
||||||
|
// FIXME: These counters make sense for the ObjectRepository. They are not used
|
||||||
|
// by the LayeredRepository.
|
||||||
pub n_relations: u64,
|
pub n_relations: u64,
|
||||||
pub inspected: u64,
|
pub inspected: u64,
|
||||||
pub truncated: u64,
|
pub truncated: u64,
|
||||||
@@ -66,9 +69,51 @@ pub struct GcResult {
|
|||||||
pub control_deleted: u64, // RelishTag::ControlFile
|
pub control_deleted: u64, // RelishTag::ControlFile
|
||||||
pub filenodemap_deleted: u64, // RelishTag::FileNodeMap
|
pub filenodemap_deleted: u64, // RelishTag::FileNodeMap
|
||||||
pub dropped: u64,
|
pub dropped: u64,
|
||||||
|
|
||||||
|
// These are used for the LayeredRepository instead
|
||||||
|
pub snapshot_relfiles_total: u64,
|
||||||
|
pub snapshot_relfiles_needed_by_cutoff: u64,
|
||||||
|
pub snapshot_relfiles_needed_by_branches: u64,
|
||||||
|
pub snapshot_relfiles_not_updated: u64,
|
||||||
|
pub snapshot_relfiles_removed: u64, // # of snapshot files removed because they have been made obsolete by newer snapshot files.
|
||||||
|
pub snapshot_relfiles_dropped: u64, // # of snapshot files removed because the relation was dropped
|
||||||
|
|
||||||
|
pub snapshot_nonrelfiles_total: u64,
|
||||||
|
pub snapshot_nonrelfiles_needed_by_cutoff: u64,
|
||||||
|
pub snapshot_nonrelfiles_needed_by_branches: u64,
|
||||||
|
pub snapshot_nonrelfiles_not_updated: u64,
|
||||||
|
pub snapshot_nonrelfiles_removed: u64, // # of snapshot files removed because they have been made obsolete by newer snapshot files.
|
||||||
|
pub snapshot_nonrelfiles_dropped: u64, // # of snapshot files removed because the relation was dropped
|
||||||
|
|
||||||
pub elapsed: Duration,
|
pub elapsed: Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl AddAssign for GcResult {
|
||||||
|
fn add_assign(&mut self, other: Self) {
|
||||||
|
self.n_relations += other.n_relations;
|
||||||
|
self.truncated += other.truncated;
|
||||||
|
self.deleted += other.deleted;
|
||||||
|
self.dropped += other.dropped;
|
||||||
|
|
||||||
|
self.snapshot_relfiles_total += other.snapshot_relfiles_total;
|
||||||
|
self.snapshot_relfiles_needed_by_cutoff += other.snapshot_relfiles_needed_by_cutoff;
|
||||||
|
self.snapshot_relfiles_needed_by_branches += other.snapshot_relfiles_needed_by_branches;
|
||||||
|
self.snapshot_relfiles_not_updated += other.snapshot_relfiles_not_updated;
|
||||||
|
self.snapshot_relfiles_removed += other.snapshot_relfiles_removed;
|
||||||
|
self.snapshot_relfiles_dropped += other.snapshot_relfiles_dropped;
|
||||||
|
|
||||||
|
self.snapshot_nonrelfiles_total += other.snapshot_nonrelfiles_total;
|
||||||
|
self.snapshot_nonrelfiles_needed_by_cutoff += other.snapshot_nonrelfiles_needed_by_cutoff;
|
||||||
|
self.snapshot_nonrelfiles_needed_by_branches +=
|
||||||
|
other.snapshot_nonrelfiles_needed_by_branches;
|
||||||
|
self.snapshot_nonrelfiles_not_updated += other.snapshot_nonrelfiles_not_updated;
|
||||||
|
self.snapshot_nonrelfiles_removed += other.snapshot_nonrelfiles_removed;
|
||||||
|
self.snapshot_nonrelfiles_dropped += other.snapshot_nonrelfiles_dropped;
|
||||||
|
|
||||||
|
self.elapsed += other.elapsed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub trait Timeline: Send + Sync {
|
pub trait Timeline: Send + Sync {
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Public GET functions
|
// Public GET functions
|
||||||
@@ -234,11 +279,12 @@ impl WALRecord {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use crate::layered_repository::LayeredRepository;
|
||||||
use crate::object_repository::ObjectRepository;
|
use crate::object_repository::ObjectRepository;
|
||||||
use crate::object_repository::{ObjectValue, PageEntry, RelationSizeEntry};
|
use crate::object_repository::{ObjectValue, PageEntry, RelationSizeEntry};
|
||||||
use crate::rocksdb_storage::RocksObjectStore;
|
use crate::rocksdb_storage::RocksObjectStore;
|
||||||
use crate::walredo::{WalRedoError, WalRedoManager};
|
use crate::walredo::{WalRedoError, WalRedoManager};
|
||||||
use crate::PageServerConf;
|
use crate::{PageServerConf, RepositoryFormat};
|
||||||
use postgres_ffi::pg_constants;
|
use postgres_ffi::pg_constants;
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
@@ -272,10 +318,16 @@ mod tests {
|
|||||||
buf.freeze()
|
buf.freeze()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
|
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||||
|
|
||||||
|
fn get_test_repo(
|
||||||
|
test_name: &str,
|
||||||
|
repository_format: RepositoryFormat,
|
||||||
|
) -> Result<Box<dyn Repository>> {
|
||||||
let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
|
let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
|
||||||
let _ = fs::remove_dir_all(&repo_dir);
|
let _ = fs::remove_dir_all(&repo_dir);
|
||||||
fs::create_dir_all(&repo_dir).unwrap();
|
fs::create_dir_all(&repo_dir)?;
|
||||||
|
fs::create_dir_all(&repo_dir.join("timelines"))?;
|
||||||
|
|
||||||
let conf = PageServerConf {
|
let conf = PageServerConf {
|
||||||
daemonize: false,
|
daemonize: false,
|
||||||
@@ -288,6 +340,7 @@ mod tests {
|
|||||||
pg_distrib_dir: "".into(),
|
pg_distrib_dir: "".into(),
|
||||||
auth_type: AuthType::Trust,
|
auth_type: AuthType::Trust,
|
||||||
auth_validation_public_key_path: None,
|
auth_validation_public_key_path: None,
|
||||||
|
repository_format,
|
||||||
};
|
};
|
||||||
// Make a static copy of the config. This can never be free'd, but that's
|
// Make a static copy of the config. This can never be free'd, but that's
|
||||||
// OK in a test.
|
// OK in a test.
|
||||||
@@ -295,24 +348,47 @@ mod tests {
|
|||||||
let tenantid = ZTenantId::generate();
|
let tenantid = ZTenantId::generate();
|
||||||
fs::create_dir_all(conf.tenant_path(&tenantid)).unwrap();
|
fs::create_dir_all(conf.tenant_path(&tenantid)).unwrap();
|
||||||
|
|
||||||
let obj_store = RocksObjectStore::create(conf, &tenantid)?;
|
|
||||||
|
|
||||||
let walredo_mgr = TestRedoManager {};
|
let walredo_mgr = TestRedoManager {};
|
||||||
|
|
||||||
let repo =
|
let repo: Box<dyn Repository + Sync + Send> = match conf.repository_format {
|
||||||
ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr), tenantid);
|
RepositoryFormat::Layered => Box::new(LayeredRepository::new(
|
||||||
|
conf,
|
||||||
|
Arc::new(walredo_mgr),
|
||||||
|
tenantid,
|
||||||
|
)),
|
||||||
|
RepositoryFormat::RocksDb => {
|
||||||
|
let obj_store = RocksObjectStore::create(conf, &tenantid)?;
|
||||||
|
|
||||||
Ok(Box::new(repo))
|
Box::new(ObjectRepository::new(
|
||||||
|
conf,
|
||||||
|
Arc::new(obj_store),
|
||||||
|
Arc::new(walredo_mgr),
|
||||||
|
tenantid,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(repo)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Test get_relsize() and truncation.
|
/// Test get_relsize() and truncation.
|
||||||
#[test]
|
#[test]
|
||||||
fn test_relsize() -> Result<()> {
|
fn test_relsize_rocksdb() -> Result<()> {
|
||||||
|
let repo = get_test_repo("test_relsize_rocksdb", RepositoryFormat::RocksDb)?;
|
||||||
|
test_relsize(&*repo)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_relsize_layered() -> Result<()> {
|
||||||
|
let repo = get_test_repo("test_relsize_layered", RepositoryFormat::Layered)?;
|
||||||
|
test_relsize(&*repo)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_relsize(repo: &dyn Repository) -> Result<()> {
|
||||||
// get_timeline() with non-existent timeline id should fail
|
// get_timeline() with non-existent timeline id should fail
|
||||||
//repo.get_timeline("11223344556677881122334455667788");
|
//repo.get_timeline("11223344556677881122334455667788");
|
||||||
|
|
||||||
// Create timeline to work on
|
// Create timeline to work on
|
||||||
let repo = get_test_repo("test_relsize")?;
|
|
||||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||||
|
|
||||||
@@ -397,14 +473,24 @@ mod tests {
|
|||||||
/// This isn't very interesting with the RocksDb implementation, as we don't pay
|
/// This isn't very interesting with the RocksDb implementation, as we don't pay
|
||||||
/// any attention to Postgres segment boundaries there.
|
/// any attention to Postgres segment boundaries there.
|
||||||
#[test]
|
#[test]
|
||||||
fn test_large_rel() -> Result<()> {
|
fn test_large_rel_rocksdb() -> Result<()> {
|
||||||
let repo = get_test_repo("test_large_rel")?;
|
let repo = get_test_repo("test_large_rel_rocksdb", RepositoryFormat::RocksDb)?;
|
||||||
|
test_large_rel(&*repo)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_large_rel_layered() -> Result<()> {
|
||||||
|
let repo = get_test_repo("test_large_rel_layered", RepositoryFormat::Layered)?;
|
||||||
|
test_large_rel(&*repo)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_large_rel(repo: &dyn Repository) -> Result<()> {
|
||||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||||
|
|
||||||
tline.init_valid_lsn(Lsn(1));
|
tline.init_valid_lsn(Lsn(1));
|
||||||
|
|
||||||
let mut lsn = 0;
|
let mut lsn = 1;
|
||||||
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
|
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
|
||||||
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
|
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
|
||||||
lsn += 1;
|
lsn += 1;
|
||||||
@@ -450,15 +536,29 @@ mod tests {
|
|||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_branch_rocksdb() -> Result<()> {
|
||||||
|
let repo = get_test_repo("test_branch_rocksdb", RepositoryFormat::RocksDb)?;
|
||||||
|
test_branch(&*repo)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_branch_layered() -> Result<()> {
|
||||||
|
let repo = get_test_repo("test_branch_layered", RepositoryFormat::Layered)?;
|
||||||
|
test_branch(&*repo)
|
||||||
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Test branch creation
|
/// Test branch creation
|
||||||
///
|
///
|
||||||
#[test]
|
fn test_branch(repo: &dyn Repository) -> Result<()> {
|
||||||
fn test_branch() -> Result<()> {
|
|
||||||
let repo = get_test_repo("test_branch")?;
|
|
||||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||||
|
|
||||||
|
// Import initial dummy checkpoint record, otherwise the get_timeline() call
|
||||||
|
// after branching fails below
|
||||||
|
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(1), ZERO_PAGE.clone(), false)?;
|
||||||
|
|
||||||
// Create a relation on the timeline
|
// Create a relation on the timeline
|
||||||
tline.init_valid_lsn(Lsn(1));
|
tline.init_valid_lsn(Lsn(1));
|
||||||
tline.put_page_image(TESTREL_A, 0, Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
tline.put_page_image(TESTREL_A, 0, Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
||||||
@@ -500,8 +600,19 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_history() -> Result<()> {
|
fn test_history_rocksdb() -> Result<()> {
|
||||||
let repo = get_test_repo("test_snapshot")?;
|
let repo = get_test_repo("test_history_rocksdb", RepositoryFormat::RocksDb)?;
|
||||||
|
test_history(&*repo)
|
||||||
|
}
|
||||||
|
#[test]
|
||||||
|
// TODO: This doesn't work with the layered storage, the functions needed for push/pull
|
||||||
|
// functionality haven't been implemented yet.
|
||||||
|
#[ignore]
|
||||||
|
fn test_history_layered() -> Result<()> {
|
||||||
|
let repo = get_test_repo("test_history_layered", RepositoryFormat::Layered)?;
|
||||||
|
test_history(&*repo)
|
||||||
|
}
|
||||||
|
fn test_history(repo: &dyn Repository) -> Result<()> {
|
||||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||||
|
|
||||||
|
|||||||
@@ -132,6 +132,7 @@ pub fn import_timeline_from_postgres_datadir(
|
|||||||
}
|
}
|
||||||
// TODO: Scan pg_tblspc
|
// TODO: Scan pg_tblspc
|
||||||
|
|
||||||
|
timeline.advance_last_valid_lsn(lsn);
|
||||||
timeline.checkpoint()?;
|
timeline.checkpoint()?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -425,12 +426,12 @@ pub fn save_decoded_record(
|
|||||||
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
|
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
|
||||||
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
|
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
|
||||||
info!(
|
info!(
|
||||||
"unlink twophaseFile for xid {} parsed_xact.xid {} here",
|
"unlink twophaseFile for xid {} parsed_xact.xid {} here at {}",
|
||||||
decoded.xl_xid, parsed_xact.xid
|
decoded.xl_xid, parsed_xact.xid, lsn
|
||||||
);
|
);
|
||||||
timeline.put_unlink(
|
timeline.put_unlink(
|
||||||
RelishTag::TwoPhase {
|
RelishTag::TwoPhase {
|
||||||
xid: decoded.xl_xid,
|
xid: parsed_xact.xid,
|
||||||
},
|
},
|
||||||
lsn,
|
lsn,
|
||||||
)?;
|
)?;
|
||||||
@@ -795,7 +796,13 @@ fn save_clog_truncate_record(
|
|||||||
// Iterate via SLRU CLOG segments and unlink segments that we're ready to truncate
|
// Iterate via SLRU CLOG segments and unlink segments that we're ready to truncate
|
||||||
// TODO This implementation is very inefficient -
|
// TODO This implementation is very inefficient -
|
||||||
// it scans all non-rels only to find Clog
|
// it scans all non-rels only to find Clog
|
||||||
for obj in timeline.list_nonrels(lsn)? {
|
//
|
||||||
|
// We cannot pass 'lsn' to the Timeline.list_nonrels(), or it
|
||||||
|
// will block waiting for the last valid LSN to advance up to
|
||||||
|
// it. So we use the previous record's LSN in the get calls
|
||||||
|
// instead.
|
||||||
|
let req_lsn = min(timeline.get_last_record_lsn(), lsn);
|
||||||
|
for obj in timeline.list_nonrels(req_lsn)? {
|
||||||
match obj {
|
match obj {
|
||||||
RelishTag::Slru { slru, segno } => {
|
RelishTag::Slru { slru, segno } => {
|
||||||
if slru == SlruKind::Clog {
|
if slru == SlruKind::Clog {
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use crate::page_cache;
|
|||||||
use crate::relish::*;
|
use crate::relish::*;
|
||||||
use crate::restore_local_repo;
|
use crate::restore_local_repo;
|
||||||
use crate::waldecoder::*;
|
use crate::waldecoder::*;
|
||||||
use crate::PageServerConf;
|
use crate::{PageServerConf, RepositoryFormat};
|
||||||
use anyhow::{Error, Result};
|
use anyhow::{Error, Result};
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use log::*;
|
use log::*;
|
||||||
@@ -264,7 +264,11 @@ fn walreceiver_main(
|
|||||||
)?;
|
)?;
|
||||||
|
|
||||||
if newest_segno - oldest_segno >= 10 {
|
if newest_segno - oldest_segno >= 10 {
|
||||||
timeline.checkpoint()?;
|
// FIXME: The layered repository performs checkpointing in a separate thread, so this
|
||||||
|
// isn't needed anymore. Remove 'checkpoint' from the Timeline trait altogether?
|
||||||
|
if conf.repository_format == RepositoryFormat::RocksDb {
|
||||||
|
timeline.checkpoint()?;
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: This is where we could remove WAL older than last_rec_lsn.
|
// TODO: This is where we could remove WAL older than last_rec_lsn.
|
||||||
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
|
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
|
||||||
|
|||||||
@@ -14,7 +14,8 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
|||||||
#
|
#
|
||||||
@pytest.mark.skip(reason=""""
|
@pytest.mark.skip(reason=""""
|
||||||
Current GC test is flaky and overly strict. Since we are migrating to the layered repo format
|
Current GC test is flaky and overly strict. Since we are migrating to the layered repo format
|
||||||
with different GC implementation let's just silence this test for now.
|
with different GC implementation let's just silence this test for now. This test only
|
||||||
|
works with the RocksDB implementation.
|
||||||
""")
|
""")
|
||||||
def test_gc(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
|
def test_gc(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
|
||||||
zenith_cli.run(["branch", "test_gc", "empty"])
|
zenith_cli.run(["branch", "test_gc", "empty"])
|
||||||
|
|||||||
122
test_runner/batch_others/test_snapfiles_gc.py
Normal file
122
test_runner/batch_others/test_snapfiles_gc.py
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
from contextlib import closing
|
||||||
|
import psycopg2.extras
|
||||||
|
import time;
|
||||||
|
|
||||||
|
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||||
|
|
||||||
|
def print_gc_result(row):
|
||||||
|
print("GC duration {elapsed} ms".format_map(row));
|
||||||
|
print(" REL total: {snapshot_relfiles_total}, needed_by_cutoff {snapshot_relfiles_needed_by_cutoff}, needed_by_branches: {snapshot_relfiles_needed_by_branches}, not_updated: {snapshot_relfiles_not_updated}, removed: {snapshot_relfiles_removed}, dropped: {snapshot_relfiles_dropped}".format_map(row))
|
||||||
|
print(" NONREL total: {snapshot_nonrelfiles_total}, needed_by_cutoff {snapshot_nonrelfiles_needed_by_cutoff}, needed_by_branches: {snapshot_nonrelfiles_needed_by_branches}, not_updated: {snapshot_nonrelfiles_not_updated}, removed: {snapshot_nonrelfiles_removed}, dropped: {snapshot_nonrelfiles_dropped}".format_map(row))
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# Test Garbage Collection of old snapshot files
|
||||||
|
#
|
||||||
|
# This test is pretty tightly coupled with the current implementation of layered
|
||||||
|
# storage, in layered_repository.rs.
|
||||||
|
#
|
||||||
|
def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||||
|
zenith_cli.run(["branch", "test_snapfiles_gc", "empty"])
|
||||||
|
pg = postgres.create_start('test_snapfiles_gc')
|
||||||
|
|
||||||
|
with closing(pg.connect()) as conn:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
with closing(pageserver.connect()) as psconn:
|
||||||
|
with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:
|
||||||
|
|
||||||
|
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||||
|
cur.execute("SHOW zenith.zenith_timeline")
|
||||||
|
timeline = cur.fetchone()[0]
|
||||||
|
|
||||||
|
# Create a test table
|
||||||
|
cur.execute("CREATE TABLE foo(x integer)")
|
||||||
|
|
||||||
|
print("Inserting two more rows and running GC")
|
||||||
|
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass");
|
||||||
|
row = cur.fetchone();
|
||||||
|
print("relfilenode is {}", row[0]);
|
||||||
|
|
||||||
|
# Run GC, to clear out any garbage left behind in the catalogs by
|
||||||
|
# the CREATE TABLE command. We want to have a clean slate with no garbage
|
||||||
|
# before running the actual tests below, otherwise the counts won't match
|
||||||
|
# what we expect.
|
||||||
|
#
|
||||||
|
# Also run vacuum first to make it less likely that autovacuum or pruning
|
||||||
|
# kicks in and confuses our numbers.
|
||||||
|
cur.execute("VACUUM")
|
||||||
|
|
||||||
|
print("Running GC before test")
|
||||||
|
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||||
|
row = pscur.fetchone()
|
||||||
|
print_gc_result(row);
|
||||||
|
# remember the number of files
|
||||||
|
snapshot_relfiles_remain = row['snapshot_relfiles_total'] - row['snapshot_relfiles_removed']
|
||||||
|
assert snapshot_relfiles_remain > 0
|
||||||
|
|
||||||
|
# Insert a row. The first insert will also create a metadata entry for the
|
||||||
|
# relation, with size == 1 block. Hence, bump up the expected relation count.
|
||||||
|
snapshot_relfiles_remain += 1;
|
||||||
|
print("Inserting one row and running GC")
|
||||||
|
cur.execute("INSERT INTO foo VALUES (1)")
|
||||||
|
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||||
|
row = pscur.fetchone()
|
||||||
|
print_gc_result(row);
|
||||||
|
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain
|
||||||
|
assert row['snapshot_relfiles_removed'] == 0
|
||||||
|
assert row['snapshot_relfiles_dropped'] == 0
|
||||||
|
|
||||||
|
# Insert two more rows and run GC.
|
||||||
|
# This should create a new snapshot file with the new contents, and
|
||||||
|
# remove the old one.
|
||||||
|
print("Inserting two more rows and running GC")
|
||||||
|
cur.execute("INSERT INTO foo VALUES (2)")
|
||||||
|
cur.execute("INSERT INTO foo VALUES (3)")
|
||||||
|
|
||||||
|
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||||
|
row = pscur.fetchone()
|
||||||
|
print_gc_result(row);
|
||||||
|
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
|
||||||
|
assert row['snapshot_relfiles_removed'] == 1
|
||||||
|
assert row['snapshot_relfiles_dropped'] == 0
|
||||||
|
|
||||||
|
# Do it again. Should again create a new snapshot file and remove old one.
|
||||||
|
print("Inserting two more rows and running GC")
|
||||||
|
cur.execute("INSERT INTO foo VALUES (2)")
|
||||||
|
cur.execute("INSERT INTO foo VALUES (3)")
|
||||||
|
|
||||||
|
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||||
|
row = pscur.fetchone()
|
||||||
|
print_gc_result(row);
|
||||||
|
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
|
||||||
|
assert row['snapshot_relfiles_removed'] == 1
|
||||||
|
assert row['snapshot_relfiles_dropped'] == 0
|
||||||
|
|
||||||
|
# Run GC again, with no changes in the database. Should not remove anything.
|
||||||
|
print("Run GC again, with nothing to do")
|
||||||
|
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||||
|
row = pscur.fetchone()
|
||||||
|
print_gc_result(row);
|
||||||
|
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain
|
||||||
|
assert row['snapshot_relfiles_removed'] == 0
|
||||||
|
assert row['snapshot_relfiles_dropped'] == 0
|
||||||
|
|
||||||
|
#
|
||||||
|
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
|
||||||
|
#
|
||||||
|
print("Drop table and run GC again");
|
||||||
|
cur.execute("DROP TABLE foo")
|
||||||
|
|
||||||
|
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||||
|
row = pscur.fetchone()
|
||||||
|
print_gc_result(row);
|
||||||
|
|
||||||
|
# Each relation fork is counted separately, hence 3.
|
||||||
|
assert row['snapshot_relfiles_dropped'] == 3
|
||||||
|
|
||||||
|
# The catalog updates also create new snapshot files of the catalogs, which
|
||||||
|
# are counted as 'removed'
|
||||||
|
assert row['snapshot_relfiles_removed'] > 0
|
||||||
|
|
||||||
|
# TODO: perhaps we should count catalog and user relations separately,
|
||||||
|
# to make this kind of testing more robust
|
||||||
2
vendor/postgres
vendored
2
vendor/postgres
vendored
Submodule vendor/postgres updated: 04cfa326a5...e3175fe60a
@@ -61,6 +61,13 @@ fn main() -> Result<()> {
|
|||||||
.long("enable-auth")
|
.long("enable-auth")
|
||||||
.takes_value(false)
|
.takes_value(false)
|
||||||
.help("Enable authentication using ZenithJWT")
|
.help("Enable authentication using ZenithJWT")
|
||||||
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::with_name("repository-format")
|
||||||
|
.long("repository-format")
|
||||||
|
.takes_value(false)
|
||||||
|
.value_name("repository-format")
|
||||||
|
.help("Choose repository format, 'layered' or 'rocksdb'")
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
@@ -131,8 +138,8 @@ fn main() -> Result<()> {
|
|||||||
} else {
|
} else {
|
||||||
AuthType::Trust
|
AuthType::Trust
|
||||||
};
|
};
|
||||||
|
let repository_format = init_match.value_of("repository-format");
|
||||||
local_env::init(pageserver_uri, tenantid, auth_type)
|
local_env::init(pageserver_uri, tenantid, auth_type, repository_format)
|
||||||
.with_context(|| "Failed to create config file")?;
|
.with_context(|| "Failed to create config file")?;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -151,6 +158,7 @@ fn main() -> Result<()> {
|
|||||||
if let Err(e) = pageserver.init(
|
if let Err(e) = pageserver.init(
|
||||||
Some(&env.tenantid.to_string()),
|
Some(&env.tenantid.to_string()),
|
||||||
init_match.is_present("enable-auth"),
|
init_match.is_present("enable-auth"),
|
||||||
|
init_match.value_of("repository-format"),
|
||||||
) {
|
) {
|
||||||
eprintln!("pageserver init failed: {}", e);
|
eprintln!("pageserver init failed: {}", e);
|
||||||
exit(1);
|
exit(1);
|
||||||
|
|||||||
@@ -126,7 +126,7 @@ macro_rules! zid_newtype {
|
|||||||
/// is separate from PostgreSQL timelines, and doesn't have those
|
/// is separate from PostgreSQL timelines, and doesn't have those
|
||||||
/// limitations. A zenith timeline is identified by a 128-bit ID, which
|
/// limitations. A zenith timeline is identified by a 128-bit ID, which
|
||||||
/// is usually printed out as a hex string.
|
/// is usually printed out as a hex string.
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
|
||||||
pub struct ZTimelineId(ZId);
|
pub struct ZTimelineId(ZId);
|
||||||
|
|
||||||
zid_newtype!(ZTimelineId);
|
zid_newtype!(ZTimelineId);
|
||||||
|
|||||||
Reference in New Issue
Block a user