Remove rocksdb implementation.

The layered storage format is now good enough that we no longer need the RocksDB
implementation. The layered format still has a number of known issues, but we'll
keep working on them.
Heikki Linnakangas
2021-08-25 18:06:34 +03:00
parent 250ae643a8
commit 5998744bcc
17 changed files with 21 additions and 2171 deletions

Cargo.lock (generated)

@@ -152,25 +152,6 @@ dependencies = [
"serde",
]
[[package]]
name = "bindgen"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd4865004a46a0aafb2a0a5eb19d3c9fc46ee5f063a6cfc605c69ac9ecf5263d"
dependencies = [
"bitflags",
"cexpr 0.4.0",
"clang-sys",
"lazy_static",
"lazycell",
"peeking_take_while",
"proc-macro2",
"quote",
"regex",
"rustc-hash",
"shlex 0.1.1",
]
[[package]]
name = "bindgen"
version = "0.59.1"
@@ -178,7 +159,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "453c49e5950bb0eb63bb3df640e31618846c89d5b7faa54040d76e98e0134375"
dependencies = [
"bitflags",
"cexpr 0.5.0",
"cexpr",
"clang-sys",
"clap",
"env_logger",
@@ -190,7 +171,7 @@ dependencies = [
"quote",
"regex",
"rustc-hash",
"shlex 1.0.0",
"shlex",
"which",
]
@@ -265,18 +246,6 @@ name = "cc"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e70cc2f62c6ce1868963827bd677764c62d07c3d9a3e1fb1177ee1a9ab199eb2"
dependencies = [
"jobserver",
]
[[package]]
name = "cexpr"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27"
dependencies = [
"nom 5.1.2",
]
[[package]]
name = "cexpr"
@@ -284,7 +253,7 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db507a7679252d2276ed0dd8113c6875ec56d3089f9225b2b42c30cc1f8e5c89"
dependencies = [
"nom 6.1.2",
"nom",
]
[[package]]
@@ -903,15 +872,6 @@ version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
[[package]]
name = "jobserver"
version = "0.1.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "972f5ae5d1cb9c6ae417789196c803205313edde988685da5e3aae0827b9e7fd"
dependencies = [
"libc",
]
[[package]]
name = "js-sys"
version = "0.3.51"
@@ -963,18 +923,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "librocksdb-sys"
version = "6.17.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5da125e1c0f22c7cae785982115523a0738728498547f415c9054cb17c7e89f9"
dependencies = [
"bindgen 0.57.0",
"cc",
"glob",
"libc",
]
[[package]]
name = "lock_api"
version = "0.4.4"
@@ -1109,16 +1057,6 @@ dependencies = [
"libc",
]
[[package]]
name = "nom"
version = "5.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af"
dependencies = [
"memchr",
"version_check",
]
[[package]]
name = "nom"
version = "6.1.2"
@@ -1260,7 +1198,6 @@ dependencies = [
"postgres_ffi",
"rand",
"regex",
"rocksdb",
"routerify",
"rust-s3",
"scopeguard",
@@ -1442,7 +1379,7 @@ name = "postgres_ffi"
version = "0.1.0"
dependencies = [
"anyhow",
"bindgen 0.59.1",
"bindgen",
"byteorder",
"bytes",
"chrono",
@@ -1683,16 +1620,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "rocksdb"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c749134fda8bfc90d0de643d59bfc841dcb3ac8a1062e12b6754bd60235c48b3"
dependencies = [
"libc",
"librocksdb-sys",
]
[[package]]
name = "routerify"
version = "2.2.0"
@@ -1916,12 +1843,6 @@ dependencies = [
"opaque-debug",
]
[[package]]
name = "shlex"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2"
[[package]]
name = "shlex"
version = "1.0.0"


@@ -42,7 +42,7 @@ pub struct LocalEnv {
#[serde(with = "hex")]
pub tenantid: ZTenantId,
// Repository format, 'rocksdb' or 'layered' or None for default
// Repository format, only 'layered' supported currently, or None for default
pub repository_format: Option<String>,
// jwt auth token used for communication with pageserver


@@ -30,8 +30,6 @@ tokio-stream = { version = "0.1.5" }
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
# by default rust-rocksdb tries to build a lot of compression algos. Use lz4 only for now as it is simplest dependency.
rocksdb = { version = "0.16.0", features = ["lz4"], default-features = false }
routerify = "2"
anyhow = "1.0"
crc32c = "0.6.0"


@@ -74,9 +74,8 @@ Repository corresponds to one .zenith directory.
Repository is needed to manage Timelines.
Each repository has associated WAL redo service.
Now we have two implementations of Repository:
- ObjectRepository uses RocksDB as a storage
- LayeredRepository uses custom storage format, described in layered_repository/README.md
There is currently only one implementation of the Repository trait: LayeredRepository.
It uses a custom storage format, described in layered_repository/README.md
#### Timeline
Timeline is a page cache workhorse that accepts page changes
@@ -89,13 +88,6 @@ Each Branch lives in a corresponding timeline and has an ancestor.
To get full snapshot of data at certain moment we need to traverse timeline and its ancestors.
#### ObjectRepository
ObjectRepository implements Repository and has associated ObjectStore and WAL redo service.
#### ObjectStore
ObjectStore is an interface for key-value store for page images and wal records.
Currently it has one implementation - RocksDB.
#### WAL redo service
WAL redo service - service that runs PostgreSQL in a special wal_redo mode
to apply given WAL records over an old page image and return new page image.

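For orientation alongside the README changes above, here is a rough sketch of how the remaining Repository implementation is exercised, pieced together from calls that appear verbatim elsewhere in this diff (create_empty_timeline, gc_iteration, the GcResult counters). It is an illustration only: parameter types and the page I/O methods (put_page_image, get_page_at_lsn) are assumptions, since their signatures are not shown here.

    use std::str::FromStr;
    use crate::repository::Repository;
    use zenith_utils::lsn::Lsn;
    use zenith_utils::zid::ZTimelineId;

    // Hedged sketch, not part of this commit.
    fn exercise_repository(repo: &dyn Repository) -> anyhow::Result<()> {
        // Timelines are created on the repository...
        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
        let _timeline = repo.create_empty_timeline(timelineid, Lsn(0))?;

        // ...pages are then written and read through the Timeline via put_page_image(),
        // put_wal_record() and get_page_at_lsn() (signatures not visible in this diff).

        // GC runs per timeline and reports the layer-file counters kept in GcResult.
        let result = repo.gc_iteration(Some(timelineid), 0, true)?;
        println!("layer files needed by branches: {}", result.snapshot_relfiles_needed_by_branches);
        Ok(())
    }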

@@ -138,10 +138,9 @@ impl CfgFileParams {
}
let repository_format = match self.repository_format.as_ref() {
Some(repo_format_str) if repo_format_str == "rocksdb" => RepositoryFormat::RocksDb,
Some(repo_format_str) if repo_format_str == "layered" => RepositoryFormat::Layered,
Some(repo_format_str) => anyhow::bail!(
"invalid --repository-format '{}', must be 'rocksdb' or 'layered'",
"invalid --repository-format '{}', only 'layered' supported",
repo_format_str
),
None => RepositoryFormat::Layered, // default
@@ -240,7 +239,7 @@ fn main() -> Result<()> {
Arg::with_name("repository-format")
.long("repository-format")
.takes_value(true)
.help("Which repository implementation to use, 'rocksdb' or 'layered'"),
.help("Which repository implementation to use, only 'layered' supported currently"),
)
.get_matches();


@@ -20,7 +20,6 @@ use log::*;
use zenith_utils::lsn::Lsn;
use crate::logger;
use crate::object_repository::ObjectRepository;
use crate::page_cache;
use crate::restore_local_repo;
use crate::walredo::WalRedoManager;
@@ -102,16 +101,6 @@ pub fn create_repo(
RepositoryFormat::Layered => Arc::new(
crate::layered_repository::LayeredRepository::new(conf, wal_redo_manager, tenantid),
),
RepositoryFormat::RocksDb => {
let obj_store = crate::rocksdb_storage::RocksObjectStore::create(conf, &tenantid)?;
Arc::new(ObjectRepository::new(
conf,
Arc::new(obj_store),
wal_redo_manager,
tenantid,
))
}
};
// Load data into pageserver


@@ -12,15 +12,11 @@ pub mod branches;
pub mod http;
pub mod layered_repository;
pub mod logger;
pub mod object_key;
pub mod object_repository;
pub mod object_store;
pub mod page_cache;
pub mod page_service;
pub mod relish;
pub mod repository;
pub mod restore_local_repo;
pub mod rocksdb_storage;
pub mod waldecoder;
pub mod walreceiver;
pub mod walredo;
@@ -63,7 +59,6 @@ pub struct PageServerConf {
#[derive(Debug, Clone, PartialEq)]
pub enum RepositoryFormat {
Layered,
RocksDb,
}
impl PageServerConf {


@@ -1,49 +0,0 @@
//!
//! Common structs shared by object_repository.rs and object_store.rs.
//!
use crate::relish::RelishTag;
use serde::{Deserialize, Serialize};
use zenith_utils::zid::ZTimelineId;
///
/// ObjectKey is the key type used to identify objects stored in an object
/// repository. It is shared between object_repository.rs and object_store.rs.
/// It is mostly opaque to the ObjectStore; it just stores and retrieves objects
/// using the key given by the caller.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ObjectKey {
pub timeline: ZTimelineId,
pub tag: ObjectTag,
}
///
/// ObjectTag is a part of ObjectKey that is specific to the type of
/// the stored object.
///
/// NB: the order of the enum values is significant! In particular,
/// rocksdb_storage.rs assumes that TimelineMetadataTag is first
///
/// Buffer is the kind of object that is accessible by the public
/// get_page_at_lsn() / put_page_image() / put_wal_record() functions in
/// the repository.rs interface. The rest are internal objects stored in
/// the key-value store, to store various metadata. They're not directly
/// accessible outside object_repository.rs
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum ObjectTag {
// dummy tag preceding all other keys
FirstTag,
// Metadata about a timeline. Not versioned.
TimelineMetadataTag,
// These objects store metadata about one relish. Currently it's used
// just to track the relish's size. It's not used for non-blocky relishes
// at all.
RelationMetadata(RelishTag),
// These are the pages exposed in the public Repository/Timeline interface.
Buffer(RelishTag, u32),
}

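The "order of the enum values is significant" note above deserves a concrete illustration: the derived Ord (and the byte-ordered key encoding built from it in rocksdb_storage.rs) sorts variants in declaration order, which is what the RocksDB key layout appears to rely on when seeking to the start of a timeline (see StorageKey::timeline_start and RocksObjectIter further down). A standalone sketch with a simplified stand-in enum, not the real ObjectTag, which carries RelishTag payloads and is serialized with BeSer:

    // Simplified stand-in for ObjectTag; payloads reduced to plain integers.
    #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
    enum DemoTag {
        FirstTag,            // dummy tag sorting before everything else
        TimelineMetadataTag, // the seek target for "start of timeline"
        RelationMetadata(u32),
        Buffer(u32, u32),
    }

    fn main() {
        // Derived Ord compares the variant (declaration order) before any payload.
        assert!(DemoTag::FirstTag < DemoTag::TimelineMetadataTag);
        assert!(DemoTag::TimelineMetadataTag < DemoTag::RelationMetadata(0));
        assert!(DemoTag::RelationMetadata(u32::MAX) < DemoTag::Buffer(0, 0));
        println!("variants sort in declaration order");
    }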
File diff suppressed because it is too large.


@@ -1,92 +0,0 @@
//! Low-level key-value storage abstraction.
//!
use crate::object_key::*;
use crate::relish::*;
use anyhow::Result;
use std::collections::HashSet;
use std::iter::Iterator;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::ZTimelineId;
///
/// Low-level storage abstraction.
///
/// All the data in the repository is stored in a key-value store. This trait
/// abstracts the details of the key-value store.
///
/// A simple key-value store would support just GET and PUT operations with
/// a key, but the upper layer needs slightly more complicated read operations
///
/// The most frequently used function is 'object_versions'. It is used
/// to look up a page version. It is LSN aware, in that the caller
/// specifies an LSN, and the function returns all values for that
/// block with the same or older LSN.
///
pub trait ObjectStore: Send + Sync {
///
/// Store a value with given key.
///
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()>;
/// Read entry with the exact given key.
///
/// This is used for retrieving metadata with special key that doesn't
/// correspond to any real relation.
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>>;
/// Read key greater or equal than specified
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>>;
/// Iterate through all page versions of one object.
///
/// Returns all page versions in descending LSN order, along with the LSN
/// of each page version.
fn object_versions<'a>(
&'a self,
key: &ObjectKey,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>>;
/// Iterate through versions of all objects in a timeline.
///
/// Returns objects in increasing key-version order.
/// Returns all versions up to and including the specified LSN.
fn objects<'a>(
&'a self,
timeline: ZTimelineId,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>>;
/// Iterate through all keys with given tablespace and database ID, and LSN <= 'lsn'.
/// Both dbnode and spcnode can be InvalidId (0) which means get all relations in tablespace/cluster
///
/// This is used to implement 'create database'
fn list_rels(
&self,
timelineid: ZTimelineId,
spcnode: u32,
dbnode: u32,
lsn: Lsn,
) -> Result<HashSet<RelTag>>;
/// Iterate through non-rel relishes
///
/// This is used to prepare tarball for new node startup.
fn list_nonrels<'a>(&'a self, timelineid: ZTimelineId, lsn: Lsn) -> Result<HashSet<RelishTag>>;
/// Iterate through object tags. If nonrel_only, then only non-relational data is iterated.
///
/// This is used to implement GC and preparing tarball for new node startup
/// Returns objects in increasing key-version order.
fn list_objects<'a>(
&'a self,
timelineid: ZTimelineId,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>>;
/// Unlink object (used by GC). This method may actually delete the object or just mark it for deletion.
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()>;
// Compact storage and remove versions marked for deletion
fn compact(&self);
}

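To make the role of object_versions concrete, here is a hedged sketch of how a caller could use this (now removed) interface to assemble the history of one page: walk versions downward from the requested LSN, collecting WAL records until a full page image is found, then hand both to the WAL redo service. The value encoding lives in object_repository.rs, which is not shown in this diff, so the is_page_image predicate is passed in rather than invented here.

    use crate::object_key::ObjectKey;
    use crate::object_store::ObjectStore;
    use zenith_utils::lsn::Lsn;

    // Hedged sketch, not part of this commit.
    fn collect_page_history(
        store: &dyn ObjectStore,
        key: &ObjectKey,
        lsn: Lsn,
        is_page_image: impl Fn(&[u8]) -> bool,
    ) -> anyhow::Result<(Vec<u8>, Vec<Vec<u8>>)> {
        let mut wal_records = Vec::new();
        // object_versions() yields versions with LSN <= `lsn`, newest first.
        for (_version_lsn, value) in store.object_versions(key, lsn)? {
            if is_page_image(value.as_slice()) {
                // Base image found; the WAL redo service would replay `wal_records` over it.
                return Ok((value, wal_records));
            }
            wal_records.push(value);
        }
        anyhow::bail!("no base page image found for {:?} at or before LSN {:#x}", key, lsn.0)
    }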

@@ -3,9 +3,7 @@
use crate::branches;
use crate::layered_repository::LayeredRepository;
use crate::object_repository::ObjectRepository;
use crate::repository::Repository;
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::PostgresRedoManager;
use crate::{PageServerConf, RepositoryFormat};
use anyhow::{anyhow, bail, Result};
@@ -43,16 +41,6 @@ pub fn init(conf: &'static PageServerConf) {
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
repo
}
RepositoryFormat::RocksDb => {
let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap();
Arc::new(ObjectRepository::new(
conf,
Arc::new(obj_store),
Arc::new(walredo_mgr),
tenantid,
))
}
};
info!("initialized storage for tenant: {}", &tenantid);


@@ -24,13 +24,13 @@ use std::{io, net::TcpStream};
use zenith_metrics::{register_histogram_vec, HistogramVec};
use zenith_utils::auth::{self, JwtAuth};
use zenith_utils::auth::{Claims, Scope};
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::postgres_backend::{self, AuthType};
use zenith_utils::pq_proto::{
BeMessage, FeMessage, RowDescriptor, HELLO_WORLD_ROW, SINGLE_COL_ROWDESC,
};
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use zenith_utils::lsn::Lsn;
use crate::basebackup;
use crate::branches;
@@ -596,15 +596,6 @@ impl postgres_backend::Handler for PageServerHandler {
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"n_relations"),
RowDescriptor::int8_col(b"truncated"),
RowDescriptor::int8_col(b"deleted"),
RowDescriptor::int8_col(b"prep_deleted"),
RowDescriptor::int8_col(b"slru_deleted"),
RowDescriptor::int8_col(b"chkp_deleted"),
RowDescriptor::int8_col(b"control_deleted"),
RowDescriptor::int8_col(b"filenodemap_deleted"),
RowDescriptor::int8_col(b"dropped"),
RowDescriptor::int8_col(b"snapshot_relfiles_total"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_branches"),
@@ -620,15 +611,6 @@ impl postgres_backend::Handler for PageServerHandler {
RowDescriptor::int8_col(b"elapsed"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(&result.n_relations.to_string().as_bytes()),
Some(&result.truncated.to_string().as_bytes()),
Some(&result.deleted.to_string().as_bytes()),
Some(&result.prep_deleted.to_string().as_bytes()),
Some(&result.slru_deleted.to_string().as_bytes()),
Some(&result.chkp_deleted.to_string().as_bytes()),
Some(&result.control_deleted.to_string().as_bytes()),
Some(&result.filenodemap_deleted.to_string().as_bytes()),
Some(&result.dropped.to_string().as_bytes()),
Some(&result.snapshot_relfiles_total.to_string().as_bytes()),
Some(
&result


@@ -55,20 +55,6 @@ pub trait Repository: Send + Sync {
///
#[derive(Default)]
pub struct GcResult {
// FIXME: These counters make sense for the ObjectRepository. They are not used
// by the LayeredRepository.
pub n_relations: u64,
pub inspected: u64,
pub truncated: u64,
pub deleted: u64,
pub prep_deleted: u64, // RelishTag::Twophase
pub slru_deleted: u64, // RelishTag::Slru
pub chkp_deleted: u64, // RelishTag::Checkpoint
pub control_deleted: u64, // RelishTag::ControlFile
pub filenodemap_deleted: u64, // RelishTag::FileNodeMap
pub dropped: u64,
// These are used for the LayeredRepository instead
pub snapshot_relfiles_total: u64,
pub snapshot_relfiles_needed_by_cutoff: u64,
pub snapshot_relfiles_needed_by_branches: u64,
@@ -88,11 +74,6 @@ pub struct GcResult {
impl AddAssign for GcResult {
fn add_assign(&mut self, other: Self) {
self.n_relations += other.n_relations;
self.truncated += other.truncated;
self.deleted += other.deleted;
self.dropped += other.dropped;
self.snapshot_relfiles_total += other.snapshot_relfiles_total;
self.snapshot_relfiles_needed_by_cutoff += other.snapshot_relfiles_needed_by_cutoff;
self.snapshot_relfiles_needed_by_branches += other.snapshot_relfiles_needed_by_branches;
@@ -241,8 +222,6 @@ impl WALRecord {
mod tests {
use super::*;
use crate::layered_repository::LayeredRepository;
use crate::object_repository::ObjectRepository;
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::{PageServerConf, RepositoryFormat};
use postgres_ffi::pg_constants;
@@ -279,10 +258,7 @@ mod tests {
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
fn get_test_repo(
test_name: &str,
repository_format: RepositoryFormat,
) -> Result<Box<dyn Repository>> {
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
@@ -299,7 +275,7 @@ mod tests {
pg_distrib_dir: "".into(),
auth_type: AuthType::Trust,
auth_validation_public_key_path: None,
repository_format,
repository_format: RepositoryFormat::Layered,
};
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
@@ -315,35 +291,14 @@ mod tests {
Arc::new(walredo_mgr),
tenantid,
)),
RepositoryFormat::RocksDb => {
let obj_store = RocksObjectStore::create(conf, &tenantid)?;
Box::new(ObjectRepository::new(
conf,
Arc::new(obj_store),
Arc::new(walredo_mgr),
tenantid,
))
}
};
Ok(repo)
}
/// Test get_relsize() and truncation.
#[test]
fn test_relsize_rocksdb() -> Result<()> {
let repo = get_test_repo("test_relsize_rocksdb", RepositoryFormat::RocksDb)?;
test_relsize(&*repo)
}
#[test]
fn test_relsize_layered() -> Result<()> {
let repo = get_test_repo("test_relsize_layered", RepositoryFormat::Layered)?;
test_relsize(&*repo)
}
fn test_relsize(repo: &dyn Repository) -> Result<()> {
fn test_relsize() -> Result<()> {
let repo = get_test_repo("test_relsize")?;
// get_timeline() with non-existent timeline id should fail
//repo.get_timeline("11223344556677881122334455667788");
@@ -428,22 +383,9 @@ mod tests {
/// Test get_relsize() and truncation with a file larger than 1 GB, so that it's
/// split into multiple 1 GB segments in Postgres.
///
/// This isn't very interesting with the RocksDb implementation, as we don't pay
/// any attention to Postgres segment boundaries there.
#[test]
fn test_large_rel_rocksdb() -> Result<()> {
let repo = get_test_repo("test_large_rel_rocksdb", RepositoryFormat::RocksDb)?;
test_large_rel(&*repo)
}
#[test]
fn test_large_rel_layered() -> Result<()> {
let repo = get_test_repo("test_large_rel_layered", RepositoryFormat::Layered)?;
test_large_rel(&*repo)
}
fn test_large_rel(repo: &dyn Repository) -> Result<()> {
fn test_large_rel() -> Result<()> {
let repo = get_test_repo("test_large_rel")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
@@ -498,22 +440,12 @@ mod tests {
Ok(())
}
#[test]
fn test_branch_rocksdb() -> Result<()> {
let repo = get_test_repo("test_branch_rocksdb", RepositoryFormat::RocksDb)?;
test_branch(&*repo)
}
#[test]
fn test_branch_layered() -> Result<()> {
let repo = get_test_repo("test_branch_layered", RepositoryFormat::Layered)?;
test_branch(&*repo)
}
///
/// Test branch creation
///
fn test_branch(repo: &dyn Repository) -> Result<()> {
#[test]
fn test_branch() -> Result<()> {
let repo = get_test_repo("test_branch")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;


@@ -1,475 +0,0 @@
//!
//! An implementation of the ObjectStore interface, backed by RocksDB
//!
use crate::object_key::*;
use crate::object_store::ObjectStore;
use crate::relish::*;
use crate::PageServerConf;
use anyhow::{bail, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::sync::{Arc, Mutex};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::ZTenantId;
use zenith_utils::zid::ZTimelineId;
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StorageKey {
obj_key: ObjectKey,
lsn: Lsn,
}
impl StorageKey {
/// The first key for a given timeline
fn timeline_start(timeline: ZTimelineId) -> Self {
Self {
obj_key: ObjectKey {
timeline,
tag: ObjectTag::TimelineMetadataTag,
},
lsn: Lsn(0),
}
}
}
///
/// RocksDB deletes individual records very inefficiently. Instead, we use a compaction
/// filter, which lets us throw records away during the LSM merge phase.
/// Unfortunately, it is hard (if possible at all) to determine at merge time whether a
/// version can be removed. A version can be removed if:
/// 1. It is beyond the PITR horizon (we need the current LSN and gc_horizon from the config)
/// 2. The page has been reconstructed at the horizon (all WAL records above the horizon have been applied and can be removed)
///
/// So we have a GC process which reconstructs pages at the horizon and marks the obsolete WAL
/// records for deletion. To mark an object for deletion we could set a flag in the object itself,
/// but that is complicated with the new object value format, because the RocksDB storage knows
/// nothing about this format. Also, updating a whole record just to set one bit would be
/// inefficient in any case. This is why we keep the keys of versions marked for deletion in an
/// in-memory HashSet. When the LSM compaction filter finds a key in this set, it drops the record
/// and removes the key from the set, preventing the set from growing without bound.
///
struct GarbageCollector {
garbage: Mutex<HashSet<Vec<u8>>>,
}
impl GarbageCollector {
fn new() -> GarbageCollector {
GarbageCollector {
garbage: Mutex::new(HashSet::new()),
}
}
/// Called by GC to mark a version for deletion
fn mark_for_deletion(&self, key: &[u8]) {
let mut garbage = self.garbage.lock().unwrap();
garbage.insert(key.to_vec());
}
/// Called by the LSM compaction filter. If the key is found in the set, it is removed from
/// the set and `true` is returned, so that the filter drops the record.
fn was_deleted(&self, key: &[u8]) -> bool {
let key = key.to_vec();
let mut garbage = self.garbage.lock().unwrap();
garbage.remove(&key)
}
}
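// (Editor's note, not in the original file.) The deferred-deletion protocol in a nutshell:
// GC serializes a StorageKey and calls mark_for_deletion(); later, when compaction reaches
// that record, the filter installed in RocksObjectStore::new() calls was_deleted(), which
// consumes the mark exactly once and makes the filter drop the record:
//
//     let gc = GarbageCollector::new();
//     gc.mark_for_deletion(b"serialized StorageKey bytes");
//     assert!(gc.was_deleted(b"serialized StorageKey bytes"));   // record is dropped
//     assert!(!gc.was_deleted(b"serialized StorageKey bytes"));  // mark already consumed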
pub struct RocksObjectStore {
_conf: &'static PageServerConf,
// RocksDB handle
db: rocksdb::DB,
gc: Arc<GarbageCollector>,
}
impl ObjectStore for RocksObjectStore {
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>> {
let val = self.db.get(StorageKey::ser(&StorageKey {
obj_key: key.clone(),
lsn,
})?)?;
if let Some(val) = val {
Ok(val)
} else {
bail!("could not find page {:?}", key);
}
}
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
let mut iter = self.db.raw_iterator();
let search_key = StorageKey {
obj_key: key.clone(),
lsn: Lsn(0),
};
iter.seek(search_key.ser()?);
if !iter.valid() {
Ok(None)
} else {
let key = StorageKey::des(iter.key().unwrap())?;
Ok(Some(key.obj_key.clone()))
}
}
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
self.db.put(
StorageKey::ser(&StorageKey {
obj_key: key.clone(),
lsn,
})?,
value,
)?;
Ok(())
}
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()> {
self.gc.mark_for_deletion(&StorageKey::ser(&StorageKey {
obj_key: key.clone(),
lsn,
})?);
Ok(())
}
/// Iterate through page versions of given page, starting from the given LSN.
/// The versions are walked in descending LSN order.
fn object_versions<'a>(
&'a self,
key: &ObjectKey,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>> {
let iter = RocksObjectVersionIter::new(&self.db, key, lsn)?;
Ok(Box::new(iter))
}
/// Iterate through all timeline objects
fn list_objects<'a>(
&'a self,
timeline: ZTimelineId,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>> {
let iter = RocksObjectIter::new(&self.db, timeline, lsn)?;
Ok(Box::new(iter))
}
/// Get a list of all distinct relations in given tablespace and database.
///
/// TODO: This implementation is very inefficient, it scans
/// through all entries in the given database. In practice, this
/// is used for CREATE DATABASE, and usually the template database is small.
/// But if it's not, this will be slow.
fn list_rels(
&self,
timelineid: ZTimelineId,
spcnode: u32,
dbnode: u32,
lsn: Lsn,
) -> Result<HashSet<RelTag>> {
// FIXME: This scans everything. Very slow
let mut rels: HashSet<RelTag> = HashSet::new();
let mut search_rel_tag = RelTag {
spcnode,
dbnode,
relnode: 0,
forknum: 0u8,
};
let mut iter = self.db.raw_iterator();
loop {
let search_key = StorageKey {
obj_key: ObjectKey {
timeline: timelineid,
tag: ObjectTag::RelationMetadata(RelishTag::Relation(search_rel_tag)),
},
lsn: Lsn(0),
};
iter.seek(search_key.ser()?);
if !iter.valid() {
break;
}
let key = StorageKey::des(iter.key().unwrap())?;
if let ObjectTag::RelationMetadata(RelishTag::Relation(rel_tag)) = key.obj_key.tag {
if spcnode != 0 && rel_tag.spcnode != spcnode
|| dbnode != 0 && rel_tag.dbnode != dbnode
{
break;
}
if key.lsn <= lsn {
// visible in this snapshot
rels.insert(rel_tag);
}
search_rel_tag = rel_tag;
// skip to next relation
// FIXME: What if relnode is u32::MAX ?
search_rel_tag.relnode += 1;
} else {
// no more relation metadata entries
break;
}
}
Ok(rels)
}
/// Get a list of all distinct NON-relations in timeline
/// that are visible at given lsn.
///
/// TODO: This implementation is very inefficient, it scans
/// through all non-rel page versions in the system. In practice, this
/// is used when initializing a new compute node, and the non-rel files
/// are never very large nor change very frequently, so this will do for now.
fn list_nonrels(&self, timelineid: ZTimelineId, lsn: Lsn) -> Result<HashSet<RelishTag>> {
let mut rels: HashSet<RelishTag> = HashSet::new();
let search_key = StorageKey {
obj_key: ObjectKey {
timeline: timelineid,
tag: ObjectTag::Buffer(FIRST_NONREL_RELISH_TAG, 0),
},
lsn: Lsn(0),
};
let mut iter = self.db.raw_iterator();
iter.seek(search_key.ser()?);
while iter.valid() {
let key = StorageKey::des(iter.key().unwrap())?;
if key.obj_key.timeline != timelineid {
// reached end of this timeline in the store
break;
}
if let ObjectTag::Buffer(rel_tag, _blknum) = key.obj_key.tag {
if key.lsn <= lsn {
// visible in this snapshot
rels.insert(rel_tag);
}
}
// TODO: we could skip to next relation here like we do in list_rels(),
// but hopefully there are not that many SLRU segments or other non-rel
// entries for it to matter.
iter.next();
}
Ok(rels)
}
/// Iterate through versions of all objects in a timeline.
///
/// Returns objects in increasing key-version order.
/// Returns all versions up to and including the specified LSN.
fn objects<'a>(
&'a self,
timeline: ZTimelineId,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>> {
let start_key = StorageKey::timeline_start(timeline);
let start_key_bytes = StorageKey::ser(&start_key)?;
let iter = self.db.iterator(rocksdb::IteratorMode::From(
&start_key_bytes,
rocksdb::Direction::Forward,
));
Ok(Box::new(RocksObjects {
iter,
timeline,
lsn,
}))
}
fn compact(&self) {
self.db.compact_range::<&[u8], &[u8]>(None, None);
}
}
impl RocksObjectStore {
/// Open a RocksDB database.
pub fn open(conf: &'static PageServerConf, tenantid: &ZTenantId) -> Result<RocksObjectStore> {
let opts = Self::get_rocksdb_opts();
let obj_store = Self::new(conf, opts, tenantid)?;
Ok(obj_store)
}
/// Create a new, empty RocksDB database.
pub fn create(conf: &'static PageServerConf, tenantid: &ZTenantId) -> Result<RocksObjectStore> {
let path = conf.tenant_path(&tenantid).join("rocksdb-storage");
std::fs::create_dir(&path)?;
let mut opts = Self::get_rocksdb_opts();
opts.create_if_missing(true);
opts.set_error_if_exists(true);
let obj_store = Self::new(conf, opts, tenantid)?;
Ok(obj_store)
}
fn new(
conf: &'static PageServerConf,
mut opts: rocksdb::Options,
tenantid: &ZTenantId,
) -> Result<RocksObjectStore> {
let path = conf.tenant_path(&tenantid).join("rocksdb-storage");
let gc = Arc::new(GarbageCollector::new());
let gc_ref = gc.clone();
opts.set_compaction_filter("ttl", move |_level: u32, key: &[u8], _val: &[u8]| {
if gc_ref.was_deleted(key) {
rocksdb::compaction_filter::Decision::Remove
} else {
rocksdb::compaction_filter::Decision::Keep
}
});
let db = rocksdb::DB::open(&opts, &path)?;
let obj_store = RocksObjectStore {
_conf: conf,
db,
gc,
};
Ok(obj_store)
}
/// common options used by `open` and `create`
fn get_rocksdb_opts() -> rocksdb::Options {
let mut opts = rocksdb::Options::default();
opts.set_use_fsync(true);
opts.set_compression_type(rocksdb::DBCompressionType::Lz4);
opts
}
}
///
/// Iterator for `object_versions`. Returns all page versions of a given block, in
/// reverse LSN order.
///
struct RocksObjectVersionIter<'a> {
obj_key: ObjectKey,
dbiter: rocksdb::DBRawIterator<'a>,
first_call: bool,
}
impl<'a> RocksObjectVersionIter<'a> {
fn new(
db: &'a rocksdb::DB,
obj_key: &ObjectKey,
lsn: Lsn,
) -> Result<RocksObjectVersionIter<'a>> {
let key = StorageKey {
obj_key: obj_key.clone(),
lsn,
};
let mut dbiter = db.raw_iterator();
dbiter.seek_for_prev(StorageKey::ser(&key)?); // locate last entry
Ok(RocksObjectVersionIter {
first_call: true,
obj_key: obj_key.clone(),
dbiter,
})
}
}
impl<'a> Iterator for RocksObjectVersionIter<'a> {
type Item = (Lsn, Vec<u8>);
fn next(&mut self) -> std::option::Option<Self::Item> {
if self.first_call {
self.first_call = false;
} else {
self.dbiter.prev(); // walk backwards
}
if !self.dbiter.valid() {
return None;
}
let key = StorageKey::des(self.dbiter.key().unwrap()).unwrap();
if key.obj_key.tag != self.obj_key.tag {
return None;
}
let val = self.dbiter.value().unwrap();
let result = val.to_vec();
Some((key.lsn, result))
}
}
struct RocksObjects<'r> {
iter: rocksdb::DBIterator<'r>,
timeline: ZTimelineId,
lsn: Lsn,
}
impl<'r> Iterator for RocksObjects<'r> {
// TODO consider returning Box<[u8]>
type Item = Result<(ObjectTag, Lsn, Vec<u8>)>;
fn next(&mut self) -> Option<Self::Item> {
self.next_result().transpose()
}
}
impl<'r> RocksObjects<'r> {
fn next_result(&mut self) -> Result<Option<(ObjectTag, Lsn, Vec<u8>)>> {
for (key_bytes, v) in &mut self.iter {
let key = StorageKey::des(&key_bytes)?;
if key.obj_key.timeline != self.timeline {
return Ok(None);
}
if key.lsn > self.lsn {
// TODO can speed up by seeking iterator
continue;
}
return Ok(Some((key.obj_key.tag, key.lsn, v.to_vec())));
}
Ok(None)
}
}
///
/// Iterator for `list_objects`. Returns all objects with versions at or before the specified LSN
///
struct RocksObjectIter<'a> {
timeline: ZTimelineId,
key: StorageKey,
lsn: Lsn,
dbiter: rocksdb::DBRawIterator<'a>,
}
impl<'a> RocksObjectIter<'a> {
fn new(db: &'a rocksdb::DB, timeline: ZTimelineId, lsn: Lsn) -> Result<RocksObjectIter<'a>> {
let key = StorageKey {
obj_key: ObjectKey {
timeline,
tag: ObjectTag::FirstTag,
},
lsn: Lsn(0),
};
let dbiter = db.raw_iterator();
Ok(RocksObjectIter {
key,
timeline,
lsn,
dbiter,
})
}
}
impl<'a> Iterator for RocksObjectIter<'a> {
type Item = ObjectTag;
fn next(&mut self) -> std::option::Option<Self::Item> {
loop {
self.dbiter.seek(StorageKey::ser(&self.key).unwrap());
if !self.dbiter.valid() {
return None;
}
let key = StorageKey::des(self.dbiter.key().unwrap()).unwrap();
if key.obj_key.timeline != self.timeline {
// End of this timeline
return None;
}
self.key = key.clone();
self.key.lsn = Lsn(u64::MAX); // next seek should skip all versions
if key.lsn <= self.lsn {
// visible in this snapshot
return Some(key.obj_key.tag);
}
}
}
}

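The whole file hinges on one ordering trick: every version is stored under a serialized (ObjectKey, Lsn) pair, so keys for the same object sort together and by LSN within the object, and "find the versions visible at LSN X" becomes "seek to (key, X) and walk backwards". A standalone illustration of the same idea, with a plain BTreeMap standing in for RocksDB and simplified keys rather than the real StorageKey encoding:

    use std::collections::BTreeMap;

    fn main() {
        // Keys sort by object id first, then LSN, mirroring StorageKey { obj_key, lsn }.
        let mut store: BTreeMap<(u32, u64), &str> = BTreeMap::new();
        store.insert((7, 0x10), "base page image");
        store.insert((7, 0x20), "wal record");
        store.insert((7, 0x30), "wal record (newer than the request)");
        store.insert((8, 0x05), "a different object");

        // Equivalent of RocksObjectVersionIter: everything for object 7 up to LSN 0x25,
        // walked newest-first.
        let request_lsn: u64 = 0x25;
        for ((_, lsn), value) in store.range((7u32, 0u64)..=(7u32, request_lsn)).rev() {
            println!("LSN {:#x}: {}", lsn, value);
        }
        // Prints the version at 0x20, then 0x10 -- descending LSN order, all <= 0x25.
    }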

@@ -8,7 +8,7 @@ use crate::page_cache;
use crate::relish::*;
use crate::restore_local_repo;
use crate::waldecoder::*;
use crate::{PageServerConf, RepositoryFormat};
use crate::PageServerConf;
use anyhow::{Error, Result};
use lazy_static::lazy_static;
use log::*;
@@ -264,12 +264,6 @@ fn walreceiver_main(
)?;
if newest_segno - oldest_segno >= 10 {
// FIXME: The layered repository performs checkpointing in a separate thread, so this
// isn't needed anymore. Remove 'checkpoint' from the Timeline trait altogether?
if conf.repository_format == RepositoryFormat::RocksDb {
timeline.checkpoint()?;
}
// TODO: This is where we could remove WAL older than last_rec_lsn.
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
}


@@ -1,105 +0,0 @@
import pytest
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
import psycopg2.extras
pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test Garbage Collection of old page versions.
#
# This test is pretty tightly coupled with the current implementation of page version storage
# and garbage collection in object_repository.rs.
#
@pytest.mark.skip(reason=""""
The current GC test is flaky and overly strict. Since we are migrating to the layered repo
format with a different GC implementation, let's just silence this test for now. This test
only works with the RocksDB implementation.
""")
def test_gc(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
zenith_cli.run(["branch", "test_gc", "empty"])
pg = postgres.create_start('test_gc')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
with closing(pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:
# Get the timeline ID of our branch. We need it for the 'do_gc' command
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
# Create a test table
cur.execute("CREATE TABLE foo(x integer)")
# Run GC, to clear out any old page versions left behind in the catalogs by
# the CREATE TABLE command. We want to have a clean slate with no garbage
# before running the actual tests below, otherwise the counts won't match
# what we expect.
print("Running GC before test")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
# remember the number of relations
n_relations = row['n_relations']
assert n_relations > 0
# Insert a row. The first insert will also create a metadata entry for the
# relation, with size == 1 block. Hence, bump up the expected relation count.
n_relations += 1
print("Inserting one row and running GC")
cur.execute("INSERT INTO foo VALUES (1)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
assert row['n_relations'] == n_relations
assert row['dropped'] == 0
assert row['truncated'] == 31
assert row['deleted'] == 4
# Insert two more rows and run GC.
print("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
assert row['n_relations'] == n_relations
assert row['dropped'] == 0
assert row['truncated'] == 31
assert row['deleted'] == 4
# Insert one more row. It creates one more page version, but doesn't affect the
# relation size.
print("Inserting one more row")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
assert row['n_relations'] == n_relations
assert row['dropped'] == 0
assert row['truncated'] == 31
assert row['deleted'] == 2
# Run GC again, with no changes in the database. Should not remove anything.
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
assert row['n_relations'] == n_relations
assert row['dropped'] == 0
assert row['truncated'] == 31
assert row['deleted'] == 0
#
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
#
cur.execute("DROP TABLE foo")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
# Each relation fork is counted separately, hence 3.
assert row['dropped'] == 3


@@ -67,7 +67,7 @@ fn main() -> Result<()> {
.long("repository-format")
.takes_value(false)
.value_name("repository-format")
.help("Choose repository format, 'layered' or 'rocksdb'")
.help("Choose repository format, only 'layered' supported currently")
),
)
.subcommand(