mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-14 00:42:54 +00:00
Remove rocksdb implementation.
The layered storage format is good enough that we don't need the rocksdb implementation anymore. There are a lot of known issues but we'll keep working on them.
This commit is contained in:
87
Cargo.lock
generated
87
Cargo.lock
generated
@@ -152,25 +152,6 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bindgen"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fd4865004a46a0aafb2a0a5eb19d3c9fc46ee5f063a6cfc605c69ac9ecf5263d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cexpr 0.4.0",
|
||||
"clang-sys",
|
||||
"lazy_static",
|
||||
"lazycell",
|
||||
"peeking_take_while",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"regex",
|
||||
"rustc-hash",
|
||||
"shlex 0.1.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bindgen"
|
||||
version = "0.59.1"
|
||||
@@ -178,7 +159,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "453c49e5950bb0eb63bb3df640e31618846c89d5b7faa54040d76e98e0134375"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cexpr 0.5.0",
|
||||
"cexpr",
|
||||
"clang-sys",
|
||||
"clap",
|
||||
"env_logger",
|
||||
@@ -190,7 +171,7 @@ dependencies = [
|
||||
"quote",
|
||||
"regex",
|
||||
"rustc-hash",
|
||||
"shlex 1.0.0",
|
||||
"shlex",
|
||||
"which",
|
||||
]
|
||||
|
||||
@@ -265,18 +246,6 @@ name = "cc"
|
||||
version = "1.0.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e70cc2f62c6ce1868963827bd677764c62d07c3d9a3e1fb1177ee1a9ab199eb2"
|
||||
dependencies = [
|
||||
"jobserver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cexpr"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27"
|
||||
dependencies = [
|
||||
"nom 5.1.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cexpr"
|
||||
@@ -284,7 +253,7 @@ version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db507a7679252d2276ed0dd8113c6875ec56d3089f9225b2b42c30cc1f8e5c89"
|
||||
dependencies = [
|
||||
"nom 6.1.2",
|
||||
"nom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -903,15 +872,6 @@ version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
|
||||
|
||||
[[package]]
|
||||
name = "jobserver"
|
||||
version = "0.1.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "972f5ae5d1cb9c6ae417789196c803205313edde988685da5e3aae0827b9e7fd"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.51"
|
||||
@@ -963,18 +923,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "librocksdb-sys"
|
||||
version = "6.17.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5da125e1c0f22c7cae785982115523a0738728498547f415c9054cb17c7e89f9"
|
||||
dependencies = [
|
||||
"bindgen 0.57.0",
|
||||
"cc",
|
||||
"glob",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.4"
|
||||
@@ -1109,16 +1057,6 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "5.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "6.1.2"
|
||||
@@ -1260,7 +1198,6 @@ dependencies = [
|
||||
"postgres_ffi",
|
||||
"rand",
|
||||
"regex",
|
||||
"rocksdb",
|
||||
"routerify",
|
||||
"rust-s3",
|
||||
"scopeguard",
|
||||
@@ -1442,7 +1379,7 @@ name = "postgres_ffi"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bindgen 0.59.1",
|
||||
"bindgen",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
@@ -1683,16 +1620,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rocksdb"
|
||||
version = "0.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c749134fda8bfc90d0de643d59bfc841dcb3ac8a1062e12b6754bd60235c48b3"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"librocksdb-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "routerify"
|
||||
version = "2.2.0"
|
||||
@@ -1916,12 +1843,6 @@ dependencies = [
|
||||
"opaque-debug",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2"
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "1.0.0"
|
||||
|
||||
@@ -42,7 +42,7 @@ pub struct LocalEnv {
|
||||
#[serde(with = "hex")]
|
||||
pub tenantid: ZTenantId,
|
||||
|
||||
// Repository format, 'rocksdb' or 'layered' or None for default
|
||||
// Repository format, only 'layered' supported currently, or None for default
|
||||
pub repository_format: Option<String>,
|
||||
|
||||
// jwt auth token used for communication with pageserver
|
||||
|
||||
@@ -30,8 +30,6 @@ tokio-stream = { version = "0.1.5" }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
# by default rust-rocksdb tries to build a lot of compression algos. Use lz4 only for now as it is simplest dependency.
|
||||
rocksdb = { version = "0.16.0", features = ["lz4"], default-features = false }
|
||||
routerify = "2"
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
|
||||
@@ -74,9 +74,8 @@ Repository corresponds to one .zenith directory.
|
||||
Repository is needed to manage Timelines.
|
||||
Each repository has associated WAL redo service.
|
||||
|
||||
Now we have two implementations of Repository:
|
||||
- ObjectRepository uses RocksDB as a storage
|
||||
- LayeredRepository uses custom storage format, described in layered_repository/README.md
|
||||
There is currently only one implementation of the Repository trait: LayeredRepository.
|
||||
It uses custom storage format, described in layered_repository/README.md
|
||||
|
||||
#### Timeline
|
||||
Timeline is a page cache workhorse that accepts page changes
|
||||
@@ -89,13 +88,6 @@ Each Branch lives in a corresponding timeline and has an ancestor.
|
||||
|
||||
To get full snapshot of data at certain moment we need to traverse timeline and its ancestors.
|
||||
|
||||
#### ObjectRepository
|
||||
ObjectRepository implements Repository and has associated ObjectStore and WAL redo service.
|
||||
|
||||
#### ObjectStore
|
||||
ObjectStore is an interface for key-value store for page images and wal records.
|
||||
Currently it has one implementation - RocksDB.
|
||||
|
||||
#### WAL redo service
|
||||
WAL redo service - service that runs PostgreSQL in a special wal_redo mode
|
||||
to apply given WAL records over an old page image and return new page image.
|
||||
|
||||
@@ -138,10 +138,9 @@ impl CfgFileParams {
|
||||
}
|
||||
|
||||
let repository_format = match self.repository_format.as_ref() {
|
||||
Some(repo_format_str) if repo_format_str == "rocksdb" => RepositoryFormat::RocksDb,
|
||||
Some(repo_format_str) if repo_format_str == "layered" => RepositoryFormat::Layered,
|
||||
Some(repo_format_str) => anyhow::bail!(
|
||||
"invalid --repository-format '{}', must be 'rocksdb' or 'layered'",
|
||||
"invalid --repository-format '{}', only 'layered' supported",
|
||||
repo_format_str
|
||||
),
|
||||
None => RepositoryFormat::Layered, // default
|
||||
@@ -240,7 +239,7 @@ fn main() -> Result<()> {
|
||||
Arg::with_name("repository-format")
|
||||
.long("repository-format")
|
||||
.takes_value(true)
|
||||
.help("Which repository implementation to use, 'rocksdb' or 'layered'"),
|
||||
.help("Which repository implementation to use, only 'layered' supported currently"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
|
||||
@@ -20,7 +20,6 @@ use log::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::logger;
|
||||
use crate::object_repository::ObjectRepository;
|
||||
use crate::page_cache;
|
||||
use crate::restore_local_repo;
|
||||
use crate::walredo::WalRedoManager;
|
||||
@@ -102,16 +101,6 @@ pub fn create_repo(
|
||||
RepositoryFormat::Layered => Arc::new(
|
||||
crate::layered_repository::LayeredRepository::new(conf, wal_redo_manager, tenantid),
|
||||
),
|
||||
RepositoryFormat::RocksDb => {
|
||||
let obj_store = crate::rocksdb_storage::RocksObjectStore::create(conf, &tenantid)?;
|
||||
|
||||
Arc::new(ObjectRepository::new(
|
||||
conf,
|
||||
Arc::new(obj_store),
|
||||
wal_redo_manager,
|
||||
tenantid,
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
// Load data into pageserver
|
||||
|
||||
@@ -12,15 +12,11 @@ pub mod branches;
|
||||
pub mod http;
|
||||
pub mod layered_repository;
|
||||
pub mod logger;
|
||||
pub mod object_key;
|
||||
pub mod object_repository;
|
||||
pub mod object_store;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod relish;
|
||||
pub mod repository;
|
||||
pub mod restore_local_repo;
|
||||
pub mod rocksdb_storage;
|
||||
pub mod waldecoder;
|
||||
pub mod walreceiver;
|
||||
pub mod walredo;
|
||||
@@ -63,7 +59,6 @@ pub struct PageServerConf {
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub enum RepositoryFormat {
|
||||
Layered,
|
||||
RocksDb,
|
||||
}
|
||||
|
||||
impl PageServerConf {
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
//!
|
||||
//! Common structs shared by object_repository.rs and object_store.rs.
|
||||
//!
|
||||
|
||||
use crate::relish::RelishTag;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
///
|
||||
/// ObjectKey is the key type used to identify objects stored in an object
|
||||
/// repository. It is shared between object_repository.rs and object_store.rs.
|
||||
/// It is mostly opaque to ObjectStore, it just stores and retrieves objects
|
||||
/// using the key given by the caller.
|
||||
///
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ObjectKey {
|
||||
pub timeline: ZTimelineId,
|
||||
pub tag: ObjectTag,
|
||||
}
|
||||
|
||||
///
|
||||
/// ObjectTag is a part of ObjectKey that is specific to the type of
|
||||
/// the stored object.
|
||||
///
|
||||
/// NB: the order of the enum values is significant! In particular,
|
||||
/// rocksdb_storage.rs assumes that TimelineMetadataTag is first
|
||||
///
|
||||
/// Buffer is the kind of object that is accessible by the public
|
||||
/// get_page_at_lsn() / put_page_image() / put_wal_record() functions in
|
||||
/// the repository.rs interface. The rest are internal objects stored in
|
||||
/// the key-value store, to store various metadata. They're not directly
|
||||
/// accessible outside object_repository.rs
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum ObjectTag {
|
||||
// dummy tag preceeding all other keys
|
||||
FirstTag,
|
||||
|
||||
// Metadata about a timeline. Not versioned.
|
||||
TimelineMetadataTag,
|
||||
|
||||
// These objects store metadata about one relish. Currently it's used
|
||||
// just to track the relish's size. It's not used for non-blocky relishes
|
||||
// at all.
|
||||
RelationMetadata(RelishTag),
|
||||
|
||||
// These are the pages exposed in the public Repository/Timeline interface.
|
||||
Buffer(RelishTag, u32),
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,92 +0,0 @@
|
||||
//! Low-level key-value storage abstraction.
|
||||
//!
|
||||
use crate::object_key::*;
|
||||
use crate::relish::*;
|
||||
use anyhow::Result;
|
||||
use std::collections::HashSet;
|
||||
use std::iter::Iterator;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
///
|
||||
/// Low-level storage abstraction.
|
||||
///
|
||||
/// All the data in the repository is stored in a key-value store. This trait
|
||||
/// abstracts the details of the key-value store.
|
||||
///
|
||||
/// A simple key-value store would support just GET and PUT operations with
|
||||
/// a key, but the upper layer needs slightly complicated read operations
|
||||
///
|
||||
/// The most frequently used function is 'object_versions'. It is used
|
||||
/// to look up a page version. It is LSN aware, in that the caller
|
||||
/// specifies an LSN, and the function returns all values for that
|
||||
/// block with the same or older LSN.
|
||||
///
|
||||
pub trait ObjectStore: Send + Sync {
|
||||
///
|
||||
/// Store a value with given key.
|
||||
///
|
||||
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()>;
|
||||
|
||||
/// Read entry with the exact given key.
|
||||
///
|
||||
/// This is used for retrieving metadata with special key that doesn't
|
||||
/// correspond to any real relation.
|
||||
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>>;
|
||||
|
||||
/// Read key greater or equal than specified
|
||||
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>>;
|
||||
|
||||
/// Iterate through all page versions of one object.
|
||||
///
|
||||
/// Returns all page versions in descending LSN order, along with the LSN
|
||||
/// of each page version.
|
||||
fn object_versions<'a>(
|
||||
&'a self,
|
||||
key: &ObjectKey,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>>;
|
||||
|
||||
/// Iterate through versions of all objects in a timeline.
|
||||
///
|
||||
/// Returns objects in increasing key-version order.
|
||||
/// Returns all versions up to and including the specified LSN.
|
||||
fn objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>>;
|
||||
|
||||
/// Iterate through all keys with given tablespace and database ID, and LSN <= 'lsn'.
|
||||
/// Both dbnode and spcnode can be InvalidId (0) which means get all relations in tablespace/cluster
|
||||
///
|
||||
/// This is used to implement 'create database'
|
||||
fn list_rels(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<HashSet<RelTag>>;
|
||||
|
||||
/// Iterate through non-rel relishes
|
||||
///
|
||||
/// This is used to prepare tarball for new node startup.
|
||||
fn list_nonrels<'a>(&'a self, timelineid: ZTimelineId, lsn: Lsn) -> Result<HashSet<RelishTag>>;
|
||||
|
||||
/// Iterate through objects tags. If nonrel_only, then only non-relationa data is iterated.
|
||||
///
|
||||
/// This is used to implement GC and preparing tarball for new node startup
|
||||
/// Returns objects in increasing key-version order.
|
||||
fn list_objects<'a>(
|
||||
&'a self,
|
||||
timelineid: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>>;
|
||||
|
||||
/// Unlink object (used by GC). This mehod may actually delete object or just mark it for deletion.
|
||||
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()>;
|
||||
|
||||
// Compact storage and remove versions marged for deletion
|
||||
fn compact(&self);
|
||||
}
|
||||
@@ -3,9 +3,7 @@
|
||||
|
||||
use crate::branches;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::object_repository::ObjectRepository;
|
||||
use crate::repository::Repository;
|
||||
use crate::rocksdb_storage::RocksObjectStore;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::{PageServerConf, RepositoryFormat};
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
@@ -43,16 +41,6 @@ pub fn init(conf: &'static PageServerConf) {
|
||||
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
|
||||
repo
|
||||
}
|
||||
RepositoryFormat::RocksDb => {
|
||||
let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap();
|
||||
|
||||
Arc::new(ObjectRepository::new(
|
||||
conf,
|
||||
Arc::new(obj_store),
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid,
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
info!("initialized storage for tenant: {}", &tenantid);
|
||||
|
||||
@@ -24,13 +24,13 @@ use std::{io, net::TcpStream};
|
||||
use zenith_metrics::{register_histogram_vec, HistogramVec};
|
||||
use zenith_utils::auth::{self, JwtAuth};
|
||||
use zenith_utils::auth::{Claims, Scope};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::PostgresBackend;
|
||||
use zenith_utils::postgres_backend::{self, AuthType};
|
||||
use zenith_utils::pq_proto::{
|
||||
BeMessage, FeMessage, RowDescriptor, HELLO_WORLD_ROW, SINGLE_COL_ROWDESC,
|
||||
};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::basebackup;
|
||||
use crate::branches;
|
||||
@@ -596,15 +596,6 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"n_relations"),
|
||||
RowDescriptor::int8_col(b"truncated"),
|
||||
RowDescriptor::int8_col(b"deleted"),
|
||||
RowDescriptor::int8_col(b"prep_deleted"),
|
||||
RowDescriptor::int8_col(b"slru_deleted"),
|
||||
RowDescriptor::int8_col(b"chkp_deleted"),
|
||||
RowDescriptor::int8_col(b"control_deleted"),
|
||||
RowDescriptor::int8_col(b"filenodemap_deleted"),
|
||||
RowDescriptor::int8_col(b"dropped"),
|
||||
RowDescriptor::int8_col(b"snapshot_relfiles_total"),
|
||||
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_branches"),
|
||||
@@ -620,15 +611,6 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
RowDescriptor::int8_col(b"elapsed"),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(&result.n_relations.to_string().as_bytes()),
|
||||
Some(&result.truncated.to_string().as_bytes()),
|
||||
Some(&result.deleted.to_string().as_bytes()),
|
||||
Some(&result.prep_deleted.to_string().as_bytes()),
|
||||
Some(&result.slru_deleted.to_string().as_bytes()),
|
||||
Some(&result.chkp_deleted.to_string().as_bytes()),
|
||||
Some(&result.control_deleted.to_string().as_bytes()),
|
||||
Some(&result.filenodemap_deleted.to_string().as_bytes()),
|
||||
Some(&result.dropped.to_string().as_bytes()),
|
||||
Some(&result.snapshot_relfiles_total.to_string().as_bytes()),
|
||||
Some(
|
||||
&result
|
||||
|
||||
@@ -55,20 +55,6 @@ pub trait Repository: Send + Sync {
|
||||
///
|
||||
#[derive(Default)]
|
||||
pub struct GcResult {
|
||||
// FIXME: These counters make sense for the ObjectRepository. They are not used
|
||||
// by the LayeredRepository.
|
||||
pub n_relations: u64,
|
||||
pub inspected: u64,
|
||||
pub truncated: u64,
|
||||
pub deleted: u64,
|
||||
pub prep_deleted: u64, // RelishTag::Twophase
|
||||
pub slru_deleted: u64, // RelishTag::Slru
|
||||
pub chkp_deleted: u64, // RelishTag::Checkpoint
|
||||
pub control_deleted: u64, // RelishTag::ControlFile
|
||||
pub filenodemap_deleted: u64, // RelishTag::FileNodeMap
|
||||
pub dropped: u64,
|
||||
|
||||
// These are used for the LayeredRepository instead
|
||||
pub snapshot_relfiles_total: u64,
|
||||
pub snapshot_relfiles_needed_by_cutoff: u64,
|
||||
pub snapshot_relfiles_needed_by_branches: u64,
|
||||
@@ -88,11 +74,6 @@ pub struct GcResult {
|
||||
|
||||
impl AddAssign for GcResult {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
self.n_relations += other.n_relations;
|
||||
self.truncated += other.truncated;
|
||||
self.deleted += other.deleted;
|
||||
self.dropped += other.dropped;
|
||||
|
||||
self.snapshot_relfiles_total += other.snapshot_relfiles_total;
|
||||
self.snapshot_relfiles_needed_by_cutoff += other.snapshot_relfiles_needed_by_cutoff;
|
||||
self.snapshot_relfiles_needed_by_branches += other.snapshot_relfiles_needed_by_branches;
|
||||
@@ -241,8 +222,6 @@ impl WALRecord {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::object_repository::ObjectRepository;
|
||||
use crate::rocksdb_storage::RocksObjectStore;
|
||||
use crate::walredo::{WalRedoError, WalRedoManager};
|
||||
use crate::{PageServerConf, RepositoryFormat};
|
||||
use postgres_ffi::pg_constants;
|
||||
@@ -279,10 +258,7 @@ mod tests {
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
fn get_test_repo(
|
||||
test_name: &str,
|
||||
repository_format: RepositoryFormat,
|
||||
) -> Result<Box<dyn Repository>> {
|
||||
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
|
||||
let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
@@ -299,7 +275,7 @@ mod tests {
|
||||
pg_distrib_dir: "".into(),
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
repository_format,
|
||||
repository_format: RepositoryFormat::Layered,
|
||||
};
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
@@ -315,35 +291,14 @@ mod tests {
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid,
|
||||
)),
|
||||
RepositoryFormat::RocksDb => {
|
||||
let obj_store = RocksObjectStore::create(conf, &tenantid)?;
|
||||
|
||||
Box::new(ObjectRepository::new(
|
||||
conf,
|
||||
Arc::new(obj_store),
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid,
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
Ok(repo)
|
||||
}
|
||||
|
||||
/// Test get_relsize() and truncation.
|
||||
#[test]
|
||||
fn test_relsize_rocksdb() -> Result<()> {
|
||||
let repo = get_test_repo("test_relsize_rocksdb", RepositoryFormat::RocksDb)?;
|
||||
test_relsize(&*repo)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_relsize_layered() -> Result<()> {
|
||||
let repo = get_test_repo("test_relsize_layered", RepositoryFormat::Layered)?;
|
||||
test_relsize(&*repo)
|
||||
}
|
||||
|
||||
fn test_relsize(repo: &dyn Repository) -> Result<()> {
|
||||
fn test_relsize() -> Result<()> {
|
||||
let repo = get_test_repo("test_relsize")?;
|
||||
// get_timeline() with non-existent timeline id should fail
|
||||
//repo.get_timeline("11223344556677881122334455667788");
|
||||
|
||||
@@ -428,22 +383,9 @@ mod tests {
|
||||
|
||||
/// Test get_relsize() and truncation with a file larger than 1 GB, so that it's
|
||||
/// split into multiple 1 GB segments in Postgres.
|
||||
///
|
||||
/// This isn't very interesting with the RocksDb implementation, as we don't pay
|
||||
/// any attention to Postgres segment boundaries there.
|
||||
#[test]
|
||||
fn test_large_rel_rocksdb() -> Result<()> {
|
||||
let repo = get_test_repo("test_large_rel_rocksdb", RepositoryFormat::RocksDb)?;
|
||||
test_large_rel(&*repo)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_large_rel_layered() -> Result<()> {
|
||||
let repo = get_test_repo("test_large_rel_layered", RepositoryFormat::Layered)?;
|
||||
test_large_rel(&*repo)
|
||||
}
|
||||
|
||||
fn test_large_rel(repo: &dyn Repository) -> Result<()> {
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let repo = get_test_repo("test_large_rel")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
@@ -498,22 +440,12 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_branch_rocksdb() -> Result<()> {
|
||||
let repo = get_test_repo("test_branch_rocksdb", RepositoryFormat::RocksDb)?;
|
||||
test_branch(&*repo)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_branch_layered() -> Result<()> {
|
||||
let repo = get_test_repo("test_branch_layered", RepositoryFormat::Layered)?;
|
||||
test_branch(&*repo)
|
||||
}
|
||||
|
||||
///
|
||||
/// Test branch creation
|
||||
///
|
||||
fn test_branch(repo: &dyn Repository) -> Result<()> {
|
||||
#[test]
|
||||
fn test_branch() -> Result<()> {
|
||||
let repo = get_test_repo("test_branch")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
|
||||
@@ -1,475 +0,0 @@
|
||||
//!
|
||||
//! An implementation of the ObjectStore interface, backed by RocksDB
|
||||
//!
|
||||
use crate::object_key::*;
|
||||
use crate::object_store::ObjectStore;
|
||||
use crate::relish::*;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{bail, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
struct StorageKey {
|
||||
obj_key: ObjectKey,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl StorageKey {
|
||||
/// The first key for a given timeline
|
||||
fn timeline_start(timeline: ZTimelineId) -> Self {
|
||||
Self {
|
||||
obj_key: ObjectKey {
|
||||
timeline,
|
||||
tag: ObjectTag::TimelineMetadataTag,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// RocksDB very inefficiently delete random record. Instead of it we have to use merge
|
||||
/// filter, which allows to throw away records at LSM merge phase.
|
||||
/// Unfortunately, it is hard (if ever possible) to determine whether version can be removed
|
||||
/// at merge time. Version ca be removed if:
|
||||
/// 1. It is above PITR horizon (we need to get current LSN and gc_horizon from config)
|
||||
/// 2. Page is reconstructed at horizon (all WAL records above horizon are applied and can be removed)
|
||||
///
|
||||
/// So we have GC process which reconstructs pages at horizon and mark deteriorated WAL record
|
||||
/// for deletion. To mark object for deletion we can either set some flag in object itself.
|
||||
/// But it is complicated with new object value format, because RocksDB storage knows nothing about
|
||||
/// this format. Also updating whole record just to set one bit seems to be inefficient in any case.
|
||||
/// This is why we keep keys of marked for deletion versions in HashSet in memory.
|
||||
/// When LSM merge filter found key in this map, it removes it from the set preventing memory overflow.
|
||||
///
|
||||
struct GarbageCollector {
|
||||
garbage: Mutex<HashSet<Vec<u8>>>,
|
||||
}
|
||||
|
||||
impl GarbageCollector {
|
||||
fn new() -> GarbageCollector {
|
||||
GarbageCollector {
|
||||
garbage: Mutex::new(HashSet::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Called by GC to mark version as delete
|
||||
fn mark_for_deletion(&self, key: &[u8]) {
|
||||
let mut garbage = self.garbage.lock().unwrap();
|
||||
garbage.insert(key.to_vec());
|
||||
}
|
||||
|
||||
/// Called by LSM merge filter. If it finds key in the set, then
|
||||
/// it doesn't merge it and removes from this set.
|
||||
fn was_deleted(&self, key: &[u8]) -> bool {
|
||||
let key = key.to_vec();
|
||||
let mut garbage = self.garbage.lock().unwrap();
|
||||
garbage.remove(&key)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RocksObjectStore {
|
||||
_conf: &'static PageServerConf,
|
||||
|
||||
// RocksDB handle
|
||||
db: rocksdb::DB,
|
||||
gc: Arc<GarbageCollector>,
|
||||
}
|
||||
|
||||
impl ObjectStore for RocksObjectStore {
|
||||
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>> {
|
||||
let val = self.db.get(StorageKey::ser(&StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
})?)?;
|
||||
if let Some(val) = val {
|
||||
Ok(val)
|
||||
} else {
|
||||
bail!("could not find page {:?}", key);
|
||||
}
|
||||
}
|
||||
|
||||
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
|
||||
let mut iter = self.db.raw_iterator();
|
||||
let search_key = StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
iter.seek(search_key.ser()?);
|
||||
if !iter.valid() {
|
||||
Ok(None)
|
||||
} else {
|
||||
let key = StorageKey::des(iter.key().unwrap())?;
|
||||
Ok(Some(key.obj_key.clone()))
|
||||
}
|
||||
}
|
||||
|
||||
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
|
||||
self.db.put(
|
||||
StorageKey::ser(&StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
})?,
|
||||
value,
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()> {
|
||||
self.gc.mark_for_deletion(&StorageKey::ser(&StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
})?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Iterate through page versions of given page, starting from the given LSN.
|
||||
/// The versions are walked in descending LSN order.
|
||||
fn object_versions<'a>(
|
||||
&'a self,
|
||||
key: &ObjectKey,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>> {
|
||||
let iter = RocksObjectVersionIter::new(&self.db, key, lsn)?;
|
||||
Ok(Box::new(iter))
|
||||
}
|
||||
|
||||
/// Iterate through all timeline objects
|
||||
fn list_objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>> {
|
||||
let iter = RocksObjectIter::new(&self.db, timeline, lsn)?;
|
||||
Ok(Box::new(iter))
|
||||
}
|
||||
|
||||
/// Get a list of all distinct relations in given tablespace and database.
|
||||
///
|
||||
/// TODO: This implementation is very inefficient, it scans
|
||||
/// through all entries in the given database. In practice, this
|
||||
/// is used for CREATE DATABASE, and usually the template database is small.
|
||||
/// But if it's not, this will be slow.
|
||||
fn list_rels(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<HashSet<RelTag>> {
|
||||
// FIXME: This scans everything. Very slow
|
||||
|
||||
let mut rels: HashSet<RelTag> = HashSet::new();
|
||||
|
||||
let mut search_rel_tag = RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode: 0,
|
||||
forknum: 0u8,
|
||||
};
|
||||
let mut iter = self.db.raw_iterator();
|
||||
loop {
|
||||
let search_key = StorageKey {
|
||||
obj_key: ObjectKey {
|
||||
timeline: timelineid,
|
||||
tag: ObjectTag::RelationMetadata(RelishTag::Relation(search_rel_tag)),
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
iter.seek(search_key.ser()?);
|
||||
if !iter.valid() {
|
||||
break;
|
||||
}
|
||||
let key = StorageKey::des(iter.key().unwrap())?;
|
||||
|
||||
if let ObjectTag::RelationMetadata(RelishTag::Relation(rel_tag)) = key.obj_key.tag {
|
||||
if spcnode != 0 && rel_tag.spcnode != spcnode
|
||||
|| dbnode != 0 && rel_tag.dbnode != dbnode
|
||||
{
|
||||
break;
|
||||
}
|
||||
if key.lsn <= lsn {
|
||||
// visible in this snapshot
|
||||
rels.insert(rel_tag);
|
||||
}
|
||||
search_rel_tag = rel_tag;
|
||||
// skip to next relation
|
||||
// FIXME: What if relnode is u32::MAX ?
|
||||
search_rel_tag.relnode += 1;
|
||||
} else {
|
||||
// no more relation metadata entries
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
/// Get a list of all distinct NON-relations in timeline
|
||||
/// that are visible at given lsn.
|
||||
///
|
||||
/// TODO: This implementation is very inefficient, it scans
|
||||
/// through all non-rel page versions in the system. In practice, this
|
||||
/// is used when initializing a new compute node, and the non-rel files
|
||||
/// are never very large nor change very frequently, so this will do for now.
|
||||
fn list_nonrels(&self, timelineid: ZTimelineId, lsn: Lsn) -> Result<HashSet<RelishTag>> {
|
||||
let mut rels: HashSet<RelishTag> = HashSet::new();
|
||||
|
||||
let search_key = StorageKey {
|
||||
obj_key: ObjectKey {
|
||||
timeline: timelineid,
|
||||
tag: ObjectTag::Buffer(FIRST_NONREL_RELISH_TAG, 0),
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek(search_key.ser()?);
|
||||
while iter.valid() {
|
||||
let key = StorageKey::des(iter.key().unwrap())?;
|
||||
|
||||
if key.obj_key.timeline != timelineid {
|
||||
// reached end of this timeline in the store
|
||||
break;
|
||||
}
|
||||
|
||||
if let ObjectTag::Buffer(rel_tag, _blknum) = key.obj_key.tag {
|
||||
if key.lsn <= lsn {
|
||||
// visible in this snapshot
|
||||
rels.insert(rel_tag);
|
||||
}
|
||||
}
|
||||
// TODO: we could skip to next relation here like we do in list_rels(),
|
||||
// but hopefully there are not that many SLRU segments or other non-rel
|
||||
// entries for it to matter.
|
||||
iter.next();
|
||||
}
|
||||
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
/// Iterate through versions of all objects in a timeline.
|
||||
///
|
||||
/// Returns objects in increasing key-version order.
|
||||
/// Returns all versions up to and including the specified LSN.
|
||||
fn objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>> {
|
||||
let start_key = StorageKey::timeline_start(timeline);
|
||||
let start_key_bytes = StorageKey::ser(&start_key)?;
|
||||
let iter = self.db.iterator(rocksdb::IteratorMode::From(
|
||||
&start_key_bytes,
|
||||
rocksdb::Direction::Forward,
|
||||
));
|
||||
|
||||
Ok(Box::new(RocksObjects {
|
||||
iter,
|
||||
timeline,
|
||||
lsn,
|
||||
}))
|
||||
}
|
||||
|
||||
fn compact(&self) {
|
||||
self.db.compact_range::<&[u8], &[u8]>(None, None);
|
||||
}
|
||||
}
|
||||
|
||||
impl RocksObjectStore {
|
||||
/// Open a RocksDB database.
|
||||
pub fn open(conf: &'static PageServerConf, tenantid: &ZTenantId) -> Result<RocksObjectStore> {
|
||||
let opts = Self::get_rocksdb_opts();
|
||||
let obj_store = Self::new(conf, opts, tenantid)?;
|
||||
Ok(obj_store)
|
||||
}
|
||||
|
||||
/// Create a new, empty RocksDB database.
|
||||
pub fn create(conf: &'static PageServerConf, tenantid: &ZTenantId) -> Result<RocksObjectStore> {
|
||||
let path = conf.tenant_path(&tenantid).join("rocksdb-storage");
|
||||
std::fs::create_dir(&path)?;
|
||||
|
||||
let mut opts = Self::get_rocksdb_opts();
|
||||
opts.create_if_missing(true);
|
||||
opts.set_error_if_exists(true);
|
||||
let obj_store = Self::new(conf, opts, tenantid)?;
|
||||
Ok(obj_store)
|
||||
}
|
||||
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
mut opts: rocksdb::Options,
|
||||
tenantid: &ZTenantId,
|
||||
) -> Result<RocksObjectStore> {
|
||||
let path = conf.tenant_path(&tenantid).join("rocksdb-storage");
|
||||
let gc = Arc::new(GarbageCollector::new());
|
||||
let gc_ref = gc.clone();
|
||||
opts.set_compaction_filter("ttl", move |_level: u32, key: &[u8], _val: &[u8]| {
|
||||
if gc_ref.was_deleted(key) {
|
||||
rocksdb::compaction_filter::Decision::Remove
|
||||
} else {
|
||||
rocksdb::compaction_filter::Decision::Keep
|
||||
}
|
||||
});
|
||||
let db = rocksdb::DB::open(&opts, &path)?;
|
||||
let obj_store = RocksObjectStore {
|
||||
_conf: conf,
|
||||
db,
|
||||
gc,
|
||||
};
|
||||
Ok(obj_store)
|
||||
}
|
||||
|
||||
/// common options used by `open` and `create`
|
||||
fn get_rocksdb_opts() -> rocksdb::Options {
|
||||
let mut opts = rocksdb::Options::default();
|
||||
opts.set_use_fsync(true);
|
||||
opts.set_compression_type(rocksdb::DBCompressionType::Lz4);
|
||||
opts
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator for `object_versions`. Returns all page versions of a given block, in
|
||||
/// reverse LSN order.
|
||||
///
|
||||
struct RocksObjectVersionIter<'a> {
|
||||
obj_key: ObjectKey,
|
||||
dbiter: rocksdb::DBRawIterator<'a>,
|
||||
first_call: bool,
|
||||
}
|
||||
impl<'a> RocksObjectVersionIter<'a> {
|
||||
fn new(
|
||||
db: &'a rocksdb::DB,
|
||||
obj_key: &ObjectKey,
|
||||
lsn: Lsn,
|
||||
) -> Result<RocksObjectVersionIter<'a>> {
|
||||
let key = StorageKey {
|
||||
obj_key: obj_key.clone(),
|
||||
lsn,
|
||||
};
|
||||
let mut dbiter = db.raw_iterator();
|
||||
dbiter.seek_for_prev(StorageKey::ser(&key)?); // locate last entry
|
||||
Ok(RocksObjectVersionIter {
|
||||
first_call: true,
|
||||
obj_key: obj_key.clone(),
|
||||
dbiter,
|
||||
})
|
||||
}
|
||||
}
|
||||
impl<'a> Iterator for RocksObjectVersionIter<'a> {
|
||||
type Item = (Lsn, Vec<u8>);
|
||||
|
||||
fn next(&mut self) -> std::option::Option<Self::Item> {
|
||||
if self.first_call {
|
||||
self.first_call = false;
|
||||
} else {
|
||||
self.dbiter.prev(); // walk backwards
|
||||
}
|
||||
|
||||
if !self.dbiter.valid() {
|
||||
return None;
|
||||
}
|
||||
let key = StorageKey::des(self.dbiter.key().unwrap()).unwrap();
|
||||
if key.obj_key.tag != self.obj_key.tag {
|
||||
return None;
|
||||
}
|
||||
let val = self.dbiter.value().unwrap();
|
||||
let result = val.to_vec();
|
||||
|
||||
Some((key.lsn, result))
|
||||
}
|
||||
}
|
||||
|
||||
struct RocksObjects<'r> {
|
||||
iter: rocksdb::DBIterator<'r>,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl<'r> Iterator for RocksObjects<'r> {
|
||||
// TODO consider returning Box<[u8]>
|
||||
type Item = Result<(ObjectTag, Lsn, Vec<u8>)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.next_result().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'r> RocksObjects<'r> {
|
||||
fn next_result(&mut self) -> Result<Option<(ObjectTag, Lsn, Vec<u8>)>> {
|
||||
for (key_bytes, v) in &mut self.iter {
|
||||
let key = StorageKey::des(&key_bytes)?;
|
||||
|
||||
if key.obj_key.timeline != self.timeline {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if key.lsn > self.lsn {
|
||||
// TODO can speed up by seeking iterator
|
||||
continue;
|
||||
}
|
||||
|
||||
return Ok(Some((key.obj_key.tag, key.lsn, v.to_vec())));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator for `list_objects`. Returns all objects preceeding specified LSN
|
||||
///
|
||||
struct RocksObjectIter<'a> {
|
||||
timeline: ZTimelineId,
|
||||
key: StorageKey,
|
||||
lsn: Lsn,
|
||||
dbiter: rocksdb::DBRawIterator<'a>,
|
||||
}
|
||||
impl<'a> RocksObjectIter<'a> {
|
||||
fn new(db: &'a rocksdb::DB, timeline: ZTimelineId, lsn: Lsn) -> Result<RocksObjectIter<'a>> {
|
||||
let key = StorageKey {
|
||||
obj_key: ObjectKey {
|
||||
timeline,
|
||||
tag: ObjectTag::FirstTag,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let dbiter = db.raw_iterator();
|
||||
Ok(RocksObjectIter {
|
||||
key,
|
||||
timeline,
|
||||
lsn,
|
||||
dbiter,
|
||||
})
|
||||
}
|
||||
}
|
||||
impl<'a> Iterator for RocksObjectIter<'a> {
|
||||
type Item = ObjectTag;
|
||||
|
||||
fn next(&mut self) -> std::option::Option<Self::Item> {
|
||||
loop {
|
||||
self.dbiter.seek(StorageKey::ser(&self.key).unwrap());
|
||||
if !self.dbiter.valid() {
|
||||
return None;
|
||||
}
|
||||
let key = StorageKey::des(self.dbiter.key().unwrap()).unwrap();
|
||||
if key.obj_key.timeline != self.timeline {
|
||||
// End of this timeline
|
||||
return None;
|
||||
}
|
||||
self.key = key.clone();
|
||||
self.key.lsn = Lsn(u64::MAX); // next seek should skip all versions
|
||||
if key.lsn <= self.lsn {
|
||||
// visible in this snapshot
|
||||
return Some(key.obj_key.tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -8,7 +8,7 @@ use crate::page_cache;
|
||||
use crate::relish::*;
|
||||
use crate::restore_local_repo;
|
||||
use crate::waldecoder::*;
|
||||
use crate::{PageServerConf, RepositoryFormat};
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{Error, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
@@ -264,12 +264,6 @@ fn walreceiver_main(
|
||||
)?;
|
||||
|
||||
if newest_segno - oldest_segno >= 10 {
|
||||
// FIXME: The layered repository performs checkpointing in a separate thread, so this
|
||||
// isn't needed anymore. Remove 'checkpoint' from the Timeline trait altogether?
|
||||
if conf.repository_format == RepositoryFormat::RocksDb {
|
||||
timeline.checkpoint()?;
|
||||
}
|
||||
|
||||
// TODO: This is where we could remove WAL older than last_rec_lsn.
|
||||
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
|
||||
}
|
||||
|
||||
@@ -1,105 +0,0 @@
|
||||
import pytest
|
||||
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
import psycopg2.extras
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
#
|
||||
# Test Garbage Collection of old page versions.
|
||||
#
|
||||
# This test is pretty tightly coupled with the current implementation of page version storage
|
||||
# and garbage collection in object_repository.rs.
|
||||
#
|
||||
@pytest.mark.skip(reason=""""
|
||||
Current GC test is flaky and overly strict. Since we are migrating to the layered repo format
|
||||
with different GC implementation let's just silence this test for now. This test only
|
||||
works with the RocksDB implementation.
|
||||
""")
|
||||
def test_gc(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
|
||||
zenith_cli.run(["branch", "test_gc", "empty"])
|
||||
pg = postgres.create_start('test_gc')
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
with closing(pageserver.connect()) as psconn:
|
||||
with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:
|
||||
|
||||
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = cur.fetchone()[0]
|
||||
|
||||
# Create a test table
|
||||
cur.execute("CREATE TABLE foo(x integer)")
|
||||
|
||||
# Run GC, to clear out any old page versions left behind in the catalogs by
|
||||
# the CREATE TABLE command. We want to have a clean slate with no garbage
|
||||
# before running the actual tests below, otherwise the counts won't match
|
||||
# what we expect.
|
||||
print("Running GC before test")
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
# remember the number of relations
|
||||
n_relations = row['n_relations']
|
||||
assert n_relations > 0
|
||||
|
||||
# Insert a row. The first insert will also create a metadata entry for the
|
||||
# relation, with size == 1 block. Hence, bump up the expected relation count.
|
||||
n_relations += 1
|
||||
print("Inserting one row and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (1)")
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
assert row['n_relations'] == n_relations
|
||||
assert row['dropped'] == 0
|
||||
assert row['truncated'] == 31
|
||||
assert row['deleted'] == 4
|
||||
|
||||
# Insert two more rows and run GC.
|
||||
print("Inserting two more rows and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (2)")
|
||||
cur.execute("INSERT INTO foo VALUES (3)")
|
||||
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
assert row['n_relations'] == n_relations
|
||||
assert row['dropped'] == 0
|
||||
assert row['truncated'] == 31
|
||||
assert row['deleted'] == 4
|
||||
|
||||
# Insert one more row. It creates one more page version, but doesn't affect the
|
||||
# relation size.
|
||||
print("Inserting one more row")
|
||||
cur.execute("INSERT INTO foo VALUES (3)")
|
||||
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
assert row['n_relations'] == n_relations
|
||||
assert row['dropped'] == 0
|
||||
assert row['truncated'] == 31
|
||||
assert row['deleted'] == 2
|
||||
|
||||
# Run GC again, with no changes in the database. Should not remove anything.
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
assert row['n_relations'] == n_relations
|
||||
assert row['dropped'] == 0
|
||||
assert row['truncated'] == 31
|
||||
assert row['deleted'] == 0
|
||||
|
||||
#
|
||||
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
|
||||
#
|
||||
cur.execute("DROP TABLE foo")
|
||||
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
# Each relation fork is counted separately, hence 3.
|
||||
assert row['dropped'] == 3
|
||||
@@ -67,7 +67,7 @@ fn main() -> Result<()> {
|
||||
.long("repository-format")
|
||||
.takes_value(false)
|
||||
.value_name("repository-format")
|
||||
.help("Choose repository format, 'layered' or 'rocksdb'")
|
||||
.help("Choose repository format, only 'layered' supported currently")
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
|
||||
Reference in New Issue
Block a user