diff --git a/.circleci/config.yml b/.circleci/config.yml index 70e9753153..24d151f765 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -24,6 +24,12 @@ jobs: # A job to build postgres build-postgres: executor: zenith-build-executor + parameters: + build_type: + type: enum + enum: ["debug", "release"] + environment: + BUILD_TYPE: << parameters.build_type >> steps: # Checkout the git repo (circleci doesn't have a flag to enable submodules here) - checkout @@ -39,7 +45,7 @@ jobs: name: Restore postgres cache keys: # Restore ONLY if the rev key matches exactly - - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }} + - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} # FIXME We could cache our own docker container, instead of installing packages every time. - run: @@ -59,12 +65,12 @@ jobs: if [ ! -e tmp_install/bin/postgres ]; then # "depth 1" saves some time by not cloning the whole repo git submodule update --init --depth 1 - make postgres + make postgres -j8 fi - save_cache: name: Save postgres cache - key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }} + key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} paths: - tmp_install @@ -96,7 +102,7 @@ jobs: name: Restore postgres cache keys: # Restore ONLY if the rev key matches exactly - - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }} + - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - restore_cache: name: Restore rust cache @@ -254,7 +260,7 @@ jobs: when: always command: | du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" -delete + find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete du -sh /tmp/test_output/* - store_artifacts: path: /tmp/test_output @@ -328,14 +334,18 @@ workflows: build_and_test: jobs: - check-codestyle - - build-postgres + - build-postgres: + name: build-postgres-<< matrix.build_type >> + matrix: + parameters: + build_type: ["debug", "release"] - build-zenith: name: build-zenith-<< matrix.build_type >> matrix: parameters: build_type: ["debug", "release"] requires: - - build-postgres + - build-postgres-<< matrix.build_type >> - run-pytest: name: pg_regress-tests-<< matrix.build_type >> matrix: diff --git a/.dockerignore b/.dockerignore index ccad1e364c..352336496f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,12 +2,17 @@ **/__pycache__ **/.pytest_cache -/target -/tmp_check -/tmp_install -/tmp_check_cli -/test_output -/.vscode -/.zenith -/integration_tests/.zenith -/Dockerfile +.git +target +tmp_check +tmp_install +tmp_check_cli +test_output +.vscode +.zenith +integration_tests/.zenith +.mypy_cache + +Dockerfile +.dockerignore + diff --git a/Cargo.lock b/Cargo.lock index 6fb4545e50..c217dfbebb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -307,6 +307,26 @@ dependencies = [ "vec_map", ] +[[package]] +name = "const_format" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4556f63e28a78fa5e6f310cfea5647a25636def49a338ab69e33b34a3382057b" +dependencies = [ + "const_format_proc_macros", +] + +[[package]] +name = "const_format_proc_macros" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "552782506c398da94466b364973b563887e0ca078bf33a76d4163736165e3594" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + [[package]] name = "control_plane" version = "0.1.0" @@ -1179,10 +1199,12 @@ dependencies = [ "bytes", "chrono", "clap", + "const_format", "crc32c", "daemonize", "futures", "hex", + "hex-literal", "humantime", "hyper", "lazy_static", @@ -1406,6 +1428,7 @@ dependencies = [ "hex", "md5", "rand", + "reqwest", "rustls", "serde", "serde_json", @@ -2330,6 +2353,7 @@ dependencies = [ "regex", "rust-s3", "serde", + "serde_json", "tokio", "tokio-stream", "walkdir", @@ -2554,6 +2578,7 @@ version = "0.1.0" dependencies = [ "lazy_static", "libc", + "once_cell", "prometheus", ] @@ -2583,6 +2608,7 @@ dependencies = [ "slog-scope", "slog-stdlog", "slog-term", + "tempfile", "thiserror", "tokio", "webpki", diff --git a/Dockerfile b/Dockerfile index 001adfdbf6..b38bac4480 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,39 +10,21 @@ FROM zenithdb/build:buster AS pg-build WORKDIR /zenith COPY ./vendor/postgres vendor/postgres COPY ./Makefile Makefile +ENV BUILD_TYPE release RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres - -# -# Calculate cargo dependencies. -# This will always run, but only generate recipe.json with list of dependencies without -# installing them. -# -FROM zenithdb/build:buster AS cargo-deps-inspect -WORKDIR /zenith -COPY . . -RUN cargo chef prepare --recipe-path /zenith/recipe.json - -# -# Build cargo dependencies. -# This temp cantainner should be rebuilt only if recipe.json was changed. -# -FROM zenithdb/build:buster AS deps-build -WORKDIR /zenith -COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server -COPY --from=cargo-deps-inspect /usr/local/cargo/bin/cargo-chef /usr/local/cargo/bin/ -COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json -RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json +RUN rm -rf postgres_install/build # # Build zenith binaries # +# TODO: build cargo deps as separate layer. We used cargo-chef before but that was +# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work. +# FROM zenithdb/build:buster AS build WORKDIR /zenith -COPY . . -# Copy cached dependencies COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server -COPY --from=deps-build /zenith/target target -COPY --from=deps-build /usr/local/cargo/ /usr/local/cargo/ + +COPY . . RUN cargo build --release # @@ -51,7 +33,7 @@ RUN cargo build --release FROM debian:buster-slim WORKDIR /data -RUN apt-get update && apt-get -yq install librocksdb-dev libseccomp-dev openssl && \ +RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \ mkdir zenith_install COPY --from=build /zenith/target/release/pageserver /usr/local/bin diff --git a/Dockerfile.build b/Dockerfile.build index 92b2c21ffd..066bf2226e 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -9,7 +9,7 @@ WORKDIR /zenith # Install postgres and zenith build dependencies # clang is for rocksdb RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libseccomp-dev pkg-config libssl-dev librocksdb-dev clang + libseccomp-dev pkg-config libssl-dev clang # Install rust tools -RUN rustup component add clippy && cargo install cargo-chef cargo-audit +RUN rustup component add clippy && cargo install cargo-audit diff --git a/Makefile b/Makefile index 7daf3697c7..2edf2a6b4a 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,18 @@ else SECCOMP = endif +# +# We differentiate between release / debug build types using the BUILD_TYPE +# environment variable. +# +ifeq ($(BUILD_TYPE),release) + PG_CONFIGURE_OPTS = --enable-debug + PG_CFLAGS = -O2 -g3 ${CFLAGS} +else + PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend + PG_CFLAGS = -O0 -g3 ${CFLAGS} +endif + # # Top level Makefile to build Zenith and PostgreSQL # @@ -30,10 +42,8 @@ tmp_install/build/config.status: +@echo "Configuring postgres build" mkdir -p tmp_install/build (cd tmp_install/build && \ - ../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \ - --enable-cassert \ - --enable-debug \ - --enable-depend \ + ../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \ + $(PG_CONFIGURE_OPTS) \ $(SECCOMP) \ --prefix=$(abspath tmp_install) > configure.log) diff --git a/README.md b/README.md index 5d0afca5cc..1e0f20fd45 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Pageserver consists of: On Ubuntu or Debian this set of packages should be sufficient to build the code: ```text apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ -libssl-dev clang +libssl-dev clang pkg-config libpq-dev ``` [Rust] 1.52 or later is also required. @@ -108,6 +108,13 @@ postgres=# insert into t values(2,2); INSERT 0 1 ``` +6. If you want to run tests afterwards (see below), you have to stop pageserver and all postgres instances you have just started: +```sh +> ./target/debug/zenith pg stop migration_check +> ./target/debug/zenith pg stop main +> ./target/debug/zenith stop +``` + ## Running tests ```sh diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 18616752f1..5b4313494b 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -1,17 +1,16 @@ -use std::fs::{self, File, OpenOptions}; +use std::collections::BTreeMap; +use std::fs::{self, File}; use std::io::Write; use std::net::SocketAddr; use std::net::TcpStream; use std::os::unix::fs::PermissionsExt; +use std::path::PathBuf; use std::process::{Command, Stdio}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use std::{collections::BTreeMap, path::PathBuf}; use anyhow::{Context, Result}; -use lazy_static::lazy_static; -use regex::Regex; use zenith_utils::connstring::connection_host_port; use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; @@ -19,6 +18,7 @@ use zenith_utils::zid::ZTenantId; use zenith_utils::zid::ZTimelineId; use crate::local_env::LocalEnv; +use crate::postgresql_conf::PostgresConf; use crate::storage::PageServerNode; // @@ -144,76 +144,25 @@ impl PostgresNode { ); } - lazy_static! { - static ref CONF_PORT_RE: Regex = Regex::new(r"(?m)^\s*port\s*=\s*(\d+)\s*$").unwrap(); - static ref CONF_TIMELINE_RE: Regex = - Regex::new(r"(?m)^\s*zenith.zenith_timeline\s*=\s*'(\w+)'\s*$").unwrap(); - static ref CONF_TENANT_RE: Regex = - Regex::new(r"(?m)^\s*zenith.zenith_tenant\s*=\s*'(\w+)'\s*$").unwrap(); - } - // parse data directory name let fname = entry.file_name(); let name = fname.to_str().unwrap().to_string(); - // find out tcp port in config file + // Read config file into memory let cfg_path = entry.path().join("postgresql.conf"); - let config = fs::read_to_string(cfg_path.clone()).with_context(|| { - format!( - "failed to read config file in {}", - cfg_path.to_str().unwrap() - ) - })?; + let cfg_path_str = cfg_path.to_string_lossy(); + let mut conf_file = File::open(&cfg_path) + .with_context(|| format!("failed to open config file in {}", cfg_path_str))?; + let conf = PostgresConf::read(&mut conf_file) + .with_context(|| format!("failed to read config file in {}", cfg_path_str))?; - // parse port - let err_msg = format!( - "failed to find port definition in config file {}", - cfg_path.to_str().unwrap() - ); - let port: u16 = CONF_PORT_RE - .captures(config.as_str()) - .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))? - .iter() - .last() - .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))? - .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))? - .as_str() - .parse() - .with_context(|| err_msg)?; + // Read a few options from the config file + let context = format!("in config file {}", cfg_path_str); + let port: u16 = conf.parse_field("port", &context)?; + let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?; + let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?; - // parse timeline - let err_msg = format!( - "failed to find timeline definition in config file {}", - cfg_path.to_str().unwrap() - ); - let timelineid: ZTimelineId = CONF_TIMELINE_RE - .captures(config.as_str()) - .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))? - .iter() - .last() - .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))? - .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))? - .as_str() - .parse() - .with_context(|| err_msg)?; - - // parse tenant - let err_msg = format!( - "failed to find tenant definition in config file {}", - cfg_path.to_str().unwrap() - ); - let tenantid = CONF_TENANT_RE - .captures(config.as_str()) - .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))? - .iter() - .last() - .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))? - .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))? - .as_str() - .parse() - .with_context(|| err_msg)?; - - let uses_wal_proposer = config.contains("wal_acceptors"); + let uses_wal_proposer = conf.get("wal_acceptors").is_some(); // ok now Ok(PostgresNode { @@ -308,72 +257,58 @@ impl PostgresNode { // Connect to a page server, get base backup, and untar it to initialize a // new data directory fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> { - File::create(self.pgdata().join("postgresql.conf").to_str().unwrap())?; - + let mut conf = PostgresConf::new(); + conf.append("max_wal_senders", "10"); // wal_log_hints is mandatory when running against pageserver (see gh issue#192) // TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE? - self.append_conf( - "postgresql.conf", - &format!( - "max_wal_senders = 10\n\ - wal_log_hints = on\n\ - max_replication_slots = 10\n\ - hot_standby = on\n\ - shared_buffers = 1MB\n\ - fsync = off\n\ - max_connections = 100\n\ - wal_sender_timeout = 10s\n\ - wal_level = replica\n\ - listen_addresses = '{address}'\n\ - port = {port}\n", - address = self.address.ip(), - port = self.address.port() - ), - )?; + conf.append("wal_log_hints", "on"); + conf.append("max_replication_slots", "10"); + conf.append("hot_standby", "on"); + conf.append("shared_buffers", "1MB"); + conf.append("fsync", "off"); + conf.append("max_connections", "100"); + conf.append("wal_sender_timeout", "10s"); + conf.append("wal_level", "replica"); + conf.append("listen_addresses", &self.address.ip().to_string()); + conf.append("port", &self.address.port().to_string()); // Never clean up old WAL. TODO: We should use a replication // slot or something proper, to prevent the compute node // from removing WAL that hasn't been streamed to the safekeeper or // page server yet. (gh issue #349) - self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n")?; + conf.append("wal_keep_size", "10TB"); - // set up authentication - let password = if let AuthType::ZenithJWT = auth_type { - "$ZENITH_AUTH_TOKEN" - } else { - "" + // Configure the node to fetch pages from pageserver + let pageserver_connstr = { + let (host, port) = connection_host_port(&self.pageserver.pg_connection_config); + + // Set up authentication + // + // $ZENITH_AUTH_TOKEN will be replaced with value from environment + // variable during compute pg startup. It is done this way because + // otherwise user will be able to retrieve the value using SHOW + // command or pg_settings + let password = if let AuthType::ZenithJWT = auth_type { + "$ZENITH_AUTH_TOKEN" + } else { + "" + }; + + format!("host={} port={} password={}", host, port, password) }; - - // Configure that node to take pages from pageserver - let (host, port) = connection_host_port(&self.pageserver.pg_connection_config); - self.append_conf( - "postgresql.conf", - format!( - concat!( - "shared_preload_libraries = zenith\n", - // $ZENITH_AUTH_TOKEN will be replaced with value from environment variable during compute pg startup - // it is done this way because otherwise user will be able to retrieve the value using SHOW command or pg_settings - "zenith.page_server_connstring = 'host={} port={} password={}'\n", - "zenith.zenith_timeline='{}'\n", - "zenith.zenith_tenant='{}'\n", - ), - host, port, password, self.timelineid, self.tenantid, - ) - .as_str(), - )?; + conf.append("shared_preload_libraries", "zenith"); + conf.append_line(""); + conf.append("zenith.page_server_connstring", &pageserver_connstr); + conf.append("zenith.zenith_tenant", &self.tenantid.to_string()); + conf.append("zenith.zenith_timeline", &self.timelineid.to_string()); + conf.append_line(""); // Configure the node to stream WAL directly to the pageserver - self.append_conf( - "postgresql.conf", - format!( - concat!( - "synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg? - "zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping - ), - self.connstr(), - ) - .as_str(), - )?; + conf.append("synchronous_standby_names", "pageserver"); // TODO: add a new function arg? + conf.append("zenith.callmemaybe_connstring", &self.connstr()); + + let mut file = File::create(self.pgdata().join("postgresql.conf"))?; + file.write_all(conf.to_string().as_bytes())?; Ok(()) } @@ -416,14 +351,6 @@ impl PostgresNode { } } - pub fn append_conf(&self, config: &str, opts: &str) -> Result<()> { - OpenOptions::new() - .append(true) - .open(self.pgdata().join(config).to_str().unwrap())? - .write_all(opts.as_bytes())?; - Ok(()) - } - fn pg_ctl(&self, args: &[&str], auth_token: &Option) -> Result<()> { let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl"); let mut cmd = Command::new(pg_ctl_path); @@ -525,9 +452,7 @@ impl PostgresNode { .output() .expect("failed to execute whoami"); - if !output.status.success() { - panic!("whoami failed"); - } + assert!(output.status.success(), "whoami failed"); String::from_utf8(output.stdout).unwrap().trim().to_string() } diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index 6a14c0e77c..287f30be63 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -12,6 +12,7 @@ use std::path::Path; pub mod compute; pub mod local_env; +pub mod postgresql_conf; pub mod storage; /// Read a PID file diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs new file mode 100644 index 0000000000..bcd463999b --- /dev/null +++ b/control_plane/src/postgresql_conf.rs @@ -0,0 +1,212 @@ +/// +/// Module for parsing postgresql.conf file. +/// +/// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just +/// enough to extract a few settings we need in Zenith, assuming you don't do +/// funny stuff like include-directives or funny escaping. +use anyhow::{anyhow, bail, Context, Result}; +use lazy_static::lazy_static; +use regex::Regex; +use std::collections::HashMap; +use std::fmt; +use std::io::BufRead; +use std::str::FromStr; + +/// In-memory representation of a postgresql.conf file +#[derive(Default)] +pub struct PostgresConf { + lines: Vec, + hash: HashMap, +} + +lazy_static! { + static ref CONF_LINE_RE: Regex = Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap(); +} + +impl PostgresConf { + pub fn new() -> PostgresConf { + PostgresConf::default() + } + + /// Read file into memory + pub fn read(read: impl std::io::Read) -> Result { + let mut result = Self::new(); + + for line in std::io::BufReader::new(read).lines() { + let line = line?; + + // Store each line in a vector, in original format + result.lines.push(line.clone()); + + // Also parse each line and insert key=value lines into a hash map. + // + // FIXME: This doesn't match exactly the flex/bison grammar in PostgreSQL. + // But it's close enough for our usage. + let line = line.trim(); + if line.starts_with('#') { + // comment, ignore + continue; + } else if let Some(caps) = CONF_LINE_RE.captures(line) { + let name = caps.get(1).unwrap().as_str(); + let raw_val = caps.get(2).unwrap().as_str(); + + if let Ok(val) = deescape_str(raw_val) { + // Note: if there's already an entry in the hash map for + // this key, this will replace it. That's the behavior what + // we want; when PostgreSQL reads the file, each line + // overrides any previous value for the same setting. + result.hash.insert(name.to_string(), val.to_string()); + } + } + } + Ok(result) + } + + /// Return the current value of 'option' + pub fn get(&self, option: &str) -> Option<&str> { + self.hash.get(option).map(|x| x.as_ref()) + } + + /// Return the current value of a field, parsed to the right datatype. + /// + /// This calls the FromStr::parse() function on the value of the field. If + /// the field does not exist, or parsing fails, returns an error. + /// + pub fn parse_field(&self, field_name: &str, context: &str) -> Result + where + T: FromStr, + ::Err: std::error::Error + Send + Sync + 'static, + { + self.get(field_name) + .ok_or_else(|| anyhow!("could not find '{}' option {}", field_name, context))? + .parse::() + .with_context(|| format!("could not parse '{}' option {}", field_name, context)) + } + + /// + /// Note: if you call this multiple times for the same option, the config + /// file will a line for each call. It would be nice to have a function + /// to change an existing line, but that's a TODO. + /// + pub fn append(&mut self, option: &str, value: &str) { + self.lines + .push(format!("{}={}\n", option, escape_str(value))); + self.hash.insert(option.to_string(), value.to_string()); + } + + /// Append an arbitrary non-setting line to the config file + pub fn append_line(&mut self, line: &str) { + self.lines.push(line.to_string()); + } +} + +impl fmt::Display for PostgresConf { + /// Return the whole configuration file as a string + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for line in self.lines.iter() { + f.write_str(line)?; + } + Ok(()) + } +} + +/// Escape a value for putting in postgresql.conf. +fn escape_str(s: &str) -> String { + // If the string doesn't contain anything that needs quoting or escaping, return it + // as it is. + // + // The first part of the regex, before the '|', matches the INTEGER rule in the + // PostgreSQL flex grammar (guc-file.l). It matches plain integers like "123" and + // "-123", and also accepts units like "10MB". The second part of the regex matches + // the UNQUOTED_STRING rule, and accepts strings that contain a single word, beginning + // with a letter. That covers words like "off" or "posix". Everything else is quoted. + // + // This regex is a bit more conservative than the rules in guc-file.l, so we quote some + // strings that PostgreSQL would accept without quoting, but that's OK. + lazy_static! { + static ref UNQUOTED_RE: Regex = + Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap(); + } + if UNQUOTED_RE.is_match(s) { + s.to_string() + } else { + // Otherwise escape and quote it + let s = s + .replace('\\', "\\\\") + .replace('\n', "\\n") + .replace('\'', "''"); + + "\'".to_owned() + &s + "\'" + } +} + +/// De-escape a possibly-quoted value. +/// +/// See `DeescapeQuotedString` function in PostgreSQL sources for how PostgreSQL +/// does this. +fn deescape_str(s: &str) -> Result { + // If the string has a quote at the beginning and end, strip them out. + if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') { + let mut result = String::new(); + + let mut iter = s[1..(s.len() - 1)].chars().peekable(); + while let Some(c) = iter.next() { + let newc = if c == '\\' { + match iter.next() { + Some('b') => '\x08', + Some('f') => '\x0c', + Some('n') => '\n', + Some('r') => '\r', + Some('t') => '\t', + Some('0'..='7') => { + // TODO + bail!("octal escapes not supported"); + } + Some(n) => n, + None => break, + } + } else if c == '\'' && iter.peek() == Some(&'\'') { + // doubled quote becomes just one quote + iter.next().unwrap() + } else { + c + }; + + result.push(newc); + } + Ok(result) + } else { + Ok(s.to_string()) + } +} + +#[test] +fn test_postgresql_conf_escapes() -> Result<()> { + assert_eq!(escape_str("foo bar"), "'foo bar'"); + // these don't need to be quoted + assert_eq!(escape_str("foo"), "foo"); + assert_eq!(escape_str("123"), "123"); + assert_eq!(escape_str("+123"), "+123"); + assert_eq!(escape_str("-10"), "-10"); + assert_eq!(escape_str("1foo"), "1foo"); + assert_eq!(escape_str("foo1"), "foo1"); + assert_eq!(escape_str("10MB"), "10MB"); + assert_eq!(escape_str("-10kB"), "-10kB"); + + // these need quoting and/or escaping + assert_eq!(escape_str("foo bar"), "'foo bar'"); + assert_eq!(escape_str("fo'o"), "'fo''o'"); + assert_eq!(escape_str("fo\no"), "'fo\\no'"); + assert_eq!(escape_str("fo\\o"), "'fo\\\\o'"); + assert_eq!(escape_str("10 cats"), "'10 cats'"); + + // Test de-escaping + assert_eq!(deescape_str(&escape_str("foo"))?, "foo"); + assert_eq!(deescape_str(&escape_str("fo'o\nba\\r"))?, "fo'o\nba\\r"); + assert_eq!(deescape_str("'\\b\\f\\n\\r\\t'")?, "\x08\x0c\n\r\t"); + + // octal-escapes are currently not supported + assert!(deescape_str("'foo\\7\\07\\007'").is_err()); + + Ok(()) +} diff --git a/docs/README.md b/docs/README.md index 19041e06ad..0558fa24a8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -10,5 +10,5 @@ - [pageserver/README](/pageserver/README) — pageserver overview. - [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview. - [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview. -- [walkeeper/README](/walkeeper/README.md) — WAL service overview. +- [walkeeper/README](/walkeeper/README) — WAL service overview. - [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index c92c7d3bc2..0e5a82df88 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -34,8 +34,12 @@ toml = "0.5" scopeguard = "1.1.0" rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] } async-trait = "0.1" +const_format = "0.2.21" postgres_ffi = { path = "../postgres_ffi" } zenith_metrics = { path = "../zenith_metrics" } zenith_utils = { path = "../zenith_utils" } workspace_hack = { path = "../workspace_hack" } + +[dev-dependencies] +hex-literal = "0.3" diff --git a/pageserver/README b/pageserver/README index 7595fb7738..0f858bb4ed 100644 --- a/pageserver/README +++ b/pageserver/README @@ -7,8 +7,9 @@ The Page Server has a few different duties: - Replay WAL that's applicable to the chunks that the Page Server maintains - Backup to S3 - - +S3 is the main fault-tolerant storage of all data, as there are no Page Server +replicas. We use a separate fault-tolerant WAL service to reduce latency. It +keeps track of WAL records which are not syncted to S3 yet. The Page Server consists of multiple threads that operate on a shared repository of page versions: diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 0a3dcf4fdb..c763f98a7f 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -9,22 +9,28 @@ use std::{ env, net::TcpListener, path::{Path, PathBuf}, - process::exit, str::FromStr, thread, }; use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType}; -use anyhow::{bail, ensure, Result}; +use anyhow::{bail, ensure, Context, Result}; use clap::{App, Arg, ArgMatches}; use daemonize::Daemonize; use pageserver::{ - branches, http, page_service, tenant_mgr, PageServerConf, RelishStorageConfig, S3Config, - LOG_FILE_NAME, + branches, + defaults::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS, + }, + http, page_service, relish_storage, tenant_mgr, PageServerConf, RelishStorageConfig, + RelishStorageKind, S3Config, LOG_FILE_NAME, }; use zenith_utils::http::endpoint; +use const_format::formatcp; + /// String arguments that can be declared via CLI or config file #[derive(Serialize, Deserialize)] struct CfgFileParams { @@ -39,6 +45,7 @@ struct CfgFileParams { auth_type: Option, // see https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for enum deserialisation examples relish_storage: Option, + relish_storage_max_concurrent_sync: Option, } #[derive(Serialize, Deserialize, Clone)] @@ -89,6 +96,7 @@ impl CfgFileParams { auth_validation_public_key_path: get_arg("auth-validation-public-key-path"), auth_type: get_arg("auth-type"), relish_storage, + relish_storage_max_concurrent_sync: get_arg("relish-storage-max-concurrent-sync"), } } @@ -108,6 +116,9 @@ impl CfgFileParams { .or(other.auth_validation_public_key_path), auth_type: self.auth_type.or(other.auth_type), relish_storage: self.relish_storage.or(other.relish_storage), + relish_storage_max_concurrent_sync: self + .relish_storage_max_concurrent_sync + .or(other.relish_storage_max_concurrent_sync), } } @@ -176,25 +187,34 @@ impl CfgFileParams { ); } - let relish_storage_config = - self.relish_storage - .as_ref() - .map(|storage_params| match storage_params.clone() { - RelishStorage::Local { local_path } => { - RelishStorageConfig::LocalFs(PathBuf::from(local_path)) - } - RelishStorage::AwsS3 { - bucket_name, - bucket_region, - access_key_id, - secret_access_key, - } => RelishStorageConfig::AwsS3(S3Config { - bucket_name, - bucket_region, - access_key_id, - secret_access_key, - }), - }); + let max_concurrent_sync = match self.relish_storage_max_concurrent_sync.as_deref() { + Some(relish_storage_max_concurrent_sync) => { + relish_storage_max_concurrent_sync.parse()? + } + None => DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS, + }; + let relish_storage_config = self.relish_storage.as_ref().map(|storage_params| { + let storage = match storage_params.clone() { + RelishStorage::Local { local_path } => { + RelishStorageKind::LocalFs(PathBuf::from(local_path)) + } + RelishStorage::AwsS3 { + bucket_name, + bucket_region, + access_key_id, + secret_access_key, + } => RelishStorageKind::AwsS3(S3Config { + bucket_name, + bucket_region, + access_key_id, + secret_access_key, + }), + }; + RelishStorageConfig { + max_concurrent_sync, + storage, + } + }); Ok(PageServerConf { daemonize: false, @@ -220,6 +240,7 @@ impl CfgFileParams { } fn main() -> Result<()> { + zenith_metrics::set_common_metrics_prefix("pageserver"); let arg_matches = App::new("Zenith page server") .about("Materializes WAL stream to pages and serves them to the postgres") .arg( @@ -228,14 +249,14 @@ fn main() -> Result<()> { .long("listen-pg") .alias("listen") // keep some compatibility .takes_value(true) - .help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"), + .help(formatcp!("listen for incoming page requests on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")), ) .arg( Arg::with_name("listen-http") .long("listen-http") .alias("http_endpoint") // keep some compatibility .takes_value(true) - .help("http endpoint address for for metrics and management API calls ip:port (default: 127.0.0.1:5430)"), + .help(formatcp!("http endpoint address for metrics and management API calls on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")), ) .arg( Arg::with_name("daemonize") @@ -343,10 +364,19 @@ fn main() -> Result<()> { .takes_value(true) .help("Credentials to access the AWS S3 bucket"), ) + .arg( + Arg::with_name("relish-storage-max-concurrent-sync") + .long("relish-storage-max-concurrent-sync") + .takes_value(true) + .help("Maximum allowed concurrent synchronisations with storage"), + ) .get_matches(); let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith")); - let cfg_file_path = workdir.canonicalize()?.join("pageserver.toml"); + let cfg_file_path = workdir + .canonicalize() + .with_context(|| format!("Error opening workdir '{}'", workdir.display()))? + .join("pageserver.toml"); let args_params = CfgFileParams::from_args(&arg_matches); @@ -358,22 +388,37 @@ fn main() -> Result<()> { args_params } else { // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)?; - let file_params: CfgFileParams = toml::from_str(&cfg_file_contents)?; + let cfg_file_contents = std::fs::read_to_string(&cfg_file_path) + .with_context(|| format!("No pageserver config at '{}'", cfg_file_path.display()))?; + let file_params: CfgFileParams = toml::from_str(&cfg_file_contents).with_context(|| { + format!( + "Failed to read '{}' as pageserver config", + cfg_file_path.display() + ) + })?; args_params.or(file_params) }; // Set CWD to workdir for non-daemon modes - env::set_current_dir(&workdir)?; + env::set_current_dir(&workdir).with_context(|| { + format!( + "Failed to set application's current dir to '{}'", + workdir.display() + ) + })?; // Ensure the config is valid, even if just init-ing - let mut conf = params.try_into_config()?; + let mut conf = params.try_into_config().with_context(|| { + format!( + "Pageserver config at '{}' is not valid", + cfg_file_path.display() + ) + })?; conf.daemonize = arg_matches.is_present("daemonize"); if init && conf.daemonize { - eprintln!("--daemonize cannot be used with --init"); - exit(1); + bail!("--daemonize cannot be used with --init") } // The configuration is all set up now. Turn it into a 'static @@ -383,16 +428,21 @@ fn main() -> Result<()> { // Create repo and exit if init was requested if init { - branches::init_pageserver(conf, create_tenant)?; + branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?; // write the config file - let cfg_file_contents = toml::to_string_pretty(¶ms)?; + let cfg_file_contents = toml::to_string_pretty(¶ms) + .context("Failed to create pageserver config contents for initialisation")?; // TODO support enable-auth flag - std::fs::write(&cfg_file_path, cfg_file_contents)?; - - return Ok(()); + std::fs::write(&cfg_file_path, cfg_file_contents).with_context(|| { + format!( + "Failed to initialize pageserver config at '{}'", + cfg_file_path.display() + ) + })?; + Ok(()) + } else { + start_pageserver(conf).context("Failed to start pageserver") } - - start_pageserver(conf) } fn start_pageserver(conf: &'static PageServerConf) -> Result<()> { @@ -430,16 +480,20 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> { match daemonize.start() { Ok(_) => info!("Success, daemonized"), - Err(e) => error!("Error, {}", e), + Err(e) => error!("could not daemonize: {:#}", e), } } + // keep join handles for spawned threads + // don't spawn threads before daemonizing + let mut join_handles = Vec::new(); + + if let Some(handle) = relish_storage::run_storage_sync_thread(conf)? { + join_handles.push(handle); + } // Initialize tenant manager. tenant_mgr::init(conf); - // keep join handles for spawned threads - let mut join_handles = vec![]; - // initialize authentication for incoming connections let auth = match &conf.auth_type { AuthType::Trust | AuthType::MD5 => None, diff --git a/pageserver/src/branches.rs b/pageserver/src/branches.rs index 4094757f31..57adf479ca 100644 --- a/pageserver/src/branches.rs +++ b/pageserver/src/branches.rs @@ -17,6 +17,7 @@ use std::{ use zenith_utils::zid::{ZTenantId, ZTimelineId}; use log::*; +use zenith_utils::crashsafe_dir; use zenith_utils::logging; use zenith_utils::lsn::Lsn; @@ -118,7 +119,7 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str println!("initializing tenantid {}", tenantid); create_repo(conf, tenantid, dummy_redo_mgr).with_context(|| "failed to create repo")?; } - fs::create_dir_all(conf.tenants_path())?; + crashsafe_dir::create_dir_all(conf.tenants_path())?; println!("pageserver init succeeded"); Ok(()) @@ -135,12 +136,12 @@ pub fn create_repo( } // top-level dir may exist if we are creating it through CLI - fs::create_dir_all(&repo_dir) + crashsafe_dir::create_dir_all(&repo_dir) .with_context(|| format!("could not create directory {}", repo_dir.display()))?; - fs::create_dir(conf.timelines_path(&tenantid))?; - fs::create_dir_all(conf.branches_path(&tenantid))?; - fs::create_dir_all(conf.tags_path(&tenantid))?; + crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?; + crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?; + crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?; info!("created directory structure in {}", repo_dir.display()); @@ -150,12 +151,13 @@ pub fn create_repo( conf, wal_redo_manager, tenantid, + false, )); // Load data into pageserver // TODO To implement zenith import we need to // move data loading out of create_repo() - bootstrap_timeline(conf, tenantid, tli, &*repo)?; + bootstrap_timeline(conf, tenantid, tli, repo.as_ref())?; Ok(repo) } @@ -221,11 +223,11 @@ fn bootstrap_timeline( // Import the contents of the data directory at the initial checkpoint // LSN, and any WAL after that. let timeline = repo.create_empty_timeline(tli)?; - restore_local_repo::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; - - let wal_dir = pgdata_path.join("pg_wal"); - restore_local_repo::import_timeline_wal(&wal_dir, &*timeline, lsn)?; - + restore_local_repo::import_timeline_from_postgres_datadir( + &pgdata_path, + timeline.as_ref(), + lsn, + )?; timeline.checkpoint()?; println!( @@ -417,7 +419,6 @@ fn create_timeline( let timelinedir = conf.timeline_path(&timelineid, tenantid); fs::create_dir(&timelinedir)?; - fs::create_dir(&timelinedir.join("wal"))?; if let Some(ancestor) = ancestor { let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn); diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 96a9f2c8ba..1a7216c1c0 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -11,7 +11,7 @@ //! parent timeline, and the last LSN that has been written to disk. //! -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{anyhow, bail, ensure, Context, Result}; use bookfile::Book; use bytes::Bytes; use lazy_static::lazy_static; @@ -22,27 +22,31 @@ use serde::{Deserialize, Serialize}; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::collections::{BTreeSet, HashSet}; -use std::fs::File; +use std::convert::TryInto; +use std::fs; +use std::fs::{File, OpenOptions}; use std::io::Write; use std::ops::Bound::Included; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, MutexGuard}; use std::time::{Duration, Instant}; -use std::{fs, thread}; use crate::layered_repository::inmemory_layer::FreezeLayers; use crate::relish::*; -use crate::relish_storage::storage_uploader::QueueBasedRelishUploader; +use crate::relish_storage::schedule_timeline_upload; use crate::repository::{GcResult, Repository, Timeline, WALRecord}; -use crate::restore_local_repo::import_timeline_wal; +use crate::walreceiver::IS_WAL_RECEIVER; use crate::walredo::WalRedoManager; use crate::PageServerConf; use crate::{ZTenantId, ZTimelineId}; -use zenith_metrics::{register_histogram, register_int_gauge_vec, Histogram, IntGaugeVec}; +use zenith_metrics::{ + register_histogram, register_int_gauge_vec, Histogram, IntGauge, IntGaugeVec, +}; use zenith_metrics::{register_histogram_vec, HistogramVec}; use zenith_utils::bin_ser::BeSer; +use zenith_utils::crashsafe_dir; use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; use zenith_utils::seqwait::SeqWait; @@ -51,6 +55,7 @@ mod delta_layer; mod filename; mod image_layer; mod inmemory_layer; +mod interval_tree; mod layer_map; mod storage_layer; @@ -70,6 +75,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call. static TIMEOUT: Duration = Duration::from_secs(60); +// Taken from PG_CONTROL_MAX_SAFE_SIZE +const METADATA_MAX_SAFE_SIZE: usize = 512; +const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::(); +const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE; + // Metrics collected on operations on the storage repository. lazy_static! { static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( @@ -109,7 +119,9 @@ pub struct LayeredRepository { timelines: Mutex>>, walredo_mgr: Arc, - relish_uploader: Option>, + /// Makes evey repo's timelines to backup their files to remote storage, + /// when they get frozen. + upload_relishes: bool, } /// Public interface @@ -124,7 +136,7 @@ impl Repository for LayeredRepository { let mut timelines = self.timelines.lock().unwrap(); // Create the timeline directory, and write initial metadata to file. - std::fs::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?; + crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?; let metadata = TimelineMetadata { disk_consistent_lsn: Lsn(0), @@ -132,7 +144,7 @@ impl Repository for LayeredRepository { ancestor_timeline: None, ancestor_lsn: Lsn(0), }; - Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata)?; + Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?; let timeline = LayeredTimeline::new( self.conf, @@ -141,8 +153,8 @@ impl Repository for LayeredRepository { timelineid, self.tenantid, Arc::clone(&self.walredo_mgr), - self.relish_uploader.as_ref().map(Arc::clone), 0, + false, )?; let timeline_rc = Arc::new(timeline); @@ -176,8 +188,8 @@ impl Repository for LayeredRepository { ancestor_timeline: Some(src), ancestor_lsn: start_lsn, }; - std::fs::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?; - Self::save_metadata(self.conf, dst, self.tenantid, &metadata)?; + crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?; + Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?; info!("branched timeline {} from {} at {}", dst, src, start_lsn); @@ -191,12 +203,12 @@ impl Repository for LayeredRepository { &self, target_timelineid: Option, horizon: u64, - compact: bool, + checkpoint_before_gc: bool, ) -> Result { STORAGE_TIME .with_label_values(&["gc"]) .observe_closure_duration(|| { - self.gc_iteration_internal(target_timelineid, horizon, compact) + self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) }) } } @@ -214,6 +226,7 @@ impl LayeredRepository { Some(timeline) => Ok(timeline.clone()), None => { let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid)?; + let disk_consistent_lsn = metadata.disk_consistent_lsn; // Recurse to look up the ancestor timeline. // @@ -233,35 +246,17 @@ impl LayeredRepository { timelineid, self.tenantid, Arc::clone(&self.walredo_mgr), - self.relish_uploader.as_ref().map(Arc::clone), - 0, // init with 0 and update after layers are loaded + 0, // init with 0 and update after layers are loaded, + self.upload_relishes, )?; // List the layers on disk, and load them into the layer map - timeline.load_layer_map()?; + timeline.load_layer_map(disk_consistent_lsn)?; // needs to be after load_layer_map timeline.init_current_logical_size()?; let timeline = Arc::new(timeline); - - // Load any new WAL after the last checkpoint into memory. - info!( - "Loading WAL for timeline {} starting at {}", - timelineid, - timeline.get_last_record_lsn() - ); - let wal_dir = self - .conf - .timeline_path(&timelineid, &self.tenantid) - .join("wal"); - import_timeline_wal(&wal_dir, timeline.as_ref(), timeline.get_last_record_lsn())?; - - if cfg!(debug_assertions) { - // check again after wal loading - Self::assert_size_calculation_matches_offloaded(Arc::clone(&timeline)); - } - timelines.insert(timelineid, timeline.clone()); Ok(timeline) } @@ -272,15 +267,14 @@ impl LayeredRepository { conf: &'static PageServerConf, walredo_mgr: Arc, tenantid: ZTenantId, + upload_relishes: bool, ) -> LayeredRepository { LayeredRepository { tenantid, conf, timelines: Mutex::new(HashMap::new()), walredo_mgr, - relish_uploader: conf.relish_storage_config.as_ref().map(|config| { - Arc::new(QueueBasedRelishUploader::new(config, &conf.workdir).unwrap()) - }), + upload_relishes, } } @@ -355,13 +349,36 @@ impl LayeredRepository { timelineid: ZTimelineId, tenantid: ZTenantId, data: &TimelineMetadata, + first_save: bool, ) -> Result { - let path = conf.timeline_path(&timelineid, &tenantid).join("metadata"); - let mut file = File::create(&path)?; + let timeline_path = conf.timeline_path(&timelineid, &tenantid); + let path = timeline_path.join("metadata"); + // use OpenOptions to ensure file presence is consistent with first_save + let mut file = OpenOptions::new() + .write(true) + .create_new(first_save) + .open(&path)?; info!("saving metadata {}", path.display()); - file.write_all(&TimelineMetadata::ser(data)?)?; + let mut metadata_bytes = TimelineMetadata::ser(data)?; + + assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE); + metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8); + + let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]); + metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum)); + + if file.write(&metadata_bytes)? != metadata_bytes.len() { + bail!("Could not write all the metadata bytes in a single call"); + } + file.sync_all()?; + + // fsync the parent directory to ensure the directory entry is durable + if first_save { + let timeline_dir = File::open(&timeline_path)?; + timeline_dir.sync_all()?; + } Ok(path) } @@ -372,9 +389,18 @@ impl LayeredRepository { tenantid: ZTenantId, ) -> Result { let path = conf.timeline_path(&timelineid, &tenantid).join("metadata"); - let data = std::fs::read(&path)?; + let metadata_bytes = std::fs::read(&path)?; + ensure!(metadata_bytes.len() == METADATA_MAX_SAFE_SIZE); - let data = TimelineMetadata::des(&data)?; + let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE]; + let calculated_checksum = crc32c::crc32c(data); + + let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] = + metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?; + let expected_checksum = u32::from_le_bytes(*checksum_bytes); + ensure!(calculated_checksum == expected_checksum); + + let data = TimelineMetadata::des_prefix(data)?; assert!(data.disk_consistent_lsn.is_aligned()); Ok(data) @@ -401,14 +427,14 @@ impl LayeredRepository { // don't cover any branch point LSNs. // // TODO: - // - if a relation has been modified on a child branch, then we + // - if a relation has a non-incremental persistent layer on a child branch, then we // don't need to keep that in the parent anymore. But currently // we do. fn gc_iteration_internal( &self, target_timelineid: Option, horizon: u64, - compact: bool, + checkpoint_before_gc: bool, ) -> Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); @@ -420,63 +446,71 @@ impl LayeredRepository { // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. // + let mut timelineids: Vec = Vec::new(); + // We scan the directory, not the in-memory hash table, because the hash // table only contains entries for timelines that have been accessed. We // need to take all timelines into account, not only the active ones. - let mut timelineids: Vec = Vec::new(); - let mut all_branchpoints: BTreeSet<(ZTimelineId, Lsn)> = BTreeSet::new(); let timelines_path = self.conf.timelines_path(&self.tenantid); + for direntry in fs::read_dir(timelines_path)? { let direntry = direntry?; if let Some(fname) = direntry.file_name().to_str() { if let Ok(timelineid) = fname.parse::() { timelineids.push(timelineid); - - // Read the metadata of this timeline to get its parent timeline. - // - // We read the ancestor information directly from the file, instead - // of calling get_timeline(). We don't want to load the timeline - // into memory just for GC. - // - // FIXME: we open the timeline in the loop below with - // get_timeline_locked() anyway, so maybe we should just do it - // here, too. - let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid)?; - if let Some(ancestor_timeline) = metadata.ancestor_timeline { - all_branchpoints.insert((ancestor_timeline, metadata.ancestor_lsn)); - } } } } - // Ok, we now know all the branch points. Iterate through them. + //Now collect info about branchpoints + let mut all_branchpoints: BTreeSet<(ZTimelineId, Lsn)> = BTreeSet::new(); + for timelineid in &timelineids { + let timeline = self.get_timeline_locked(*timelineid, &mut *timelines)?; + + if let Some(ancestor_timeline) = &timeline.ancestor_timeline { + // If target_timeline is specified, we only need to know branchpoints of its childs + if let Some(timelineid) = target_timelineid { + if ancestor_timeline.timelineid == timelineid { + all_branchpoints + .insert((ancestor_timeline.timelineid, timeline.ancestor_lsn)); + } + } + // Collect branchpoints for all timelines + else { + all_branchpoints.insert((ancestor_timeline.timelineid, timeline.ancestor_lsn)); + } + } + } + + // Ok, we now know all the branch points. + // Perform GC for each timeline. for timelineid in timelineids { - // If a target timeline was specified, leave the other timelines alone. - // This is a bit inefficient, because we still collect the information for - // all the timelines above. - if let Some(x) = target_timelineid { - if x != timelineid { + // We have already loaded all timelines above + // so this operation is just a quick map lookup. + let timeline = self.get_timeline_locked(timelineid, &mut *timelines)?; + + // If target_timeline is specified, only GC it + if let Some(target_timelineid) = target_timelineid { + if timelineid != target_timelineid { continue; } } - let branchpoints: Vec = all_branchpoints - .range(( - Included((timelineid, Lsn(0))), - Included((timelineid, Lsn(u64::MAX))), - )) - .map(|&x| x.1) - .collect(); + if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { + let branchpoints: Vec = all_branchpoints + .range(( + Included((timelineid, Lsn(0))), + Included((timelineid, Lsn(u64::MAX))), + )) + .map(|&x| x.1) + .collect(); - let timeline = self.get_timeline_locked(timelineid, &mut *timelines)?; - let last_lsn = timeline.get_last_record_lsn(); - - if let Some(cutoff) = last_lsn.checked_sub(horizon) { - // If GC was explicitly requested by the admin, force flush all in-memory - // layers to disk first, so that they too can be garbage collected. That's + // If requested, force flush all in-memory layers to disk first, + // so that they too can be garbage collected. That's // used in tests, so we want as deterministic results as possible. - if compact { + if checkpoint_before_gc { timeline.checkpoint()?; + info!("timeline {} checkpoint_before_gc done", timelineid); } let result = timeline.gc_timeline(branchpoints, cutoff)?; @@ -488,24 +522,6 @@ impl LayeredRepository { totals.elapsed = now.elapsed(); Ok(totals) } - - fn assert_size_calculation_matches(incremental: usize, timeline: &LayeredTimeline) { - match timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) { - Ok(non_incremental) => { - if incremental != non_incremental { - error!("timeline size calculation diverged, incremental doesn't match non incremental. incremental={} non_incremental={}", incremental, non_incremental); - } - } - Err(e) => error!("failed to calculate non incremental timeline size: {}", e), - } - } - - fn assert_size_calculation_matches_offloaded(timeline: Arc) { - let incremental = timeline.get_current_logical_size(); - thread::spawn(move || { - Self::assert_size_calculation_matches(incremental, &timeline); - }); - } } /// Metadata stored on disk for each timeline @@ -542,8 +558,6 @@ pub struct LayeredTimeline { // WAL redo manager walredo_mgr: Arc, - relish_uploader: Option>, - // What page versions do we hold in the repository? If we get a // request > last_record_lsn, we need to wait until we receive all // the WAL up to the request. The SeqWait provides functions for @@ -582,6 +596,18 @@ pub struct LayeredTimeline { // because current_logical_size is consistent on last_record_lsn, not ondisk_consistent_lsn // NOTE: current_logical_size also includes size of the ancestor current_logical_size: AtomicUsize, // bytes + + // To avoid calling .with_label_values and formatting the tenant and timeline IDs to strings + // every time the logical size is updated, keep a direct reference to the Gauge here. + // unfortunately it doesnt forward atomic methods like .fetch_add + // so use two fields: actual size and metric + // see https://github.com/zenithdb/zenith/issues/622 for discussion + // TODO: it is possible to combine these two fields into single one using custom metric which uses SeqCst + // ordering for its operations, but involves private modules, and macro trickery + current_logical_size_gauge: IntGauge, + + /// If `true`, will backup its timeline files to remote storage after freezing. + upload_relishes: bool, } /// Public interface functions @@ -589,8 +615,11 @@ impl Timeline for LayeredTimeline { /// Wait until WAL has been received up to the given LSN. fn wait_lsn(&self, lsn: Lsn) -> Result<()> { // This should never be called from the WAL receiver thread, because that could lead - // to a deadlock. FIXME: Is there a less hacky way to check that? - assert_ne!(thread::current().name(), Some("WAL receiver thread")); + // to a deadlock. + assert!( + !IS_WAL_RECEIVER.with(|c| c.get()), + "wait_lsn called by WAL receiver thread" + ); self.last_record_lsn .wait_for_timeout(lsn, TIMEOUT) @@ -764,6 +793,7 @@ impl Timeline for LayeredTimeline { rel ); } + ensure!(rec.lsn.is_aligned(), "unaligned record LSN"); let seg = SegmentTag::from_blknum(rel, blknum); let delta_size = self.perform_write_op(seg, rec.lsn, |layer| { @@ -777,6 +807,7 @@ impl Timeline for LayeredTimeline { if !rel.is_blocky() { bail!("invalid truncation for non-blocky relish {}", rel); } + ensure!(lsn.is_aligned(), "unaligned record LSN"); debug!("put_truncation: {} to {} blocks at {}", rel, relsize, lsn); @@ -867,6 +898,7 @@ impl Timeline for LayeredTimeline { rel ); } + ensure!(lsn.is_aligned(), "unaligned record LSN"); let seg = SegmentTag::from_blknum(rel, blknum); @@ -963,9 +995,12 @@ impl LayeredTimeline { timelineid: ZTimelineId, tenantid: ZTenantId, walredo_mgr: Arc, - relish_uploader: Option>, current_logical_size: usize, + upload_relishes: bool, ) -> Result { + let current_logical_size_gauge = LOGICAL_TIMELINE_SIZE + .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .unwrap(); let timeline = LayeredTimeline { conf, timelineid, @@ -973,7 +1008,6 @@ impl LayeredTimeline { layers: Mutex::new(LayerMap::default()), walredo_mgr, - relish_uploader, // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. last_record_lsn: SeqWait::new(RecordLsn { @@ -985,6 +1019,8 @@ impl LayeredTimeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn, current_logical_size: AtomicUsize::new(current_logical_size), + current_logical_size_gauge, + upload_relishes, }; Ok(timeline) } @@ -992,17 +1028,29 @@ impl LayeredTimeline { /// /// Scan the timeline directory to populate the layer map /// - fn load_layer_map(&self) -> anyhow::Result<()> { + fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { info!( "loading layer map for timeline {} into memory", self.timelineid ); let mut layers = self.layers.lock().unwrap(); - let (imgfilenames, mut deltafilenames) = + let (imgfilenames, deltafilenames) = filename::list_files(self.conf, self.timelineid, self.tenantid)?; + let timeline_path = self.conf.timeline_path(&self.timelineid, &self.tenantid); + // First create ImageLayer structs for each image file. for filename in imgfilenames.iter() { + if filename.lsn > disk_consistent_lsn { + warn!( + "found future image layer {} on timeline {}", + filename, self.timelineid + ); + + rename_to_backup(timeline_path.join(filename.to_string()))?; + continue; + } + let layer = ImageLayer::new(self.conf, self.timelineid, self.tenantid, filename); info!( @@ -1014,33 +1062,25 @@ impl LayeredTimeline { layers.insert_historic(Arc::new(layer)); } - // Then for the Delta files. The delta files are created in order starting - // from the oldest file, because each DeltaLayer needs a reference to its - // predecessor. - deltafilenames.sort(); - + // Then for the Delta files. for filename in deltafilenames.iter() { - let predecessor = layers.get(&filename.seg, filename.start_lsn); + ensure!(filename.start_lsn < filename.end_lsn); + if filename.end_lsn > disk_consistent_lsn { + warn!( + "found future delta layer {} on timeline {}", + filename, self.timelineid + ); - let predecessor_str: String = if let Some(prec) = &predecessor { - prec.filename().display().to_string() - } else { - "none".to_string() - }; + rename_to_backup(timeline_path.join(filename.to_string()))?; + continue; + } - let layer = DeltaLayer::new( - self.conf, - self.timelineid, - self.tenantid, - filename, - predecessor, - ); + let layer = DeltaLayer::new(self.conf, self.timelineid, self.tenantid, filename); info!( - "found layer {} on timeline {}, predecessor: {}", + "found layer {} on timeline {}", layer.filename().display(), self.timelineid, - predecessor_str, ); layers.insert_historic(Arc::new(layer)); } @@ -1167,36 +1207,57 @@ impl LayeredTimeline { assert!(lsn.is_aligned()); let last_record_lsn = self.get_last_record_lsn(); - if lsn <= last_record_lsn { - panic!( - "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", - lsn, - last_record_lsn - ); - } + assert!( + lsn > last_record_lsn, + "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", + lsn, + last_record_lsn, + ); // Do we have a layer open for writing already? - if let Some(layer) = layers.get_open(&seg) { - if layer.get_start_lsn() > lsn { + let layer; + if let Some(open_layer) = layers.get_open(&seg) { + if open_layer.get_start_lsn() > lsn { bail!("unexpected open layer in the future"); } - return Ok(layer); - } - // No (writeable) layer for this relation yet. Create one. + // Open layer exists, but it is dropped, so create a new one. + if open_layer.is_dropped() { + assert!(!open_layer.is_writeable()); + // Layer that is created after dropped one represents a new relish segment. + trace!( + "creating layer for write for new relish segment after dropped layer {} at {}/{}", + seg, + self.timelineid, + lsn + ); + + layer = InMemoryLayer::create( + self.conf, + self.timelineid, + self.tenantid, + seg, + lsn, + lsn, + )?; + } else { + return Ok(open_layer); + } + } + // No writeable layer for this relation. Create one. // // Is this a completely new relation? Or the first modification after branching? // - - let layer; - if let Some((prev_layer, _prev_lsn)) = self.get_layer_for_read_locked(seg, lsn, &layers)? { + else if let Some((prev_layer, _prev_lsn)) = + self.get_layer_for_read_locked(seg, lsn, &layers)? + { // Create new entry after the previous one. let start_lsn; if prev_layer.get_timeline_id() != self.timelineid { // First modification on this timeline - start_lsn = self.ancestor_lsn; + start_lsn = self.ancestor_lsn + 1; trace!( - "creating file for write for {} at branch point {}/{}", + "creating layer for write for {} at branch point {}/{}", seg, self.timelineid, start_lsn @@ -1204,7 +1265,7 @@ impl LayeredTimeline { } else { start_lsn = prev_layer.get_end_lsn(); trace!( - "creating file for write for {} after previous layer {}/{}", + "creating layer for write for {} after previous layer {}/{}", seg, self.timelineid, start_lsn @@ -1271,6 +1332,8 @@ impl LayeredTimeline { last_record_lsn ); + let timeline_dir = File::open(self.conf.timeline_path(&self.timelineid, &self.tenantid))?; + // Take the in-memory layer with the oldest WAL record. If it's older // than the threshold, write it out to disk as a new image and delta file. // Repeat until all remaining in-memory layers are within the threshold. @@ -1282,6 +1345,8 @@ impl LayeredTimeline { // a lot of memory and/or aren't receiving much updates anymore. let mut disk_consistent_lsn = last_record_lsn; + let mut created_historics = false; + while let Some((oldest_layer, oldest_generation)) = layers.peek_oldest_open() { let oldest_pending_lsn = oldest_layer.get_oldest_pending_lsn(); @@ -1326,31 +1391,23 @@ impl LayeredTimeline { layers.insert_open(new_open); } + // We temporarily insert InMemory layer into historic list here. + // TODO: check that all possible concurrent users of 'historic' treat it right layers.insert_historic(frozen.clone()); // Write the now-frozen layer to disk. That could take a while, so release the lock while do it drop(layers); let new_historics = frozen.write_to_disk(self)?; layers = self.layers.lock().unwrap(); - if let Some(relish_uploader) = &self.relish_uploader { - for label_path in new_historics.iter().filter_map(|layer| layer.path()) { - relish_uploader.schedule_upload(self.timelineid, label_path); - } + + if !new_historics.is_empty() { + created_historics = true; } // Finally, replace the frozen in-memory layer with the new on-disk layers - layers.remove_historic(frozen.as_ref()); - - if let Some(last_historic) = new_historics.last() { - if let Some(new_open) = &maybe_new_open { - let maybe_old_predecessor = - new_open.update_predecessor(Arc::clone(last_historic)); - let old_predecessor = maybe_old_predecessor - .expect("new_open should always be a successor to frozen"); - assert!(layer_ptr_eq(frozen.as_ref(), old_predecessor.as_ref())); - } - } + layers.remove_historic(frozen.clone()); + // Add the historics to the LayerMap for n in new_historics { layers.insert_historic(n); } @@ -1363,6 +1420,14 @@ impl LayeredTimeline { layer.unload()?; } + drop(layers); + + if created_historics { + // We must fsync the timeline dir to ensure the directory entries for + // new layer files are durable + timeline_dir.sync_all()?; + } + // Save the metadata, with updated 'disk_consistent_lsn', to a // file in the timeline dir. After crash, we will restart WAL // streaming and processing from that point. @@ -1387,10 +1452,23 @@ impl LayeredTimeline { ancestor_timeline: ancestor_timelineid, ancestor_lsn: self.ancestor_lsn, }; - let metadata_path = - LayeredRepository::save_metadata(self.conf, self.timelineid, self.tenantid, &metadata)?; - if let Some(relish_uploader) = &self.relish_uploader { - relish_uploader.schedule_upload(self.timelineid, metadata_path); + let _metadata_path = LayeredRepository::save_metadata( + self.conf, + self.timelineid, + self.tenantid, + &metadata, + false, + )?; + if self.upload_relishes { + schedule_timeline_upload(()) + // schedule_timeline_upload(LocalTimeline { + // tenant_id: self.tenantid, + // timeline_id: self.timelineid, + // metadata_path, + // image_layers: image_layer_uploads, + // delta_layers: delta_layer_uploads, + // disk_consistent_lsn, + // }); } // Also update the in-memory copy @@ -1429,15 +1507,18 @@ impl LayeredTimeline { "running GC on timeline {}, cutoff {}", self.timelineid, cutoff ); + info!("retain_lsns: {:?}", retain_lsns); let mut layers_to_remove: Vec> = Vec::new(); - // Scan all layer files in the directory. For each file, if a newer file - // exists, we can remove the old one. + // Scan all on-disk layers in the timeline. + // + // Garbage collect the layer if all conditions are satisfied: + // 1. it is older than cutoff LSN; + // 2. it doesn't need to be retained for 'retain_lsns'; + // 3. newer on-disk layer exists (only for non-dropped segments); + // 4. this layer doesn't serve as a tombstone for some older layer; // - // Determine for each file if it needs to be retained - // FIXME: also scan open in-memory layers. Normally we cannot remove the - // latest layer of any seg, but if it was dropped it's possible let mut layers = self.layers.lock().unwrap(); 'outer: for l in layers.iter_historic_layers() { let seg = l.get_seg_tag(); @@ -1448,7 +1529,7 @@ impl LayeredTimeline { result.ondisk_nonrelfiles_total += 1; } - // Is it newer than cutoff point? + // 1. Is it newer than cutoff point? if l.get_end_lsn() > cutoff { info!( "keeping {} {}-{} because it's newer than cutoff {}", @@ -1465,10 +1546,10 @@ impl LayeredTimeline { continue 'outer; } - // Is it needed by a child branch? + // 2. Is it needed by a child branch? for retain_lsn in &retain_lsns { - // FIXME: are the bounds inclusive or exclusive? - if l.get_start_lsn() <= *retain_lsn && *retain_lsn <= l.get_end_lsn() { + // start_lsn is inclusive and end_lsn is exclusive + if l.get_start_lsn() <= *retain_lsn && *retain_lsn < l.get_end_lsn() { info!( "keeping {} {}-{} because it's needed by branch point {}", seg, @@ -1485,9 +1566,15 @@ impl LayeredTimeline { } } - // Unless the relation was dropped, is there a later image file for this relation? + // 3. Is there a later on-disk layer for this relation? if !l.is_dropped() && !layers.newer_image_layer_exists(l.get_seg_tag(), l.get_end_lsn()) { + info!( + "keeping {} {}-{} because it is the latest layer", + seg, + l.get_start_lsn(), + l.get_end_lsn() + ); if seg.rel.is_relation() { result.ondisk_relfiles_not_updated += 1; } else { @@ -1496,6 +1583,77 @@ impl LayeredTimeline { continue 'outer; } + // 4. Does this layer serve as a tombstome for some older layer? + if l.is_dropped() { + let prior_lsn = l.get_start_lsn().checked_sub(1u64).unwrap(); + + // Check if this layer serves as a tombstone for this timeline + // We have to do this separately from timeline check below, + // because LayerMap of this timeline is already locked. + let mut is_tombstone = layers.layer_exists_at_lsn(l.get_seg_tag(), prior_lsn)?; + if is_tombstone { + info!( + "earlier layer exists at {} in {}", + prior_lsn, self.timelineid + ); + } + // Now check ancestor timelines, if any + else if let Some(ancestor) = &self.ancestor_timeline { + let prior_lsn = ancestor.get_last_record_lsn(); + if seg.rel.is_blocky() { + info!( + "check blocky relish size {} at {} in {} for layer {}-{}", + seg, + prior_lsn, + ancestor.timelineid, + l.get_start_lsn(), + l.get_end_lsn() + ); + match ancestor.get_relish_size(seg.rel, prior_lsn).unwrap() { + Some(size) => { + let last_live_seg = SegmentTag::from_blknum(seg.rel, size - 1); + info!( + "blocky rel size is {} last_live_seg.segno {} seg.segno {}", + size, last_live_seg.segno, seg.segno + ); + if last_live_seg.segno >= seg.segno { + is_tombstone = true; + } + } + _ => { + info!("blocky rel doesn't exist"); + } + } + } else { + info!( + "check non-blocky relish existence {} at {} in {} for layer {}-{}", + seg, + prior_lsn, + ancestor.timelineid, + l.get_start_lsn(), + l.get_end_lsn() + ); + is_tombstone = ancestor.get_rel_exists(seg.rel, prior_lsn).unwrap_or(false); + } + } + + if is_tombstone { + info!( + "keeping {} {}-{} because this layer servers as a tombstome for older layer", + seg, + l.get_start_lsn(), + l.get_end_lsn() + ); + + if seg.rel.is_relation() { + result.ondisk_relfiles_needed_as_tombstone += 1; + } else { + result.ondisk_nonrelfiles_needed_as_tombstone += 1; + } + continue 'outer; + } + } + // We didn't find any reason to keep this file, so remove it. info!( "garbage collecting {} {}-{} {}", @@ -1512,7 +1670,7 @@ impl LayeredTimeline { // while iterating it. BTreeMap::retain() would be another option) for doomed_layer in layers_to_remove { doomed_layer.delete()?; - layers.remove_historic(&*doomed_layer); + layers.remove_historic(doomed_layer.clone()); match ( doomed_layer.is_dropped(), @@ -1555,12 +1713,20 @@ impl LayeredTimeline { loop { match layer_ref.get_page_reconstruct_data(blknum, curr_lsn, &mut data)? { PageReconstructResult::Complete => break, - PageReconstructResult::Continue(cont_lsn, cont_layer) => { + PageReconstructResult::Continue(cont_lsn) => { // Fetch base image / more WAL from the returned predecessor layer - layer_arc = cont_layer; - layer_ref = &*layer_arc; - curr_lsn = cont_lsn; - continue; + if let Some((cont_layer, cont_lsn)) = self.get_layer_for_read(seg, cont_lsn)? { + layer_arc = cont_layer; + layer_ref = &*layer_arc; + curr_lsn = cont_lsn; + continue; + } else { + bail!( + "could not find predecessor layer of segment {} at {}", + seg.rel, + cont_lsn + ); + } } PageReconstructResult::Missing(lsn) => { // Oops, we could not reconstruct the page. @@ -1662,9 +1828,8 @@ impl LayeredTimeline { diff, val + diff as usize, ); - LOGICAL_TIMELINE_SIZE - .with_label_values(&[&self.tenantid.to_string(), &self.timelineid.to_string()]) - .set(val as i64 + diff as i64) + self.current_logical_size_gauge + .set(val as i64 + diff as i64); } /// @@ -1680,9 +1845,8 @@ impl LayeredTimeline { diff, val - diff as usize, ); - LOGICAL_TIMELINE_SIZE - .with_label_values(&[&self.tenantid.to_string(), &self.timelineid.to_string()]) - .set(val as i64 - diff as i64) + self.current_logical_size_gauge + .set(val as i64 - diff as i64); } /// If a layer is in the process of being replaced in [`LayerMap`], write @@ -1735,13 +1899,22 @@ pub fn dump_layerfile_from_path(path: &Path) -> Result<()> { Ok(()) } -/// Check for equality of Layer memory addresses -fn layer_ptr_eq(l1: &dyn Layer, l2: &dyn Layer) -> bool { - let l1_ptr = l1 as *const dyn Layer; - let l2_ptr = l2 as *const dyn Layer; - // comparing *const dyn Layer will not only check for data address equality, - // but also for vtable address equality. - // to avoid this, we compare *const (). - // see here for more https://github.com/rust-lang/rust/issues/46139 - std::ptr::eq(l1_ptr as *const (), l2_ptr as *const ()) +/// Add a suffix to a layer file's name: .{num}.old +/// Uses the first available num (starts at 0) +fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { + let filename = path.file_name().unwrap().to_str().unwrap(); + let mut new_path = path.clone(); + + for i in 0u32.. { + new_path.set_file_name(format!("{}.{}.old", filename, i)); + if !new_path.exists() { + std::fs::rename(&path, &new_path)?; + return Ok(()); + } + } + + Err(anyhow!( + "couldn't find an unused backup number for {:?}", + path + )) } diff --git a/pageserver/src/layered_repository/README.md b/pageserver/src/layered_repository/README.md index fae2e12b2f..180f3dbea8 100644 --- a/pageserver/src/layered_repository/README.md +++ b/pageserver/src/layered_repository/README.md @@ -25,11 +25,13 @@ OnDisk layers can be Image or Delta: Dropped segments are always represented on disk by DeltaLayer. LSN range defined by start_lsn and end_lsn: -- start_lsn is always inclusive. -- end_lsn depends on layer kind: - - InMemoryLayer is either unbounded (end_lsn = MAX_LSN) or dropped (end_lsn = drop_lsn) - - ImageLayer represents snapshot at one LSN, so end_lsn = lsn. - - DeltaLayer has explicit end_lsn, which represents end of incremental layer. +- start_lsn is inclusive. +- end_lsn is exclusive. + +For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen +in-memory layer or a delta layer, it is a valid end bound. An image +layer represents snapshot at one LSN, so end_lsn is always the +snapshot LSN + 1 Layers can be open or historical: - Open layer is a writeable one. Only InMemory layer can be open. diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index b834daae10..ad16a86030 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -42,12 +42,10 @@ use crate::layered_repository::filename::{DeltaFileName, PathOrConf}; use crate::layered_repository::storage_layer::{ Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, }; -use crate::repository::WALRecord; use crate::waldecoder; use crate::PageServerConf; use crate::{ZTenantId, ZTimelineId}; -use anyhow::{bail, Result}; -use bytes::Bytes; +use anyhow::{bail, ensure, Result}; use log::*; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; @@ -59,7 +57,7 @@ use std::fs::File; use std::io::{BufWriter, Write}; use std::ops::Bound::Included; use std::path::{Path, PathBuf}; -use std::sync::{Arc, Mutex, MutexGuard}; +use std::sync::{Mutex, MutexGuard}; use bookfile::{Book, BookWriter}; @@ -109,12 +107,6 @@ impl From<&DeltaLayer> for Summary { } } -#[derive(Serialize, Deserialize)] -struct PageVersionMeta { - page_image_range: Option, - record_range: Option, -} - /// /// DeltaLayer is the in-memory data structure associated with an /// on-disk delta file. We keep a DeltaLayer in memory for each @@ -139,9 +131,6 @@ pub struct DeltaLayer { dropped: bool, - /// Predecessor layer - predecessor: Option>, - inner: Mutex, } @@ -152,7 +141,7 @@ pub struct DeltaLayerInner { /// All versions of all pages in the file are are kept here. /// Indexed by block number and LSN. - page_version_metas: BTreeMap<(u32, Lsn), PageVersionMeta>, + page_version_metas: BTreeMap<(u32, Lsn), BlobRange>, /// `relsizes` tracks the size of the relation at different points in time. relsizes: BTreeMap, @@ -229,15 +218,15 @@ impl Layer for DeltaLayer { let mut iter = inner .page_version_metas .range((Included(&minkey), Included(&maxkey))); - while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() { - if let Some(img_range) = &entry.page_image_range { + while let Some(((_blknum, _entry_lsn), blob_range)) = iter.next_back() { + let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?; + + if let Some(img) = pv.page_image { // Found a page image, return it - let img = Bytes::from(read_blob(&page_version_reader, img_range)?); reconstruct_data.page_img = Some(img); need_image = false; break; - } else if let Some(rec_range) = &entry.record_range { - let rec = WALRecord::des(&read_blob(&page_version_reader, rec_range)?)?; + } else if let Some(rec) = pv.record { let will_init = rec.will_init; reconstruct_data.records.push(rec); if will_init { @@ -255,16 +244,9 @@ impl Layer for DeltaLayer { } // If an older page image is needed to reconstruct the page, let the - // caller know about the predecessor layer. + // caller know. if need_image { - if let Some(cont_layer) = &self.predecessor { - Ok(PageReconstructResult::Continue( - self.start_lsn, - Arc::clone(cont_layer), - )) - } else { - Ok(PageReconstructResult::Missing(self.start_lsn)) - } + Ok(PageReconstructResult::Continue(self.start_lsn)) } else { Ok(PageReconstructResult::Complete) } @@ -273,6 +255,10 @@ impl Layer for DeltaLayer { /// Get size of the relation at given LSN fn get_seg_size(&self, lsn: Lsn) -> Result { assert!(lsn >= self.start_lsn); + ensure!( + self.seg.rel.is_blocky(), + "get_seg_size() called on a non-blocky rel" + ); // Scan the BTreeMap backwards, starting from the given entry. let inner = self.load()?; @@ -281,11 +267,8 @@ impl Layer for DeltaLayer { let result; if let Some((_entry_lsn, entry)) = iter.next_back() { result = *entry; - // Use the base image if needed - } else if let Some(predecessor) = &self.predecessor { - result = predecessor.get_seg_size(lsn)?; } else { - result = 0; + bail!("could not find seg size in delta layer"); } Ok(result) } @@ -340,16 +323,16 @@ impl Layer for DeltaLayer { println!("--- page versions ---"); let (_path, book) = self.open_book()?; let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?; - for (k, v) in inner.page_version_metas.iter() { + for ((blk, lsn), blob_range) in inner.page_version_metas.iter() { let mut desc = String::new(); - if let Some(page_image_range) = v.page_image_range.as_ref() { - let image = read_blob(&chapter, page_image_range)?; - write!(&mut desc, " img {} bytes", image.len())?; + let buf = read_blob(&chapter, blob_range)?; + let pv = PageVersion::des(&buf)?; + + if let Some(img) = pv.page_image.as_ref() { + write!(&mut desc, " img {} bytes", img.len())?; } - if let Some(record_range) = v.record_range.as_ref() { - let record_bytes = read_blob(&chapter, record_range)?; - let rec = WALRecord::des(&record_bytes)?; + if let Some(rec) = pv.record.as_ref() { let wal_desc = waldecoder::describe_wal_record(&rec.rec); write!( &mut desc, @@ -359,7 +342,7 @@ impl Layer for DeltaLayer { wal_desc )?; } - println!(" blk {} at {}: {}", k.0, k.1, desc); + println!(" blk {} at {}: {}", blk, lsn, desc); } Ok(()) @@ -381,14 +364,15 @@ impl DeltaLayer { } } - /// Create a new delta file, using the given btreemaps containing the page versions and - /// relsizes. + /// Create a new delta file, using the given page versions and relsizes. + /// The page versions are passed by an iterator; the iterator must return + /// page versions in blknum+lsn order. /// /// This is used to write the in-memory layer to disk. The in-memory layer uses the same /// data structure with two btreemaps as we do, so passing the btreemaps is currently /// expedient. #[allow(clippy::too_many_arguments)] - pub fn create( + pub fn create<'a>( conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId, @@ -396,10 +380,13 @@ impl DeltaLayer { start_lsn: Lsn, end_lsn: Lsn, dropped: bool, - predecessor: Option>, - page_versions: BTreeMap<(u32, Lsn), PageVersion>, + page_versions: impl Iterator, relsizes: BTreeMap, ) -> Result { + if seg.rel.is_blocky() { + assert!(!relsizes.is_empty()); + } + let delta_layer = DeltaLayer { path_or_conf: PathOrConf::Conf(conf), timelineid, @@ -413,7 +400,6 @@ impl DeltaLayer { page_version_metas: BTreeMap::new(), relsizes, }), - predecessor, }; let mut inner = delta_layer.inner.lock().unwrap(); @@ -431,26 +417,10 @@ impl DeltaLayer { let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER); for (key, page_version) in page_versions { - let page_image_range = page_version - .page_image - .map(|page_image| page_version_writer.write_blob(page_image.as_ref())) - .transpose()?; + let buf = PageVersion::ser(page_version)?; + let blob_range = page_version_writer.write_blob(&buf)?; - let record_range = page_version - .record - .map(|record| { - let buf = WALRecord::ser(&record)?; - page_version_writer.write_blob(&buf) - }) - .transpose()?; - - let old = inner.page_version_metas.insert( - key, - PageVersionMeta { - page_image_range, - record_range, - }, - ); + let old = inner.page_version_metas.insert(*key, blob_range); assert!(old.is_none()); } @@ -484,7 +454,8 @@ impl DeltaLayer { let book = chapter.close()?; // This flushes the underlying 'buf_writer'. - book.close()?; + let writer = book.close()?; + writer.get_ref().sync_all()?; trace!("saved {}", &path.display()); @@ -573,7 +544,6 @@ impl DeltaLayer { timelineid: ZTimelineId, tenantid: ZTenantId, filename: &DeltaFileName, - predecessor: Option>, ) -> DeltaLayer { DeltaLayer { path_or_conf: PathOrConf::Conf(conf), @@ -588,7 +558,6 @@ impl DeltaLayer { page_version_metas: BTreeMap::new(), relsizes: BTreeMap::new(), }), - predecessor, } } @@ -612,7 +581,6 @@ impl DeltaLayer { page_version_metas: BTreeMap::new(), relsizes: BTreeMap::new(), }), - predecessor: None, }) } } diff --git a/pageserver/src/layered_repository/filename.rs b/pageserver/src/layered_repository/filename.rs index e0adacaa26..50bfe2977e 100644 --- a/pageserver/src/layered_repository/filename.rs +++ b/pageserver/src/layered_repository/filename.rs @@ -290,7 +290,7 @@ pub fn list_files( deltafiles.push(deltafilename); } else if let Some(imgfilename) = ImageFileName::from_str(fname) { imgfiles.push(imgfilename); - } else if fname == "wal" || fname == "metadata" || fname == "ancestor" { + } else if fname == "metadata" || fname == "ancestor" || fname.ends_with(".old") { // ignore these } else { warn!("unrecognized filename in timeline dir: {}", fname); diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 2ad1853755..a9487a02d4 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -152,7 +152,8 @@ impl Layer for ImageLayer { } fn get_end_lsn(&self) -> Lsn { - self.lsn + // End-bound is exclusive + self.lsn + 1 } /// Look up given page in the file @@ -336,7 +337,8 @@ impl ImageLayer { let book = chapter.close()?; // This flushes the underlying 'buf_writer'. - book.close()?; + let writer = book.close()?; + writer.get_ref().sync_all()?; trace!("saved {}", &path.display()); diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 7b47a701ad..f96b5e71d1 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -12,14 +12,16 @@ use crate::layered_repository::{DeltaLayer, ImageLayer}; use crate::repository::WALRecord; use crate::PageServerConf; use crate::{ZTenantId, ZTimelineId}; -use anyhow::{bail, Result}; +use anyhow::{bail, ensure, Result}; use bytes::Bytes; use log::*; +use std::cmp::min; use std::collections::BTreeMap; use std::ops::Bound::Included; use std::path::PathBuf; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, RwLock}; +use zenith_utils::accum::Accum; use zenith_utils::lsn::Lsn; pub struct InMemoryLayer { @@ -42,7 +44,10 @@ pub struct InMemoryLayer { /// The above fields never change. The parts that do change are in 'inner', /// and protected by mutex. - inner: Mutex, + inner: RwLock, + + /// Predecessor layer might be needed? + incremental: bool, } pub struct InMemoryLayerInner { @@ -58,14 +63,15 @@ pub struct InMemoryLayerInner { /// /// `segsizes` tracks the size of the segment at different points in time. /// + /// For a blocky rel, there is always one entry, at the layer's start_lsn, + /// so that determining the size never depends on the predecessor layer. For + /// a non-blocky rel, 'segsizes' is not used and is always empty. + /// segsizes: BTreeMap, /// Writes are only allowed when true. /// Set to false when this layer is in the process of being replaced. writeable: bool, - - /// Predecessor layer - predecessor: Option>, } impl InMemoryLayerInner { @@ -81,10 +87,11 @@ impl InMemoryLayerInner { // Scan the BTreeMap backwards, starting from the given entry. let mut iter = self.segsizes.range((Included(&Lsn(0)), Included(&lsn))); + // We make sure there is always at least one entry if let Some((_entry_lsn, entry)) = iter.next_back() { *entry } else { - 0 + panic!("could not find seg size in in-memory layer"); } } } @@ -93,7 +100,7 @@ impl Layer for InMemoryLayer { // An in-memory layer doesn't really have a filename as it's not stored on disk, // but we construct a filename as if it was a delta layer fn filename(&self) -> PathBuf { - let inner = self.inner.lock().unwrap(); + let inner = self.inner.read().unwrap(); let end_lsn; let dropped; @@ -137,7 +144,7 @@ impl Layer for InMemoryLayer { return Lsn(end_lsn.0 + 1); } - let inner = self.inner.lock().unwrap(); + let inner = self.inner.read().unwrap(); if let Some(drop_lsn) = inner.drop_lsn { drop_lsn @@ -147,7 +154,7 @@ impl Layer for InMemoryLayer { } fn is_dropped(&self) -> bool { - let inner = self.inner.lock().unwrap(); + let inner = self.inner.read().unwrap(); inner.drop_lsn.is_some() } @@ -162,10 +169,8 @@ impl Layer for InMemoryLayer { assert!(self.seg.blknum_in_seg(blknum)); - let predecessor: Option>; - { - let inner = self.inner.lock().unwrap(); + let inner = self.inner.read().unwrap(); // Scan the BTreeMap backwards, starting from reconstruct_data.lsn. let minkey = (blknum, Lsn(0)); @@ -190,16 +195,14 @@ impl Layer for InMemoryLayer { bail!("no page image or WAL record for requested page"); } } - - predecessor = inner.predecessor.clone(); // release lock on 'inner' } // If an older page image is needed to reconstruct the page, let the - // caller know about the predecessor layer. + // caller know if need_image { - if let Some(cont_layer) = predecessor { - Ok(PageReconstructResult::Continue(self.start_lsn, cont_layer)) + if self.incremental { + Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1))) } else { Ok(PageReconstructResult::Missing(self.start_lsn)) } @@ -211,14 +214,18 @@ impl Layer for InMemoryLayer { /// Get size of the relation at given LSN fn get_seg_size(&self, lsn: Lsn) -> Result { assert!(lsn >= self.start_lsn); + ensure!( + self.seg.rel.is_blocky(), + "get_seg_size() called on a non-blocky rel" + ); - let inner = self.inner.lock().unwrap(); + let inner = self.inner.read().unwrap(); Ok(inner.get_seg_size(lsn)) } /// Does this segment exist at given LSN? fn get_seg_exists(&self, lsn: Lsn) -> Result { - let inner = self.inner.lock().unwrap(); + let inner = self.inner.read().unwrap(); // If the segment created after requested LSN, // it doesn't exist in the layer. But we shouldn't @@ -250,13 +257,12 @@ impl Layer for InMemoryLayer { } fn is_incremental(&self) -> bool { - let inner = self.inner.lock().unwrap(); - inner.predecessor.is_some() + self.incremental } /// debugging function to print out the contents of the layer fn dump(&self) -> Result<()> { - let inner = self.inner.lock().unwrap(); + let inner = self.inner.read().unwrap(); let end_str = inner .drop_lsn @@ -330,6 +336,12 @@ impl InMemoryLayer { start_lsn ); + // The segment is initially empty, so initialize 'segsizes' with 0. + let mut segsizes = BTreeMap::new(); + if seg.rel.is_blocky() { + segsizes.insert(start_lsn, 0); + } + Ok(InMemoryLayer { conf, timelineid, @@ -338,12 +350,12 @@ impl InMemoryLayer { start_lsn, end_lsn: None, oldest_pending_lsn, - inner: Mutex::new(InMemoryLayerInner { + incremental: false, + inner: RwLock::new(InMemoryLayerInner { drop_lsn: None, page_versions: BTreeMap::new(), - segsizes: BTreeMap::new(), + segsizes, writeable: true, - predecessor: None, }), }) } @@ -387,7 +399,7 @@ impl InMemoryLayer { self.timelineid, lsn ); - let mut inner = self.inner.lock().unwrap(); + let mut inner = self.inner.write().unwrap(); inner.check_writeable()?; @@ -405,7 +417,7 @@ impl InMemoryLayer { if self.seg.rel.is_blocky() { let newsize = blknum - self.seg.segno * RELISH_SEG_SIZE + 1; - // use inner get_seg_size, since calling self.get_seg_size will try to acquire self.inner.lock + // use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock, // which we've just acquired above let oldsize = inner.get_seg_size(lsn); if newsize > oldsize { @@ -456,9 +468,13 @@ impl InMemoryLayer { /// Remember that the relation was truncated at given LSN pub fn put_truncation(&self, lsn: Lsn, segsize: u32) -> WriteResult<()> { + assert!( + self.seg.rel.is_blocky(), + "put_truncation() called on a non-blocky rel" + ); self.assert_not_frozen(); - let mut inner = self.inner.lock().unwrap(); + let mut inner = self.inner.write().unwrap(); inner.check_writeable()?; // check that this we truncate to a smaller size than segment was before the truncation @@ -479,14 +495,15 @@ impl InMemoryLayer { pub fn drop_segment(&self, lsn: Lsn) -> WriteResult<()> { self.assert_not_frozen(); - let mut inner = self.inner.lock().unwrap(); + let mut inner = self.inner.write().unwrap(); inner.check_writeable()?; assert!(inner.drop_lsn.is_none()); inner.drop_lsn = Some(lsn); + inner.writeable = false; - info!("dropped segment {} at {}", self.seg, lsn); + trace!("dropped segment {} at {}", self.seg, lsn); Ok(()) } @@ -505,6 +522,9 @@ impl InMemoryLayer { ) -> Result { let seg = src.get_seg_tag(); + assert!(oldest_pending_lsn.is_aligned()); + assert!(oldest_pending_lsn >= start_lsn); + trace!( "initializing new InMemoryLayer for writing {} on timeline {} at {}", seg, @@ -512,7 +532,7 @@ impl InMemoryLayer { start_lsn, ); - // For convenience, copy the segment size from the predecessor layer + // Copy the segment size at the start LSN from the predecessor layer. let mut segsizes = BTreeMap::new(); if seg.rel.is_blocky() { let size = src.get_seg_size(start_lsn)?; @@ -527,75 +547,83 @@ impl InMemoryLayer { start_lsn, end_lsn: None, oldest_pending_lsn, - inner: Mutex::new(InMemoryLayerInner { + incremental: true, + inner: RwLock::new(InMemoryLayerInner { drop_lsn: None, page_versions: BTreeMap::new(), segsizes, writeable: true, - predecessor: Some(src), }), }) } + pub fn is_writeable(&self) -> bool { + let inner = self.inner.read().unwrap(); + inner.writeable + } + /// Splits `self` into two InMemoryLayers: `frozen` and `open`. - /// All data up to and including `cutoff_lsn` (or the drop LSN, if dropped) + /// All data up to and including `cutoff_lsn` /// is copied to `frozen`, while the remaining data is copied to `open`. /// After completion, self is non-writeable, but not frozen. - pub fn freeze(&self, cutoff_lsn: Lsn) -> Result { + pub fn freeze(self: Arc, cutoff_lsn: Lsn) -> Result { info!( - "freezing in memory layer for {} on timeline {} at {}", - self.seg, self.timelineid, cutoff_lsn + "freezing in memory layer {} on timeline {} at {} (oldest {})", + self.filename().display(), + self.timelineid, + cutoff_lsn, + self.oldest_pending_lsn ); self.assert_not_frozen(); - let mut inner = self.inner.lock().unwrap(); + let self_ref = self.clone(); + let mut inner = self_ref.inner.write().unwrap(); + // Dropped layers don't need any special freeze actions, + // they are marked as non-writeable at drop and just + // written out to disk by checkpointer. + if inner.drop_lsn.is_some() { + assert!(!inner.writeable); + info!( + "freezing in memory layer for {} on timeline {} is dropped at {}", + self.seg, + self.timelineid, + inner.drop_lsn.unwrap() + ); + + // There should be no newer layer that refers this non-writeable layer, + // because layer that is created after dropped one represents a new rel. + return Ok(FreezeLayers { + frozen: self, + open: None, + }); + } assert!(inner.writeable); inner.writeable = false; - // Normally, use the cutoff LSN as the end of the frozen layer. - // But if the relation was dropped, we know that there are no - // more changes coming in for it, and in particular we know that - // there are no changes "in flight" for the LSN anymore, so we use - // the drop LSN instead. The drop-LSN could be ahead of the - // caller-specified LSN! - let dropped = inner.drop_lsn.is_some(); - let end_lsn = if dropped { - inner.drop_lsn.unwrap() - } else { - cutoff_lsn - }; - - // Divide all the page versions into old and new at the 'end_lsn' cutoff point. - let mut before_page_versions; - let mut before_segsizes; - let mut after_page_versions; - let mut after_segsizes; - if !dropped { - before_segsizes = BTreeMap::new(); - after_segsizes = BTreeMap::new(); - for (lsn, size) in inner.segsizes.iter() { - if *lsn > end_lsn { - after_segsizes.insert(*lsn, *size); - } else { - before_segsizes.insert(*lsn, *size); - } + // Divide all the page versions into old and new + // at the 'cutoff_lsn' point. + let mut before_segsizes = BTreeMap::new(); + let mut after_segsizes = BTreeMap::new(); + let mut after_oldest_lsn: Accum = Accum(None); + for (lsn, size) in inner.segsizes.iter() { + if *lsn > cutoff_lsn { + after_segsizes.insert(*lsn, *size); + after_oldest_lsn.accum(min, *lsn); + } else { + before_segsizes.insert(*lsn, *size); } + } - before_page_versions = BTreeMap::new(); - after_page_versions = BTreeMap::new(); - for ((blknum, lsn), pv) in inner.page_versions.iter() { - if *lsn > end_lsn { - after_page_versions.insert((*blknum, *lsn), pv.clone()); - } else { - before_page_versions.insert((*blknum, *lsn), pv.clone()); - } + let mut before_page_versions = BTreeMap::new(); + let mut after_page_versions = BTreeMap::new(); + for ((blknum, lsn), pv) in inner.page_versions.iter() { + if *lsn > cutoff_lsn { + after_page_versions.insert((*blknum, *lsn), pv.clone()); + after_oldest_lsn.accum(min, *lsn); + } else { + before_page_versions.insert((*blknum, *lsn), pv.clone()); } - } else { - before_page_versions = inner.page_versions.clone(); - before_segsizes = inner.segsizes.clone(); - after_segsizes = BTreeMap::new(); - after_page_versions = BTreeMap::new(); } let frozen = Arc::new(InMemoryLayer { @@ -604,25 +632,25 @@ impl InMemoryLayer { timelineid: self.timelineid, seg: self.seg, start_lsn: self.start_lsn, - end_lsn: Some(end_lsn), + end_lsn: Some(cutoff_lsn), oldest_pending_lsn: self.start_lsn, - inner: Mutex::new(InMemoryLayerInner { + incremental: self.incremental, + inner: RwLock::new(InMemoryLayerInner { drop_lsn: inner.drop_lsn, page_versions: before_page_versions, segsizes: before_segsizes, writeable: false, - predecessor: inner.predecessor.clone(), }), }); - let open = if !dropped && (!after_segsizes.is_empty() || !after_page_versions.is_empty()) { + let open = if !after_segsizes.is_empty() || !after_page_versions.is_empty() { let mut new_open = Self::create_successor_layer( self.conf, frozen.clone(), self.timelineid, self.tenantid, - end_lsn, - end_lsn, + cutoff_lsn + 1, + after_oldest_lsn.0.unwrap(), )?; let new_inner = new_open.inner.get_mut().unwrap(); @@ -634,7 +662,6 @@ impl InMemoryLayer { None }; - // TODO could we avoid creating the `frozen` if it contains no data Ok(FreezeLayers { frozen, open }) } @@ -647,35 +674,59 @@ impl InMemoryLayer { /// when a new relish is created with a single LSN, so that the start and /// end LSN are the same.) pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result>> { - let end_lsn = self.end_lsn.expect("can only write frozen layers to disk"); + trace!( + "write_to_disk {} end_lsn is {} get_end_lsn is {}", + self.filename().display(), + self.end_lsn.unwrap_or(Lsn(0)), + self.get_end_lsn() + ); - let inner = self.inner.lock().unwrap(); + // Grab the lock in read-mode. We hold it over the I/O, but because this + // layer is not writeable anymore, no one should be trying to aquire the + // write lock on it, so we shouldn't block anyone. There's one exception + // though: another thread might have grabbed a reference to this layer + // in `get_layer_for_write' just before the checkpointer called + // `freeze`, and then `write_to_disk` on it. When the thread gets the + // lock, it will see that it's not writeable anymore and retry, but it + // would have to wait until we release it. That race condition is very + // rare though, so we just accept the potential latency hit for now. + let inner = self.inner.read().unwrap(); + assert!(!inner.writeable); - let drop_lsn = inner.drop_lsn; - let predecessor = inner.predecessor.clone(); - - let mut before_page_versions; - let mut before_segsizes; - if inner.drop_lsn.is_none() { - before_segsizes = BTreeMap::new(); - for (lsn, size) in inner.segsizes.iter() { - if *lsn <= end_lsn { - before_segsizes.insert(*lsn, *size); - } - } - - before_page_versions = BTreeMap::new(); - for ((blknum, lsn), pv) in inner.page_versions.iter() { - if *lsn < end_lsn { - before_page_versions.insert((*blknum, *lsn), pv.clone()); - } - } - } else { - before_page_versions = inner.page_versions.clone(); - before_segsizes = inner.segsizes.clone(); + if let Some(drop_lsn) = inner.drop_lsn { + let delta_layer = DeltaLayer::create( + self.conf, + self.timelineid, + self.tenantid, + self.seg, + self.start_lsn, + drop_lsn, + true, + inner.page_versions.iter(), + inner.segsizes.clone(), + )?; + trace!( + "freeze: created delta layer for dropped segment {} {}-{}", + self.seg, + self.start_lsn, + drop_lsn + ); + return Ok(vec![Arc::new(delta_layer)]); } - drop(inner); + let end_lsn = self.end_lsn.unwrap(); + + let mut before_segsizes = BTreeMap::new(); + for (lsn, size) in inner.segsizes.iter() { + if *lsn <= end_lsn { + before_segsizes.insert(*lsn, *size); + } + } + let mut before_page_versions = inner.page_versions.iter().filter(|tup| { + let ((_blknum, lsn), _pv) = tup; + + *lsn < end_lsn + }); let mut frozen_layers: Vec> = Vec::new(); @@ -688,8 +739,7 @@ impl InMemoryLayer { self.seg, self.start_lsn, end_lsn, - drop_lsn.is_some(), - predecessor, + false, before_page_versions, before_segsizes, )?; @@ -701,21 +751,16 @@ impl InMemoryLayer { end_lsn ); } else { - assert!(before_page_versions.is_empty()); + assert!(before_page_versions.next().is_none()); } - if drop_lsn.is_none() { - // Write a new base image layer at the cutoff point - let image_layer = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?; - frozen_layers.push(Arc::new(image_layer)); - trace!("freeze: created image layer {} at {}", self.seg, end_lsn); - } + drop(inner); + + // Write a new base image layer at the cutoff point + let image_layer = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?; + frozen_layers.push(Arc::new(image_layer)); + trace!("freeze: created image layer {} at {}", self.seg, end_lsn); Ok(frozen_layers) } - - pub fn update_predecessor(&self, predecessor: Arc) -> Option> { - let mut inner = self.inner.lock().unwrap(); - inner.predecessor.replace(predecessor) - } } diff --git a/pageserver/src/layered_repository/interval_tree.rs b/pageserver/src/layered_repository/interval_tree.rs new file mode 100644 index 0000000000..978ecd837e --- /dev/null +++ b/pageserver/src/layered_repository/interval_tree.rs @@ -0,0 +1,468 @@ +/// +/// IntervalTree is data structure for holding intervals. It is generic +/// to make unit testing possible, but the only real user of it is the layer map, +/// +/// It's inspired by the "segment tree" or a "statistic tree" as described in +/// https://en.wikipedia.org/wiki/Segment_tree. However, we use a B-tree to hold +/// the points instead of a binary tree. This is called an "interval tree" instead +/// of "segment tree" because the term "segment" is already using Zenith to mean +/// something else. To add to the confusion, there is another data structure known +/// as "interval tree" out there (see https://en.wikipedia.org/wiki/Interval_tree), +/// for storing intervals, but this isn't that. +/// +/// The basic idea is to have a B-tree of "interesting Points". At each Point, +/// there is a list of intervals that contain the point. The Points are formed +/// from the start bounds of each interval; there is a Point for each distinct +/// start bound. +/// +/// Operations: +/// +/// To find intervals that contain a given point, you search the b-tree to find +/// the nearest Point <= search key. Then you just return the list of intervals. +/// +/// To insert an interval, find the Point with start key equal to the inserted item. +/// If the Point doesn't exist yet, create it, by copying all the items from the +/// previous Point that cover the new Point. Then walk right, inserting the new +/// interval to all the Points that are contained by the new interval (including the +/// newly created Point). +/// +/// To remove an interval, you scan the tree for all the Points that are contained by +/// the removed interval, and remove it from the list in each Point. +/// +/// Requirements and assumptions: +/// +/// - Can store overlapping items +/// - But there are not many overlapping items +/// - The interval bounds don't change after it is added to the tree +/// - Intervals are uniquely identified by pointer equality. You must not be insert the +/// same interval object twice, and `remove` uses pointer equality to remove the right +/// interval. It is OK to have two intervals with the same bounds, however. +/// +use std::collections::BTreeMap; +use std::fmt::Debug; +use std::ops::Range; +use std::sync::Arc; + +pub struct IntervalTree +where + I: IntervalItem, +{ + points: BTreeMap>, +} + +struct Point { + /// All intervals that contain this point, in no particular order. + /// + /// We assume that there aren't a lot of overlappingg intervals, so that this vector + /// never grows very large. If that assumption doesn't hold, we could keep this ordered + /// by the end bound, to speed up `search`. But as long as there are only a few elements, + /// a linear search is OK. + elements: Vec>, +} + +/// Abstraction for an interval that can be stored in the tree +/// +/// The start bound is inclusive and the end bound is exclusive. End must be greater +/// than start. +pub trait IntervalItem { + type Key: Ord + Copy + Debug + Sized; + + fn start_key(&self) -> Self::Key; + fn end_key(&self) -> Self::Key; + + fn bounds(&self) -> Range { + self.start_key()..self.end_key() + } +} + +impl IntervalTree +where + I: IntervalItem, +{ + /// Return an element that contains 'key', or precedes it. + /// + /// If there are multiple candidates, returns the one with the highest 'end' key. + pub fn search(&self, key: I::Key) -> Option> { + // Find the greatest point that precedes or is equal to the search key. If there is + // none, returns None. + let (_, p) = self.points.range(..=key).next_back()?; + + // Find the element with the highest end key at this point + let highest_item = p + .elements + .iter() + .reduce(|a, b| { + // starting with Rust 1.53, could use `std::cmp::min_by_key` here + if a.end_key() > b.end_key() { + a + } else { + b + } + }) + .unwrap(); + Some(Arc::clone(highest_item)) + } + + /// Iterate over all items with start bound >= 'key' + pub fn iter_newer(&self, key: I::Key) -> IntervalIter { + IntervalIter { + point_iter: self.points.range(key..), + elem_iter: None, + } + } + + /// Iterate over all items + pub fn iter(&self) -> IntervalIter { + IntervalIter { + point_iter: self.points.range(..), + elem_iter: None, + } + } + + pub fn insert(&mut self, item: Arc) { + let start_key = item.start_key(); + let end_key = item.end_key(); + assert!(start_key < end_key); + let bounds = start_key..end_key; + + // Find the starting point and walk forward from there + let mut found_start_point = false; + let iter = self.points.range_mut(bounds); + for (point_key, point) in iter { + if *point_key == start_key { + found_start_point = true; + // It is an error to insert the same item to the tree twice. + assert!( + !point.elements.iter().any(|x| Arc::ptr_eq(x, &item)), + "interval is already in the tree" + ); + } + point.elements.push(Arc::clone(&item)); + } + if !found_start_point { + // Create a new Point for the starting point + + // Look at the previous point, and copy over elements that overlap with this + // new point + let mut new_elements: Vec> = Vec::new(); + if let Some((_, prev_point)) = self.points.range(..start_key).next_back() { + let overlapping_prev_elements = prev_point + .elements + .iter() + .filter(|x| x.bounds().contains(&start_key)) + .cloned(); + + new_elements.extend(overlapping_prev_elements); + } + new_elements.push(item); + + let new_point = Point { + elements: new_elements, + }; + self.points.insert(start_key, new_point); + } + } + + pub fn remove(&mut self, item: &Arc) { + // range search points + let start_key = item.start_key(); + let end_key = item.end_key(); + let bounds = start_key..end_key; + + let mut points_to_remove: Vec = Vec::new(); + let mut found_start_point = false; + for (point_key, point) in self.points.range_mut(bounds) { + if *point_key == start_key { + found_start_point = true; + } + let len_before = point.elements.len(); + point.elements.retain(|other| !Arc::ptr_eq(other, item)); + let len_after = point.elements.len(); + assert_eq!(len_after + 1, len_before); + if len_after == 0 { + points_to_remove.push(*point_key); + } + } + assert!(found_start_point); + + for k in points_to_remove { + self.points.remove(&k).unwrap(); + } + } +} + +pub struct IntervalIter<'a, I: ?Sized> +where + I: IntervalItem, +{ + point_iter: std::collections::btree_map::Range<'a, I::Key, Point>, + elem_iter: Option<(I::Key, std::slice::Iter<'a, Arc>)>, +} + +impl<'a, I> Iterator for IntervalIter<'a, I> +where + I: IntervalItem + ?Sized, +{ + type Item = Arc; + + fn next(&mut self) -> Option { + // Iterate over all elements in all the points in 'point_iter'. To avoid + // returning the same element twice, we only return each element at its + // starting point. + loop { + // Return next remaining element from the current point + if let Some((point_key, elem_iter)) = &mut self.elem_iter { + for elem in elem_iter { + if elem.start_key() == *point_key { + return Some(Arc::clone(elem)); + } + } + } + // No more elements at this point. Move to next point. + if let Some((point_key, point)) = self.point_iter.next() { + self.elem_iter = Some((*point_key, point.elements.iter())); + continue; + } else { + // No more points, all done + return None; + } + } + } +} + +impl Default for IntervalTree +where + I: IntervalItem, +{ + fn default() -> Self { + IntervalTree { + points: BTreeMap::new(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fmt; + + #[derive(Debug)] + struct MockItem { + start_key: u32, + end_key: u32, + val: String, + } + impl IntervalItem for MockItem { + type Key = u32; + + fn start_key(&self) -> u32 { + self.start_key + } + fn end_key(&self) -> u32 { + self.end_key + } + } + impl MockItem { + fn new(start_key: u32, end_key: u32) -> Self { + MockItem { + start_key, + end_key, + val: format!("{}-{}", start_key, end_key), + } + } + fn new_str(start_key: u32, end_key: u32, val: &str) -> Self { + MockItem { + start_key, + end_key, + val: format!("{}-{}: {}", start_key, end_key, val), + } + } + } + impl fmt::Display for MockItem { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.val) + } + } + #[rustfmt::skip] + fn assert_search( + tree: &IntervalTree, + key: u32, + expected: &[&str], + ) -> Option> { + if let Some(v) = tree.search(key) { + let vstr = v.to_string(); + + assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v); + assert!( + expected.contains(&vstr.as_str()), + "search with {} returned {}, expected one of: {:?}", + key, v, expected, + ); + + Some(v) + } else { + assert!( + expected.is_empty(), + "search with {} returned None, expected one of {:?}", + key, expected + ); + None + } + } + + fn assert_contents(tree: &IntervalTree, expected: &[&str]) { + let mut contents: Vec = tree.iter().map(|e| e.to_string()).collect(); + contents.sort(); + assert_eq!(contents, expected); + } + + fn dump_tree(tree: &IntervalTree) { + for (point_key, point) in tree.points.iter() { + print!("{}:", point_key); + for e in point.elements.iter() { + print!(" {}", e); + } + println!(); + } + } + + #[test] + fn test_interval_tree_simple() { + let mut tree: IntervalTree = IntervalTree::default(); + + // Simple, non-overlapping ranges. + tree.insert(Arc::new(MockItem::new(10, 11))); + tree.insert(Arc::new(MockItem::new(11, 12))); + tree.insert(Arc::new(MockItem::new(12, 13))); + tree.insert(Arc::new(MockItem::new(18, 19))); + tree.insert(Arc::new(MockItem::new(17, 18))); + tree.insert(Arc::new(MockItem::new(15, 16))); + + assert_search(&tree, 9, &[]); + assert_search(&tree, 10, &["10-11"]); + assert_search(&tree, 11, &["11-12"]); + assert_search(&tree, 12, &["12-13"]); + assert_search(&tree, 13, &["12-13"]); + assert_search(&tree, 14, &["12-13"]); + assert_search(&tree, 15, &["15-16"]); + assert_search(&tree, 16, &["15-16"]); + assert_search(&tree, 17, &["17-18"]); + assert_search(&tree, 18, &["18-19"]); + assert_search(&tree, 19, &["18-19"]); + assert_search(&tree, 20, &["18-19"]); + + // remove a few entries and search around them again + tree.remove(&assert_search(&tree, 10, &["10-11"]).unwrap()); // first entry + tree.remove(&assert_search(&tree, 12, &["12-13"]).unwrap()); // entry in the middle + tree.remove(&assert_search(&tree, 18, &["18-19"]).unwrap()); // last entry + assert_search(&tree, 9, &[]); + assert_search(&tree, 10, &[]); + assert_search(&tree, 11, &["11-12"]); + assert_search(&tree, 12, &["11-12"]); + assert_search(&tree, 14, &["11-12"]); + assert_search(&tree, 15, &["15-16"]); + assert_search(&tree, 17, &["17-18"]); + assert_search(&tree, 18, &["17-18"]); + } + + #[test] + fn test_interval_tree_overlap() { + let mut tree: IntervalTree = IntervalTree::default(); + + // Overlapping items + tree.insert(Arc::new(MockItem::new(22, 24))); + tree.insert(Arc::new(MockItem::new(23, 25))); + let x24_26 = Arc::new(MockItem::new(24, 26)); + tree.insert(Arc::clone(&x24_26)); + let x26_28 = Arc::new(MockItem::new(26, 28)); + tree.insert(Arc::clone(&x26_28)); + tree.insert(Arc::new(MockItem::new(25, 27))); + + assert_search(&tree, 22, &["22-24"]); + assert_search(&tree, 23, &["22-24", "23-25"]); + assert_search(&tree, 24, &["23-25", "24-26"]); + assert_search(&tree, 25, &["24-26", "25-27"]); + assert_search(&tree, 26, &["25-27", "26-28"]); + assert_search(&tree, 27, &["26-28"]); + assert_search(&tree, 28, &["26-28"]); + assert_search(&tree, 29, &["26-28"]); + + tree.remove(&x24_26); + tree.remove(&x26_28); + assert_search(&tree, 23, &["22-24", "23-25"]); + assert_search(&tree, 24, &["23-25"]); + assert_search(&tree, 25, &["25-27"]); + assert_search(&tree, 26, &["25-27"]); + assert_search(&tree, 27, &["25-27"]); + assert_search(&tree, 28, &["25-27"]); + assert_search(&tree, 29, &["25-27"]); + } + + #[test] + fn test_interval_tree_nested() { + let mut tree: IntervalTree = IntervalTree::default(); + + // Items containing other items + tree.insert(Arc::new(MockItem::new(31, 39))); + tree.insert(Arc::new(MockItem::new(32, 34))); + tree.insert(Arc::new(MockItem::new(33, 35))); + tree.insert(Arc::new(MockItem::new(30, 40))); + + assert_search(&tree, 30, &["30-40"]); + assert_search(&tree, 31, &["30-40", "31-39"]); + assert_search(&tree, 32, &["30-40", "32-34", "31-39"]); + assert_search(&tree, 33, &["30-40", "32-34", "33-35", "31-39"]); + assert_search(&tree, 34, &["30-40", "33-35", "31-39"]); + assert_search(&tree, 35, &["30-40", "31-39"]); + assert_search(&tree, 36, &["30-40", "31-39"]); + assert_search(&tree, 37, &["30-40", "31-39"]); + assert_search(&tree, 38, &["30-40", "31-39"]); + assert_search(&tree, 39, &["30-40"]); + assert_search(&tree, 40, &["30-40"]); + assert_search(&tree, 41, &["30-40"]); + } + + #[test] + fn test_interval_tree_duplicates() { + let mut tree: IntervalTree = IntervalTree::default(); + + // Duplicate keys + let item_a = Arc::new(MockItem::new_str(55, 56, "a")); + tree.insert(Arc::clone(&item_a)); + let item_b = Arc::new(MockItem::new_str(55, 56, "b")); + tree.insert(Arc::clone(&item_b)); + let item_c = Arc::new(MockItem::new_str(55, 56, "c")); + tree.insert(Arc::clone(&item_c)); + let item_d = Arc::new(MockItem::new_str(54, 56, "d")); + tree.insert(Arc::clone(&item_d)); + let item_e = Arc::new(MockItem::new_str(55, 57, "e")); + tree.insert(Arc::clone(&item_e)); + + dump_tree(&tree); + + assert_search( + &tree, + 55, + &["55-56: a", "55-56: b", "55-56: c", "54-56: d", "55-57: e"], + ); + tree.remove(&item_b); + dump_tree(&tree); + + assert_contents(&tree, &["54-56: d", "55-56: a", "55-56: c", "55-57: e"]); + + tree.remove(&item_d); + dump_tree(&tree); + assert_contents(&tree, &["55-56: a", "55-56: c", "55-57: e"]); + } + + #[test] + #[should_panic] + fn test_interval_tree_insert_twice() { + let mut tree: IntervalTree = IntervalTree::default(); + + // Inserting the same item twice is not cool + let item = Arc::new(MockItem::new(1, 2)); + tree.insert(Arc::clone(&item)); + tree.insert(Arc::clone(&item)); // fails assertion + } +} diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 50114efeeb..c5643e896b 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -9,13 +9,14 @@ //! new image and delta layers and corresponding files are written to disk. //! +use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree}; use crate::layered_repository::storage_layer::{Layer, SegmentTag}; use crate::layered_repository::InMemoryLayer; use crate::relish::*; use anyhow::Result; use lazy_static::lazy_static; use std::cmp::Ordering; -use std::collections::{BTreeMap, BinaryHeap, HashMap}; +use std::collections::{BinaryHeap, HashMap}; use std::sync::Arc; use zenith_metrics::{register_int_gauge, IntGauge}; use zenith_utils::lsn::Lsn; @@ -32,6 +33,7 @@ lazy_static! { /// /// LayerMap tracks what layers exist on a timeline. /// +#[derive(Default)] pub struct LayerMap { /// All the layers keyed by segment tag segs: HashMap, @@ -46,16 +48,6 @@ pub struct LayerMap { current_generation: u64, } -impl Default for LayerMap { - fn default() -> Self { - LayerMap { - segs: HashMap::new(), - open_layers: BinaryHeap::new(), - current_generation: 0, - } - } -} - impl LayerMap { /// /// Look up a layer using the given segment tag and LSN. This differs from a @@ -85,7 +77,14 @@ impl LayerMap { pub fn insert_open(&mut self, layer: Arc) { let segentry = self.segs.entry(layer.get_seg_tag()).or_default(); - segentry.insert_open(Arc::clone(&layer)); + segentry.update_open(Arc::clone(&layer)); + + let oldest_pending_lsn = layer.get_oldest_pending_lsn(); + + // After a crash and restart, 'oldest_pending_lsn' of the oldest in-memory + // layer becomes the WAL streaming starting point, so it better not point + // in the middle of a WAL record. + assert!(oldest_pending_lsn.is_aligned()); // Also add it to the binary heap let open_layer_entry = OpenLayerEntry { @@ -106,11 +105,14 @@ impl LayerMap { // Also remove it from the SegEntry of this segment let mut segentry = self.segs.get_mut(&segtag).unwrap(); - assert!(Arc::ptr_eq( - segentry.open.as_ref().unwrap(), - &oldest_entry.layer - )); - segentry.open = None; + if Arc::ptr_eq(segentry.open.as_ref().unwrap(), &oldest_entry.layer) { + segentry.open = None; + } else { + // We could have already updated segentry.open for + // dropped (non-writeable) layer. This is fine. + assert!(!oldest_entry.layer.is_writeable()); + assert!(oldest_entry.layer.is_dropped()); + } NUM_INMEMORY_LAYERS.dec(); } @@ -130,12 +132,11 @@ impl LayerMap { /// /// This should be called when the corresponding file on disk has been deleted. /// - pub fn remove_historic(&mut self, layer: &dyn Layer) { + pub fn remove_historic(&mut self, layer: Arc) { let tag = layer.get_seg_tag(); - let start_lsn = layer.get_start_lsn(); if let Some(segentry) = self.segs.get_mut(&tag) { - segentry.historic.remove(&start_lsn); + segentry.historic.remove(&layer); } NUM_ONDISK_LAYERS.dec(); } @@ -153,7 +154,7 @@ impl LayerMap { if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode) && (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode) { - if let Some(exists) = segentry.exists_at_lsn(lsn) { + if let Some(exists) = segentry.exists_at_lsn(lsn)? { rels.insert(seg.rel, exists); } } @@ -161,7 +162,7 @@ impl LayerMap { } _ => { if tag == None { - if let Some(exists) = segentry.exists_at_lsn(lsn) { + if let Some(exists) = segentry.exists_at_lsn(lsn)? { rels.insert(seg.rel, exists); } } @@ -183,6 +184,20 @@ impl LayerMap { } } + /// Is there any layer for given segment that is alive at the lsn? + /// + /// This is a public wrapper for SegEntry fucntion, + /// used for garbage collection, to determine if some alive layer + /// exists at the lsn. If so, we shouldn't delete a newer dropped layer + /// to avoid incorrectly making it visible. + pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result { + Ok(if let Some(segentry) = self.segs.get(&seg) { + segentry.exists_at_lsn(lsn)?.unwrap_or(false) + } else { + false + }) + } + /// Return the oldest in-memory layer, along with its generation number. pub fn peek_oldest_open(&self) -> Option<(Arc, u64)> { self.open_layers @@ -200,7 +215,7 @@ impl LayerMap { pub fn iter_historic_layers(&self) -> HistoricLayerIter { HistoricLayerIter { - segiter: self.segs.iter(), + seg_iter: self.segs.iter(), iter: None, } } @@ -214,7 +229,7 @@ impl LayerMap { open.dump()?; } - for (_, layer) in segentry.historic.iter() { + for layer in segentry.historic.iter() { layer.dump()?; } } @@ -223,42 +238,40 @@ impl LayerMap { } } +impl IntervalItem for dyn Layer { + type Key = Lsn; + + fn start_key(&self) -> Lsn { + self.get_start_lsn() + } + fn end_key(&self) -> Lsn { + self.get_end_lsn() + } +} + /// /// Per-segment entry in the LayerMap::segs hash map. Holds all the layers /// associated with the segment. /// /// The last layer that is open for writes is always an InMemoryLayer, /// and is kept in a separate field, because there can be only one for -/// each segment. The older layers, stored on disk, are kept in a -/// BTreeMap keyed by the layer's start LSN. +/// each segment. The older layers, stored on disk, are kept in an +/// IntervalTree. +#[derive(Default)] struct SegEntry { - pub open: Option>, - pub historic: BTreeMap>, -} - -impl Default for SegEntry { - fn default() -> Self { - SegEntry { - open: None, - historic: BTreeMap::new(), - } - } + open: Option>, + historic: IntervalTree, } impl SegEntry { /// Does the segment exist at given LSN? /// Return None if object is not found in this SegEntry. - fn exists_at_lsn(&self, lsn: Lsn) -> Option { - if let Some(layer) = &self.open { - if layer.get_start_lsn() <= lsn && lsn <= layer.get_end_lsn() { - let exists = layer.get_seg_exists(lsn).ok()?; - return Some(exists); - } - } else if let Some((_, layer)) = self.historic.range(..=lsn).next_back() { - let exists = layer.get_seg_exists(lsn).ok()?; - return Some(exists); + fn exists_at_lsn(&self, lsn: Lsn) -> Result> { + if let Some(layer) = self.get(lsn) { + Ok(Some(layer.get_seg_exists(lsn)?)) + } else { + Ok(None) } - None } pub fn get(&self, lsn: Lsn) -> Option> { @@ -269,40 +282,30 @@ impl SegEntry { } } - if let Some((_start_lsn, layer)) = self.historic.range(..=lsn).next_back() { - Some(Arc::clone(layer)) - } else { - None - } + self.historic.search(lsn) } pub fn newer_image_layer_exists(&self, lsn: Lsn) -> bool { // We only check on-disk layers, because // in-memory layers are not durable - for (_newer_lsn, layer) in self.historic.range(lsn..) { - // Ignore incremental layers. - if layer.is_incremental() { - continue; - } - if layer.get_end_lsn() > lsn { - return true; - } else { - continue; - } - } - false + self.historic + .iter_newer(lsn) + .any(|layer| !layer.is_incremental()) } - pub fn insert_open(&mut self, layer: Arc) { - assert!(self.open.is_none()); + // Set new open layer for a SegEntry. + // It's ok to rewrite previous open layer, + // but only if it is not writeable anymore. + pub fn update_open(&mut self, layer: Arc) { + if let Some(prev_open) = &self.open { + assert!(!prev_open.is_writeable()); + } self.open = Some(layer); } pub fn insert_historic(&mut self, layer: Arc) { - let start_lsn = layer.get_start_lsn(); - - self.historic.insert(start_lsn, layer); + self.historic.insert(layer); } } @@ -341,8 +344,8 @@ impl Eq for OpenLayerEntry {} /// Iterator returned by LayerMap::iter_historic_layers() pub struct HistoricLayerIter<'a> { - segiter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>, - iter: Option>>, + seg_iter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>, + iter: Option>, } impl<'a> Iterator for HistoricLayerIter<'a> { @@ -352,11 +355,11 @@ impl<'a> Iterator for HistoricLayerIter<'a> { loop { if let Some(x) = &mut self.iter { if let Some(x) = x.next() { - return Some(Arc::clone(&*x.1)); + return Some(Arc::clone(&x)); } } - if let Some(seg) = self.segiter.next() { - self.iter = Some(seg.1.historic.iter()); + if let Some((_tag, segentry)) = self.seg_iter.next() { + self.iter = Some(segentry.historic.iter()); continue; } else { return None; @@ -411,14 +414,14 @@ mod tests { let mut layers = LayerMap::default(); let gen1 = layers.increment_generation(); - layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(100), Lsn(100))); - layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(100), Lsn(200))); - layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(100), Lsn(120))); - layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(100), Lsn(110))); + layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(0x100), Lsn(0x100))); + layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(0x100), Lsn(0x200))); + layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(0x100), Lsn(0x120))); + layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(0x100), Lsn(0x110))); let gen2 = layers.increment_generation(); - layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(100), Lsn(110))); - layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(100), Lsn(100))); + layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(0x100), Lsn(0x110))); + layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(0x100), Lsn(0x100))); // A helper function (closure) to pop the next oldest open entry from the layer map, // and assert that it is what we'd expect @@ -429,12 +432,12 @@ mod tests { layers.pop_oldest_open(); }; - assert_pop_layer(0, gen1); // 100 - assert_pop_layer(5, gen2); // 100 - assert_pop_layer(3, gen1); // 110 - assert_pop_layer(4, gen2); // 110 - assert_pop_layer(2, gen1); // 120 - assert_pop_layer(1, gen1); // 200 + assert_pop_layer(0, gen1); // 0x100 + assert_pop_layer(5, gen2); // 0x100 + assert_pop_layer(3, gen1); // 0x110 + assert_pop_layer(4, gen2); // 0x110 + assert_pop_layer(2, gen1); // 0x120 + assert_pop_layer(1, gen1); // 0x200 Ok(()) } diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index 73207514e0..a107d63b40 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -10,7 +10,6 @@ use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::fmt; use std::path::PathBuf; -use std::sync::Arc; use zenith_utils::lsn::Lsn; @@ -87,9 +86,9 @@ pub struct PageReconstructData { pub enum PageReconstructResult { /// Got all the data needed to reconstruct the requested page Complete, - /// This layer didn't contain all the required data, the caller should collect - /// more data from the returned predecessor layer at the returned LSN. - Continue(Lsn, Arc), + /// This layer didn't contain all the required data, the caller should look up + /// the predecessor layer at the returned LSN and collect more data from there. + Continue(Lsn), /// This layer didn't contain data needed to reconstruct the page version at /// the returned LSN. This is usually considered an error, but might be OK /// in some circumstances. @@ -111,15 +110,14 @@ pub trait Layer: Send + Sync { /// Identify the relish segment fn get_seg_tag(&self) -> SegmentTag; - /// Inclusive start bound of the LSN range that this layer hold + /// Inclusive start bound of the LSN range that this layer holds fn get_start_lsn(&self) -> Lsn; - /// 'end_lsn' meaning depends on the layer kind: - /// - in-memory layer is either unbounded (end_lsn = MAX_LSN) or dropped (end_lsn = drop_lsn) - /// - image layer represents snapshot at one LSN, so end_lsn = lsn - /// - delta layer has end_lsn + /// Exclusive end bound of the LSN range that this layer holds. /// - /// TODO Is end_lsn always exclusive for all layer kinds? + /// - For an open in-memory layer, this is MAX_LSN. + /// - For a frozen in-memory layer or a delta layer, this is a valid end bound. + /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 fn get_end_lsn(&self) -> Lsn; /// Is the segment represented by this layer dropped by PostgreSQL? @@ -146,8 +144,8 @@ pub trait Layer: Send + Sync { /// /// See PageReconstructResult for possible return values. The collected data /// is appended to reconstruct_data; the caller should pass an empty struct - /// on first call. If this returns PageReconstructResult::Continue, call - /// again on the returned predecessor layer with the same 'reconstruct_data' + /// on first call. If this returns PageReconstructResult::Continue, look up + /// the predecessor layer and call again with the same 'reconstruct_data' /// to collect more data. fn get_page_reconstruct_data( &self, diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index eff9b54897..9db7ceb1b4 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -13,7 +13,7 @@ pub mod http; pub mod layered_repository; pub mod page_service; pub mod relish; -mod relish_storage; +pub mod relish_storage; pub mod repository; pub mod restore_local_repo; pub mod tenant_mgr; @@ -22,12 +22,13 @@ pub mod walreceiver; pub mod walredo; pub mod defaults { + use const_format::formatcp; use std::time::Duration; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; - pub const DEFAULT_PG_LISTEN_ADDR: &str = "127.0.0.1:64000"; // can't format! const yet... + pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; - pub const DEFAULT_HTTP_LISTEN_ADDR: &str = "127.0.0.1:9898"; + pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB // would be more appropriate. But a low value forces the code to be exercised more, @@ -39,6 +40,7 @@ pub mod defaults { pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100); pub const DEFAULT_SUPERUSER: &str = "zenith_admin"; + pub const DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100; } lazy_static! { @@ -124,10 +126,6 @@ impl PageServerConf { self.timeline_path(timelineid, tenantid).join("ancestor") } - fn wal_dir_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf { - self.timeline_path(timelineid, tenantid).join("wal") - } - // // Postgres distribution paths // @@ -153,8 +151,8 @@ impl PageServerConf { checkpoint_period: Duration::from_secs(10), gc_horizon: defaults::DEFAULT_GC_HORIZON, gc_period: Duration::from_secs(10), - listen_pg_addr: "127.0.0.1:5430".to_string(), - listen_http_addr: "127.0.0.1:9898".to_string(), + listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), + listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), superuser: "zenith_admin".to_string(), workdir: repo_dir, pg_distrib_dir: "".into(), @@ -167,18 +165,37 @@ impl PageServerConf { /// External relish storage configuration, enough for creating a client for that storage. #[derive(Debug, Clone)] -pub enum RelishStorageConfig { - /// Root folder to place all stored relish data into. +pub struct RelishStorageConfig { + /// Limits the number of concurrent sync operations between pageserver and relish storage. + pub max_concurrent_sync: usize, + /// The storage connection configuration. + pub storage: RelishStorageKind, +} + +/// A kind of a relish storage to connect to, with its connection configuration. +#[derive(Debug, Clone)] +pub enum RelishStorageKind { + /// Storage based on local file system. + /// Specify a root folder to place all stored relish data into. LocalFs(PathBuf), + /// AWS S3 based storage, storing all relishes into the root + /// of the S3 bucket from the config. AwsS3(S3Config), } /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). #[derive(Clone)] pub struct S3Config { + /// Name of the bucket to connect to. pub bucket_name: String, + /// The region where the bucket is located at. pub bucket_region: String, + /// "Login" to use when connecting to bucket. + /// Can be empty for cases like AWS k8s IAM + /// where we can allow certain pods to connect + /// to the bucket directly without any credentials. pub access_key_id: Option, + /// "Password" to use when connecting to bucket. pub secret_access_key: Option, } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a8ea76dda3..d592a83993 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -194,7 +194,7 @@ pub fn thread_main( let local_auth = auth.clone(); thread::spawn(move || { if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) { - error!("error: {}", err); + error!("page server thread exiting with error: {:#}", err); } }); } @@ -293,7 +293,9 @@ impl PageServerHandler { }; let response = response.unwrap_or_else(|e| { - error!("error reading relation or page version: {}", e); + // print the all details to the log with {:#}, but for the client the + // error message is enough + error!("error reading relation or page version: {:#}", e); PagestreamBeMessage::Error(PagestreamErrorResponse { message: e.to_string(), }) @@ -654,12 +656,14 @@ impl postgres_backend::Handler for PageServerHandler { RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"), RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"), RowDescriptor::int8_col(b"layer_relfiles_not_updated"), + RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"), RowDescriptor::int8_col(b"layer_relfiles_removed"), RowDescriptor::int8_col(b"layer_relfiles_dropped"), RowDescriptor::int8_col(b"layer_nonrelfiles_total"), RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"), RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"), RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"), + RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"), RowDescriptor::int8_col(b"layer_nonrelfiles_removed"), RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"), RowDescriptor::int8_col(b"elapsed"), @@ -679,6 +683,12 @@ impl postgres_backend::Handler for PageServerHandler { .as_bytes(), ), Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()), + Some( + result + .ondisk_relfiles_needed_as_tombstone + .to_string() + .as_bytes(), + ), Some(result.ondisk_relfiles_removed.to_string().as_bytes()), Some(result.ondisk_relfiles_dropped.to_string().as_bytes()), Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()), @@ -695,6 +705,12 @@ impl postgres_backend::Handler for PageServerHandler { .as_bytes(), ), Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()), + Some( + result + .ondisk_nonrelfiles_needed_as_tombstone + .to_string() + .as_bytes(), + ), Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()), Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()), Some(result.elapsed.as_millis().to_string().as_bytes()), diff --git a/pageserver/src/relish_storage.rs b/pageserver/src/relish_storage.rs index 11e73711a3..a687abe489 100644 --- a/pageserver/src/relish_storage.rs +++ b/pageserver/src/relish_storage.rs @@ -8,14 +8,43 @@ mod local_fs; mod rust_s3; -/// A queue and the background machinery behind it to upload -/// local page server layer files to external storage. -pub mod storage_uploader; +/// A queue-based storage with the background machinery behind it to synchronize +/// local page server layer files with external storage. +mod synced_storage; use std::path::Path; +use std::thread; use anyhow::Context; +use self::local_fs::LocalFs; +pub use self::synced_storage::schedule_timeline_upload; +use crate::relish_storage::rust_s3::RustS3; +use crate::{PageServerConf, RelishStorageKind}; + +pub fn run_storage_sync_thread( + config: &'static PageServerConf, +) -> anyhow::Result>>> { + match &config.relish_storage_config { + Some(relish_storage_config) => { + let max_concurrent_sync = relish_storage_config.max_concurrent_sync; + match &relish_storage_config.storage { + RelishStorageKind::LocalFs(root) => synced_storage::run_storage_sync_thread( + config, + LocalFs::new(root.clone())?, + max_concurrent_sync, + ), + RelishStorageKind::AwsS3(s3_config) => synced_storage::run_storage_sync_thread( + config, + RustS3::new(s3_config)?, + max_concurrent_sync, + ), + } + } + None => Ok(None), + } +} + /// Storage (potentially remote) API to manage its state. #[async_trait::async_trait] pub trait RelishStorage: Send + Sync { diff --git a/pageserver/src/relish_storage/storage_uploader.rs b/pageserver/src/relish_storage/storage_uploader.rs deleted file mode 100644 index 2d0889173d..0000000000 --- a/pageserver/src/relish_storage/storage_uploader.rs +++ /dev/null @@ -1,116 +0,0 @@ -use std::{ - collections::VecDeque, - path::{Path, PathBuf}, - sync::{Arc, Mutex}, - thread, -}; - -use zenith_utils::zid::ZTimelineId; - -use crate::{relish_storage::RelishStorage, RelishStorageConfig}; - -use super::{local_fs::LocalFs, rust_s3::RustS3}; - -pub struct QueueBasedRelishUploader { - upload_queue: Arc>>, -} - -impl QueueBasedRelishUploader { - pub fn new( - config: &RelishStorageConfig, - page_server_workdir: &'static Path, - ) -> anyhow::Result { - let upload_queue = Arc::new(Mutex::new(VecDeque::new())); - let _handle = match config { - RelishStorageConfig::LocalFs(root) => { - let relish_storage = LocalFs::new(root.clone())?; - create_upload_thread( - Arc::clone(&upload_queue), - relish_storage, - page_server_workdir, - )? - } - RelishStorageConfig::AwsS3(s3_config) => { - let relish_storage = RustS3::new(s3_config)?; - create_upload_thread( - Arc::clone(&upload_queue), - relish_storage, - page_server_workdir, - )? - } - }; - - Ok(Self { upload_queue }) - } - - pub fn schedule_upload(&self, timeline_id: ZTimelineId, relish_path: PathBuf) { - self.upload_queue - .lock() - .unwrap() - .push_back((timeline_id, relish_path)) - } -} - -fn create_upload_thread>( - upload_queue: Arc>>, - relish_storage: S, - page_server_workdir: &'static Path, -) -> std::io::Result> { - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; - thread::Builder::new() - .name("Queue based relish uploader".to_string()) - .spawn(move || loop { - runtime.block_on(async { - upload_loop_step(&upload_queue, &relish_storage, page_server_workdir).await; - }) - }) -} - -async fn upload_loop_step>( - upload_queue: &Mutex>, - relish_storage: &S, - page_server_workdir: &Path, -) { - let mut queue_accessor = upload_queue.lock().unwrap(); - log::debug!("current upload queue length: {}", queue_accessor.len()); - let next_upload = queue_accessor.pop_front(); - drop(queue_accessor); - - let (relish_timeline_id, relish_local_path) = match next_upload { - Some(data) => data, - None => { - // Don't spin and allow others to use the queue. - // In future, could be improved to be more clever about delays depending on relish upload stats - thread::sleep(std::time::Duration::from_secs(1)); - return; - } - }; - - if let Err(e) = upload_relish(relish_storage, page_server_workdir, &relish_local_path).await { - log::error!( - "Failed to upload relish '{}' for timeline {}, reason: {}", - relish_local_path.display(), - relish_timeline_id, - e - ); - upload_queue - .lock() - .unwrap() - .push_back((relish_timeline_id, relish_local_path)) - } else { - log::debug!("Relish successfully uploaded"); - } -} - -async fn upload_relish>( - relish_storage: &S, - page_server_workdir: &Path, - relish_local_path: &Path, -) -> anyhow::Result<()> { - let destination = S::derive_destination(page_server_workdir, relish_local_path)?; - relish_storage - .upload_relish(relish_local_path, &destination) - .await -} diff --git a/pageserver/src/relish_storage/synced_storage.rs b/pageserver/src/relish_storage/synced_storage.rs new file mode 100644 index 0000000000..f51e976a83 --- /dev/null +++ b/pageserver/src/relish_storage/synced_storage.rs @@ -0,0 +1,52 @@ +use std::time::Duration; +use std::{collections::BinaryHeap, sync::Mutex, thread}; + +use crate::{relish_storage::RelishStorage, PageServerConf}; + +lazy_static::lazy_static! { + static ref UPLOAD_QUEUE: Mutex> = Mutex::new(BinaryHeap::new()); +} + +pub fn schedule_timeline_upload(_local_timeline: ()) { + // UPLOAD_QUEUE + // .lock() + // .unwrap() + // .push(SyncTask::Upload(local_timeline)) +} + +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +enum SyncTask {} + +pub fn run_storage_sync_thread< + P: std::fmt::Debug, + S: 'static + RelishStorage, +>( + config: &'static PageServerConf, + relish_storage: S, + max_concurrent_sync: usize, +) -> anyhow::Result>>> { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; + + let handle = thread::Builder::new() + .name("Queue based relish storage sync".to_string()) + .spawn(move || loop { + let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap(); + log::debug!("Upload queue length: {}", queue_accessor.len()); + let next_task = queue_accessor.pop(); + drop(queue_accessor); + match next_task { + Some(task) => runtime.block_on(async { + // suppress warnings + let _ = (config, task, &relish_storage, max_concurrent_sync); + todo!("omitted for brevity") + }), + None => { + thread::sleep(Duration::from_secs(1)); + continue; + } + } + })?; + Ok(Some(handle)) +} diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 1efbb37d0c..fa6a3e83e0 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -28,18 +28,14 @@ pub trait Repository: Send + Sync { /// /// 'timelineid' specifies the timeline to GC, or None for all. /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). - /// `compact` parameter is used to force compaction of storage. - /// some storage implementation are based on lsm tree and require periodic merge (compaction). - /// usually storage implementation determines itself when compaction should be performed. - /// but for gc tests it way be useful to force compaction just after completion of gc iteration - /// to make sure that all detected garbage is removed. - /// so right now `compact` is set to true when gc explicitly requested through page srver api, - /// and is st to false in gc threads which infinitely repeats gc iterations in loop. + /// `checkpoint_before_gc` parameter is used to force compaction of storage before CG + /// to make tests more deterministic. + /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? fn gc_iteration( &self, timelineid: Option, horizon: u64, - compact: bool, + checkpoint_before_gc: bool, ) -> Result; } @@ -52,6 +48,7 @@ pub struct GcResult { pub ondisk_relfiles_needed_by_cutoff: u64, pub ondisk_relfiles_needed_by_branches: u64, pub ondisk_relfiles_not_updated: u64, + pub ondisk_relfiles_needed_as_tombstone: u64, pub ondisk_relfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. pub ondisk_relfiles_dropped: u64, // # of layer files removed because the relation was dropped @@ -59,6 +56,7 @@ pub struct GcResult { pub ondisk_nonrelfiles_needed_by_cutoff: u64, pub ondisk_nonrelfiles_needed_by_branches: u64, pub ondisk_nonrelfiles_not_updated: u64, + pub ondisk_nonrelfiles_needed_as_tombstone: u64, pub ondisk_nonrelfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. pub ondisk_nonrelfiles_dropped: u64, // # of layer files removed because the relation was dropped @@ -71,6 +69,7 @@ impl AddAssign for GcResult { self.ondisk_relfiles_needed_by_cutoff += other.ondisk_relfiles_needed_by_cutoff; self.ondisk_relfiles_needed_by_branches += other.ondisk_relfiles_needed_by_branches; self.ondisk_relfiles_not_updated += other.ondisk_relfiles_not_updated; + self.ondisk_relfiles_needed_as_tombstone += other.ondisk_relfiles_needed_as_tombstone; self.ondisk_relfiles_removed += other.ondisk_relfiles_removed; self.ondisk_relfiles_dropped += other.ondisk_relfiles_dropped; @@ -78,6 +77,7 @@ impl AddAssign for GcResult { self.ondisk_nonrelfiles_needed_by_cutoff += other.ondisk_nonrelfiles_needed_by_cutoff; self.ondisk_nonrelfiles_needed_by_branches += other.ondisk_nonrelfiles_needed_by_branches; self.ondisk_nonrelfiles_not_updated += other.ondisk_nonrelfiles_not_updated; + self.ondisk_nonrelfiles_needed_as_tombstone += other.ondisk_nonrelfiles_needed_as_tombstone; self.ondisk_nonrelfiles_removed += other.ondisk_nonrelfiles_removed; self.ondisk_nonrelfiles_dropped += other.ondisk_nonrelfiles_dropped; @@ -213,12 +213,18 @@ mod tests { use crate::layered_repository::LayeredRepository; use crate::walredo::{WalRedoError, WalRedoManager}; use crate::PageServerConf; + use hex_literal::hex; use postgres_ffi::pg_constants; use postgres_ffi::xlog_utils::SIZEOF_CHECKPOINT; use std::fs; - use std::str::FromStr; + use std::path::PathBuf; use zenith_utils::zid::ZTenantId; + const TIMELINE_ID: ZTimelineId = + ZTimelineId::from_array(hex!("11223344556677881122334455667788")); + const NEW_TIMELINE_ID: ZTimelineId = + ZTimelineId::from_array(hex!("AA223344556677881122334455667788")); + /// Arbitrary relation tag, for testing. const TESTREL_A: RelishTag = RelishTag::Relation(RelTag { spcnode: 0, @@ -254,39 +260,53 @@ mod tests { static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - fn get_test_repo(test_name: &str) -> Result> { - let repo_dir = PageServerConf::test_repo_dir(test_name); - let _ = fs::remove_dir_all(&repo_dir); - fs::create_dir_all(&repo_dir)?; - fs::create_dir_all(&repo_dir.join("timelines"))?; + struct RepoHarness { + conf: &'static PageServerConf, + tenant_id: ZTenantId, + } - let conf = PageServerConf::dummy_conf(repo_dir); - // Make a static copy of the config. This can never be free'd, but that's - // OK in a test. - let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - let tenantid = ZTenantId::generate(); - fs::create_dir_all(conf.tenant_path(&tenantid)).unwrap(); + impl RepoHarness { + fn create(test_name: &'static str) -> Result { + let repo_dir = PageServerConf::test_repo_dir(test_name); + let _ = fs::remove_dir_all(&repo_dir); + fs::create_dir_all(&repo_dir)?; + fs::create_dir_all(&repo_dir.join("timelines"))?; - let walredo_mgr = TestRedoManager {}; + let conf = PageServerConf::dummy_conf(repo_dir); + // Make a static copy of the config. This can never be free'd, but that's + // OK in a test. + let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - let repo = Box::new(LayeredRepository::new( - conf, - Arc::new(walredo_mgr), - tenantid, - )); + let tenant_id = ZTenantId::generate(); + fs::create_dir_all(conf.tenant_path(&tenant_id))?; - Ok(repo) + Ok(Self { conf, tenant_id }) + } + + fn load(&self) -> Box { + let walredo_mgr = Arc::new(TestRedoManager); + + Box::new(LayeredRepository::new( + self.conf, + walredo_mgr, + self.tenant_id, + false, + )) + } + + fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { + self.conf.timeline_path(timeline_id, &self.tenant_id) + } } #[test] fn test_relsize() -> Result<()> { - let repo = get_test_repo("test_relsize")?; + let repo = RepoHarness::create("test_relsize")?.load(); // get_timeline() with non-existent timeline id should fail //repo.get_timeline("11223344556677881122334455667788"); // Create timeline to work on - let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap(); - let tline = repo.create_empty_timeline(timelineid)?; + let tline = repo.create_empty_timeline(TIMELINE_ID)?; tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; @@ -396,13 +416,148 @@ mod tests { Ok(()) } + // Test what happens if we dropped a relation + // and then created it again within the same layer. + #[test] + fn test_drop_extend() -> Result<()> { + let repo = RepoHarness::create("test_drop_extend")?.load(); + + // Create timeline to work on + let tline = repo.create_empty_timeline(TIMELINE_ID)?; + + tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; + tline.advance_last_record_lsn(Lsn(0x20)); + + // Check that rel exists and size is correct + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); + assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1); + + // Drop relish + tline.drop_relish(TESTREL_A, Lsn(0x30))?; + tline.advance_last_record_lsn(Lsn(0x30)); + + // Check that rel is not visible anymore + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false); + assert!(tline.get_relish_size(TESTREL_A, Lsn(0x30))?.is_none()); + + // Extend it again + tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?; + tline.advance_last_record_lsn(Lsn(0x40)); + + // Check that rel exists and size is correct + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true); + assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x40))?.unwrap(), 1); + + Ok(()) + } + + // Test what happens if we truncated a relation + // so that one of its segments was dropped + // and then extended it again within the same layer. + #[test] + fn test_truncate_extend() -> Result<()> { + let repo = RepoHarness::create("test_truncate_extend")?.load(); + + // Create timeline to work on + let tline = repo.create_empty_timeline(TIMELINE_ID)?; + + //from storage_layer.rs + const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192; + let relsize = RELISH_SEG_SIZE * 2; + + // Create relation with relsize blocks + for blkno in 0..relsize { + let lsn = Lsn(0x20); + let data = format!("foo blk {} at {}", blkno, lsn); + tline.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?; + } + + tline.advance_last_record_lsn(Lsn(0x20)); + + // The relation was created at LSN 2, not visible at LSN 1 yet. + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); + assert!(tline.get_relish_size(TESTREL_A, Lsn(0x10))?.is_none()); + + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); + assert_eq!( + tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), + relsize + ); + + // Check relation content + for blkno in 0..relsize { + let lsn = Lsn(0x20); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_page_at_lsn(TESTREL_A, blkno, lsn)?, + TEST_IMG(&data) + ); + } + + // Truncate relation so that second segment was dropped + // - only leave one page + tline.put_truncation(TESTREL_A, Lsn(0x60), 1)?; + tline.advance_last_record_lsn(Lsn(0x60)); + + // Check reported size and contents after truncation + assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 1); + + for blkno in 0..1 { + let lsn = Lsn(0x20); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x60))?, + TEST_IMG(&data) + ); + } + + // should still see all blocks with older LSN + assert_eq!( + tline.get_relish_size(TESTREL_A, Lsn(0x50))?.unwrap(), + relsize + ); + for blkno in 0..relsize { + let lsn = Lsn(0x20); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x50))?, + TEST_IMG(&data) + ); + } + + // Extend relation again. + // Add enough blocks to create second segment + for blkno in 0..relsize { + let lsn = Lsn(0x80); + let data = format!("foo blk {} at {}", blkno, lsn); + tline.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?; + } + tline.advance_last_record_lsn(Lsn(0x80)); + + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true); + assert_eq!( + tline.get_relish_size(TESTREL_A, Lsn(0x80))?.unwrap(), + relsize + ); + // Check relation content + for blkno in 0..relsize { + let lsn = Lsn(0x80); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x80))?, + TEST_IMG(&data) + ); + } + + Ok(()) + } + /// Test get_relsize() and truncation with a file larger than 1 GB, so that it's /// split into multiple 1 GB segments in Postgres. #[test] fn test_large_rel() -> Result<()> { - let repo = get_test_repo("test_large_rel")?; - let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap(); - let tline = repo.create_empty_timeline(timelineid)?; + let repo = RepoHarness::create("test_large_rel")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID)?; let mut lsn = 0x10; for blknum in 0..pg_constants::RELSEG_SIZE + 1 { @@ -462,13 +617,9 @@ mod tests { /// Test list_rels() function, with branches and dropped relations /// #[test] - // FIXME: The last assertion in this test is currently failing, see - // https://github.com/zenithdb/zenith/issues/502. Ignore the failure until that's fixed. - #[ignore] fn test_list_rels_drop() -> Result<()> { - let repo = get_test_repo("test_list_rels_drop")?; - let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap(); - let tline = repo.create_empty_timeline(timelineid)?; + let repo = RepoHarness::create("test_list_rels_drop")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID)?; const TESTDB: u32 = 111; // Import initial dummy checkpoint record, otherwise the get_timeline() call @@ -486,9 +637,8 @@ mod tests { assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A)); // Create a branch, check that the relation is visible there - let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap(); - repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?; - let newtline = repo.get_timeline(newtimelineid)?; + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; + let newtline = repo.get_timeline(NEW_TIMELINE_ID)?; assert!(newtline .list_rels(0, TESTDB, Lsn(0x30))? @@ -508,9 +658,8 @@ mod tests { // Run checkpoint and garbage collection and check that it's still not visible newtline.checkpoint()?; - repo.gc_iteration(Some(newtimelineid), 0, true)?; + repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?; - // FIXME: this is currently failing assert!(!newtline .list_rels(0, TESTDB, Lsn(0x40))? .contains(&TESTREL_A)); @@ -523,9 +672,8 @@ mod tests { /// #[test] fn test_branch() -> Result<()> { - let repo = get_test_repo("test_branch")?; - let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap(); - let tline = repo.create_empty_timeline(timelineid)?; + let repo = RepoHarness::create("test_branch")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID)?; // Import initial dummy checkpoint record, otherwise the get_timeline() call // after branching fails below @@ -543,9 +691,8 @@ mod tests { assert_current_logical_size(&tline, Lsn(0x40)); // Branch the history, modify relation differently on the new timeline - let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap(); - repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?; - let newtline = repo.get_timeline(newtimelineid)?; + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; + let newtline = repo.get_timeline(NEW_TIMELINE_ID)?; newtline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?; newtline.advance_last_record_lsn(Lsn(0x40)); @@ -573,8 +720,89 @@ mod tests { Ok(()) } + #[test] + fn corrupt_metadata() -> Result<()> { + const TEST_NAME: &str = "corrupt_metadata"; + let harness = RepoHarness::create(TEST_NAME)?; + let repo = harness.load(); + + repo.create_empty_timeline(TIMELINE_ID)?; + drop(repo); + + let metadata_path = harness.timeline_path(&TIMELINE_ID).join("metadata"); + + assert!(metadata_path.is_file()); + + let mut metadata_bytes = std::fs::read(&metadata_path)?; + assert_eq!(metadata_bytes.len(), 512); + metadata_bytes[512 - 4 - 2] ^= 1; + std::fs::write(metadata_path, metadata_bytes)?; + + let new_repo = harness.load(); + let err = new_repo.get_timeline(TIMELINE_ID).err().unwrap(); + assert!(err.to_string().contains("checksum")); + + Ok(()) + } + + #[test] + fn future_layerfiles() -> Result<()> { + const TEST_NAME: &str = "future_layerfiles"; + let harness = RepoHarness::create(TEST_NAME)?; + let repo = harness.load(); + + repo.create_empty_timeline(TIMELINE_ID)?; + drop(repo); + + let timeline_path = harness.timeline_path(&TIMELINE_ID); + + let make_empty_file = |filename: &str| -> std::io::Result<()> { + let path = timeline_path.join(filename); + + assert!(!path.exists()); + std::fs::write(&path, &[])?; + + Ok(()) + }; + + let image_filename = format!("pg_control_0_{:016X}", 8000); + let delta_filename = format!("pg_control_0_{:016X}_{:016X}", 8000, 8008); + + make_empty_file(&image_filename)?; + make_empty_file(&delta_filename)?; + + let new_repo = harness.load(); + new_repo.get_timeline(TIMELINE_ID).unwrap(); + drop(new_repo); + + let check_old = |filename: &str, num: u32| { + let path = timeline_path.join(filename); + assert!(!path.exists()); + + let backup_path = timeline_path.join(format!("{}.{}.old", filename, num)); + assert!(backup_path.exists()); + }; + + check_old(&image_filename, 0); + check_old(&delta_filename, 0); + + make_empty_file(&image_filename)?; + make_empty_file(&delta_filename)?; + + let new_repo = harness.load(); + new_repo.get_timeline(TIMELINE_ID).unwrap(); + drop(new_repo); + + check_old(&image_filename, 0); + check_old(&delta_filename, 0); + check_old(&image_filename, 1); + check_old(&delta_filename, 1); + + Ok(()) + } + // Mock WAL redo manager that doesn't do much - struct TestRedoManager {} + struct TestRedoManager; impl WalRedoManager for TestRedoManager { fn request_redo( diff --git a/pageserver/src/restore_local_repo.rs b/pageserver/src/restore_local_repo.rs index ff394e8094..dfe3edd7ac 100644 --- a/pageserver/src/restore_local_repo.rs +++ b/pageserver/src/restore_local_repo.rs @@ -9,11 +9,9 @@ use std::cmp::min; use std::fs; use std::fs::File; use std::io::Read; -use std::io::Seek; -use std::io::SeekFrom; -use std::path::{Path, PathBuf}; +use std::path::Path; -use anyhow::Result; +use anyhow::{bail, Result}; use bytes::{Buf, Bytes}; use crate::relish::*; @@ -175,8 +173,7 @@ fn import_relfile( break; } _ => { - error!("error reading file: {:?} ({})", path, e); - break; + bail!("error reading file {}: {:#}", path.display(), e); } }, }; @@ -270,8 +267,7 @@ fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Pa break; } _ => { - error!("error reading file: {:?} ({})", path, e); - break; + bail!("error reading file {}: {:#}", path.display(), e); } }, }; @@ -283,100 +279,6 @@ fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Pa Ok(()) } -/// Scan PostgreSQL WAL files in given directory -/// and load all records >= 'startpoint' into the repository. -pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint: Lsn) -> Result<()> { - let mut waldecoder = WalStreamDecoder::new(startpoint); - - let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE); - let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE); - let mut last_lsn = startpoint; - - let checkpoint_bytes = timeline.get_page_at_lsn(RelishTag::Checkpoint, 0, startpoint)?; - let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - - loop { - // FIXME: assume postgresql tli 1 for now - let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE); - let mut buf = Vec::new(); - - //Read local file - let mut path = walpath.join(&filename); - - // It could be as .partial - if !PathBuf::from(&path).exists() { - path = walpath.join(filename + ".partial"); - } - - // Slurp the WAL file - let open_result = File::open(&path); - if let Err(e) = &open_result { - if e.kind() == std::io::ErrorKind::NotFound { - break; - } - } - let mut file = open_result?; - - if offset > 0 { - file.seek(SeekFrom::Start(offset as u64))?; - } - - let nread = file.read_to_end(&mut buf)?; - if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize { - // Maybe allow this for .partial files? - error!("read only {} bytes from WAL file", nread); - } - - waldecoder.feed_bytes(&buf); - - let mut nrecords = 0; - loop { - let rec = waldecoder.poll_decode(); - if rec.is_err() { - // Assume that an error means we've reached the end of - // a partial WAL record. So that's ok. - trace!("WAL decoder error {:?}", rec); - break; - } - if let Some((lsn, recdata)) = rec.unwrap() { - // The previous record has been handled, let the repository know that - // it is up-to-date to this LSN. (We do this here on the "next" iteration, - // rather than right after the save_decoded_record, because at the end of - // the WAL, we will also need to perform the update of the checkpoint data - // with the same LSN as the last actual record.) - timeline.advance_last_record_lsn(last_lsn); - - let decoded = decode_wal_record(recdata.clone()); - save_decoded_record(&mut checkpoint, timeline, &decoded, recdata, lsn)?; - last_lsn = lsn; - } else { - break; - } - nrecords += 1; - } - - info!("imported {} records up to {}", nrecords, last_lsn); - - segno += 1; - offset = 0; - } - - if last_lsn != startpoint { - info!( - "reached end of WAL at {}, updating checkpoint info", - last_lsn - ); - let checkpoint_bytes = checkpoint.encode(); - timeline.put_page_image(RelishTag::Checkpoint, 0, last_lsn, checkpoint_bytes)?; - - timeline.advance_last_record_lsn(last_lsn); - } else { - info!("no WAL to import at {}", last_lsn); - } - - Ok(()) -} - /// /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the /// relations/pages that the record affects. diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 3258ab2567..4eb46ba71a 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -9,46 +9,84 @@ use crate::PageServerConf; use anyhow::{anyhow, bail, Context, Result}; use lazy_static::lazy_static; use log::info; +use std::collections::hash_map::Entry; use std::collections::HashMap; use std::fs; use std::str::FromStr; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, Mutex, MutexGuard}; use zenith_utils::zid::{ZTenantId, ZTimelineId}; lazy_static! { - pub static ref REPOSITORY: Mutex>> = + static ref REPOSITORY: Mutex>> = Mutex::new(HashMap::new()); } -pub fn init(conf: &'static PageServerConf) { - let mut m = REPOSITORY.lock().unwrap(); +fn access_repository() -> MutexGuard<'static, HashMap>> { + REPOSITORY.lock().unwrap() +} +pub fn init(conf: &'static PageServerConf) { + let mut m = access_repository(); for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() { let tenantid = ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap(); - - // Set up a WAL redo manager, for applying WAL records. - let walredo_mgr = PostgresRedoManager::new(conf, tenantid); - - // Set up an object repository, for actual data storage. - let repo = Arc::new(LayeredRepository::new( - conf, - Arc::new(walredo_mgr), - tenantid, - )); - LayeredRepository::launch_checkpointer_thread(conf, repo.clone()); - LayeredRepository::launch_gc_thread(conf, repo.clone()); - + let repo = init_repo(conf, tenantid); info!("initialized storage for tenant: {}", &tenantid); m.insert(tenantid, repo); } } +fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Arc { + // Set up a WAL redo manager, for applying WAL records. + let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); + + // Set up an object repository, for actual data storage. + let repo = Arc::new(LayeredRepository::new( + conf, + Arc::new(walredo_mgr), + tenant_id, + true, + )); + LayeredRepository::launch_checkpointer_thread(conf, repo.clone()); + LayeredRepository::launch_gc_thread(conf, repo.clone()); + repo +} + +// TODO kb Currently unused function, will later be used when the relish storage downloads a new layer. +// Relevant PR: https://github.com/zenithdb/zenith/pull/686 +pub fn register_relish_download( + conf: &'static PageServerConf, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, +) { + log::info!( + "Registering new download, tenant id {}, timeline id: {}", + tenant_id, + timeline_id + ); + match access_repository().entry(tenant_id) { + Entry::Occupied(o) => init_timeline(o.get().as_ref(), timeline_id), + Entry::Vacant(v) => { + log::info!("New repo initialized"); + let new_repo = init_repo(conf, tenant_id); + init_timeline(new_repo.as_ref(), timeline_id); + v.insert(new_repo); + } + } +} + +fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) { + match repo.get_timeline(timeline_id) { + Ok(_timeline) => log::info!("Successfully initialized timeline {}", timeline_id), + Err(e) => log::error!("Failed to init timeline {}, reason: {:#}", timeline_id, e), + } +} + pub fn create_repository_for_tenant( conf: &'static PageServerConf, tenantid: ZTenantId, ) -> Result<()> { - let mut m = REPOSITORY.lock().unwrap(); + let mut m = access_repository(); // First check that the tenant doesn't exist already if m.get(&tenantid).is_some() { @@ -62,15 +100,10 @@ pub fn create_repository_for_tenant( Ok(()) } -pub fn insert_repository_for_tenant(tenantid: ZTenantId, repo: Arc) { - let o = &mut REPOSITORY.lock().unwrap(); - o.insert(tenantid, repo); -} - pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result> { - let o = &REPOSITORY.lock().unwrap(); - o.get(&tenantid) - .map(|repo| Arc::clone(repo)) + access_repository() + .get(&tenantid) + .map(Arc::clone) .ok_or_else(|| anyhow!("repository not found for tenant name {}", tenantid)) } diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index a6a3879f4a..45195ec706 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -10,23 +10,22 @@ use crate::restore_local_repo; use crate::tenant_mgr; use crate::waldecoder::*; use crate::PageServerConf; -use anyhow::{Error, Result}; +use anyhow::{bail, Error, Result}; use lazy_static::lazy_static; use log::*; use postgres::fallible_iterator::FallibleIterator; use postgres::replication::ReplicationIter; use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow}; -use postgres_ffi::xlog_utils::*; use postgres_ffi::*; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; -use std::cmp::{max, min}; +use std::cell::Cell; use std::collections::HashMap; -use std::fs; use std::str::FromStr; use std::sync::Mutex; use std::thread; use std::thread::sleep; +use std::thread_local; use std::time::{Duration, SystemTime}; use zenith_utils::lsn::Lsn; use zenith_utils::zid::ZTenantId; @@ -44,6 +43,13 @@ lazy_static! { Mutex::new(HashMap::new()); } +thread_local! { + // Boolean that is true only for WAL receiver threads + // + // This is used in `wait_lsn` to guard against usage that might lead to a deadlock. + pub(crate) static IS_WAL_RECEIVER: Cell = Cell::new(false); +} + // Launch a new WAL receiver, or tell one that's running about change in connection string pub fn launch_wal_receiver( conf: &'static PageServerConf, @@ -64,12 +70,10 @@ pub fn launch_wal_receiver( receivers.insert(timelineid, receiver); // Also launch a new thread to handle this connection - // - // NOTE: This thread name is checked in the assertion in wait_lsn. If you change - // this, make sure you update the assertion too. let _walreceiver_thread = thread::Builder::new() .name("WAL receiver thread".into()) .spawn(move || { + IS_WAL_RECEIVER.with(|c| c.set(true)); thread_main(conf, timelineid, tenantid); }) .unwrap(); @@ -118,7 +122,7 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: } fn walreceiver_main( - conf: &PageServerConf, + _conf: &PageServerConf, timelineid: ZTimelineId, wal_producer_connstr: &str, tenantid: ZTenantId, @@ -158,7 +162,7 @@ fn walreceiver_main( let mut startpoint = last_rec_lsn; if startpoint == Lsn(0) { - error!("No previous WAL position"); + bail!("No previous WAL position"); } // There might be some padding after the last full record, skip it. @@ -188,7 +192,6 @@ fn walreceiver_main( let data = xlog_data.data(); let startlsn = Lsn::from(xlog_data.wal_start()); let endlsn = startlsn + data.len() as u64; - let prev_last_rec_lsn = last_rec_lsn; trace!("received XLogData between {} and {}", startlsn, endlsn); @@ -229,34 +232,6 @@ fn walreceiver_main( last_rec_lsn = lsn; } - // Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each), - // "checkpoint" the repository to flush all the changes from WAL we've processed - // so far to disk. After this, we don't need the original WAL anymore, and it - // can be removed. This is probably too aggressive for production, but it's useful - // to expose bugs now. - // - // TODO: We don't actually dare to remove the WAL. It's useful for debugging, - // and we might it for logical decoding other things in the future. Although - // we should also be able to fetch it back from the WAL safekeepers or S3 if - // needed. - if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE) - != last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE) - { - info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn); - let (oldest_segno, newest_segno) = find_wal_file_range( - conf, - &timelineid, - pg_constants::WAL_SEGMENT_SIZE, - last_rec_lsn, - &tenantid, - )?; - - if newest_segno - oldest_segno >= 10 { - // TODO: This is where we could remove WAL older than last_rec_lsn. - //remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?; - } - } - if !caught_up && endlsn >= end_of_wal { info!("caught up at LSN {}", endlsn); caught_up = true; @@ -278,7 +253,7 @@ fn walreceiver_main( ); if reply_requested { - Some(timeline.get_last_record_lsn()) + Some(last_rec_lsn) } else { None } @@ -301,53 +276,15 @@ fn walreceiver_main( Ok(()) } -fn find_wal_file_range( - conf: &PageServerConf, - timeline: &ZTimelineId, - wal_seg_size: usize, - written_upto: Lsn, - tenant: &ZTenantId, -) -> Result<(u64, u64)> { - let written_upto_segno = written_upto.segment_number(wal_seg_size); - - let mut oldest_segno = written_upto_segno; - let mut newest_segno = written_upto_segno; - // Scan the wal directory, and count how many WAL filed we could remove - let wal_dir = conf.wal_dir_path(timeline, tenant); - for entry in fs::read_dir(wal_dir)? { - let entry = entry?; - let path = entry.path(); - - if path.is_dir() { - continue; - } - - let filename = path.file_name().unwrap().to_str().unwrap(); - - if IsXLogFileName(filename) { - let (segno, _tli) = XLogFromFileName(filename, wal_seg_size); - - if segno > written_upto_segno { - // that's strange. - warn!("there is a WAL file from future at {}", path.display()); - continue; - } - - oldest_segno = min(oldest_segno, segno); - newest_segno = max(newest_segno, segno); - } - } - // FIXME: would be good to assert that there are no gaps in the WAL files - - Ok((oldest_segno, newest_segno)) -} - /// Data returned from the postgres `IDENTIFY_SYSTEM` command /// /// See the [postgres docs] for more details. /// /// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html #[derive(Debug)] +// As of nightly 2021-09-11, fields that are only read by the type's `Debug` impl still count as +// unused. Relevant issue: https://github.com/rust-lang/rust/issues/88900 +#[allow(dead_code)] pub struct IdentifySystem { systemid: u64, timeline: u32, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index f1399c4eec..f233fceb3e 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,8 +22,7 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use lazy_static::lazy_static; use log::*; -use serde::{Deserialize, Serialize}; -use std::cell::RefCell; +use serde::Serialize; use std::fs; use std::fs::OpenOptions; use std::io::prelude::*; @@ -60,7 +59,7 @@ use postgres_ffi::XLogRecord; /// In Postgres `BufferTag` structure is used for exactly the same purpose. /// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91). /// -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)] pub struct BufferTag { pub rel: RelTag, pub blknum: u32, @@ -206,7 +205,7 @@ impl WalRedoManager for PostgresRedoManager { .block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?; *process_guard = Some(p); } - let process = (*process_guard).as_ref().unwrap(); + let process = process_guard.as_mut().unwrap(); self.runtime .block_on(self.handle_apply_request(process, &request)) @@ -247,7 +246,7 @@ impl PostgresRedoManager { /// async fn handle_apply_request( &self, - process: &PostgresRedoProcess, + process: &mut PostgresRedoProcess, request: &WalRedoRequest, ) -> Result { let rel = request.rel; @@ -299,9 +298,11 @@ impl PostgresRedoManager { // Transaction manager stuff let rec_segno = match rel { RelishTag::Slru { slru, segno } => { - if slru != SlruKind::Clog { - panic!("Not valid XACT relish tag {:?}", rel); - } + assert!( + slru == SlruKind::Clog, + "Not valid XACT relish tag {:?}", + rel + ); segno } _ => panic!("Not valid XACT relish tag {:?}", rel), @@ -421,7 +422,7 @@ impl PostgresRedoManager { ); if let Err(e) = apply_result { - error!("could not apply WAL records: {}", e); + error!("could not apply WAL records: {:#}", e); result = Err(WalRedoError::IoError(e)); } else { let img = apply_result.unwrap(); @@ -438,8 +439,8 @@ impl PostgresRedoManager { /// Handle to the Postgres WAL redo process /// struct PostgresRedoProcess { - stdin: RefCell, - stdout: RefCell, + stdin: ChildStdin, + stdout: ChildStdout, } impl PostgresRedoProcess { @@ -459,7 +460,7 @@ impl PostgresRedoProcess { if datadir.exists() { info!("directory {:?} exists, removing", &datadir); if let Err(e) = fs::remove_dir_all(&datadir) { - error!("could not remove old wal-redo-datadir: {:?}", e); + error!("could not remove old wal-redo-datadir: {:#}", e); } } info!("running initdb in {:?}", datadir.display()); @@ -532,10 +533,7 @@ impl PostgresRedoProcess { }; tokio::spawn(f_stderr); - Ok(PostgresRedoProcess { - stdin: RefCell::new(stdin), - stdout: RefCell::new(stdout), - }) + Ok(PostgresRedoProcess { stdin, stdout }) } // @@ -543,13 +541,14 @@ impl PostgresRedoProcess { // new page image. // async fn apply_wal_records( - &self, + &mut self, tag: BufferTag, base_img: Option, records: &[WALRecord], ) -> Result { - let mut stdin = self.stdin.borrow_mut(); - let mut stdout = self.stdout.borrow_mut(); + let stdout = &mut self.stdout; + // Buffer the writes to avoid a lot of small syscalls. + let mut stdin = tokio::io::BufWriter::new(&mut self.stdin); // We do three things simultaneously: send the old base image and WAL records to // the child process's stdin, read the result from child's stdout, and forward any logging diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 078c3d18c7..abd6e83b6a 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -18,5 +18,6 @@ tokio = "1.11" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } clap = "2.33.0" rustls = "0.19.1" +reqwest = { version = "0.11", features = ["blocking", "json"] } zenith_utils = { path = "../zenith_utils" } diff --git a/proxy/src/cplane_api.rs b/proxy/src/cplane_api.rs index 690f7cf199..2b3b8a64bd 100644 --- a/proxy/src/cplane_api.rs +++ b/proxy/src/cplane_api.rs @@ -1,17 +1,14 @@ -use anyhow::{bail, Result}; +use anyhow::{bail, Context, Result}; use serde::{Deserialize, Serialize}; -use std::{ - collections::HashMap, - net::{IpAddr, SocketAddr}, -}; +use std::net::{SocketAddr, ToSocketAddrs}; pub struct CPlaneApi { - // address: SocketAddr, + auth_endpoint: &'static str, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug)] pub struct DatabaseInfo { - pub host: IpAddr, // TODO: allow host name here too + pub host: String, pub port: u16, pub dbname: String, pub user: String, @@ -19,8 +16,13 @@ pub struct DatabaseInfo { } impl DatabaseInfo { - pub fn socket_addr(&self) -> SocketAddr { - SocketAddr::new(self.host, self.port) + pub fn socket_addr(&self) -> Result { + let host_port = format!("{}:{}", self.host, self.port); + host_port + .to_socket_addrs() + .with_context(|| format!("cannot resolve {} to SocketAddr", host_port))? + .next() + .ok_or_else(|| anyhow::Error::msg("cannot resolve at least one SocketAddr")) } pub fn conn_string(&self) -> String { @@ -31,62 +33,35 @@ impl DatabaseInfo { } } -// mock cplane api impl CPlaneApi { - pub fn new(_address: &SocketAddr) -> CPlaneApi { - CPlaneApi { - // address: address.clone(), - } + pub fn new(auth_endpoint: &'static str) -> CPlaneApi { + CPlaneApi { auth_endpoint } } - pub fn check_auth(&self, user: &str, md5_response: &[u8], salt: &[u8; 4]) -> Result<()> { - // passwords for both is "mypass" - let auth_map: HashMap<_, &str> = vec![ - ("stas@zenith", "716ee6e1c4a9364d66285452c47402b1"), - ("stas2@zenith", "3996f75df64c16a8bfaf01301b61d582"), - ] - .into_iter() - .collect(); + pub fn authenticate_proxy_request( + &self, + user: &str, + database: &str, + md5_response: &[u8], + salt: &[u8; 4], + ) -> Result { + let mut url = reqwest::Url::parse(self.auth_endpoint)?; + url.query_pairs_mut() + .append_pair("login", user) + .append_pair("database", database) + .append_pair("md5response", std::str::from_utf8(md5_response)?) + .append_pair("salt", &hex::encode(salt)); - let stored_hash = auth_map - .get(&user) - .ok_or_else(|| anyhow::Error::msg("user not found"))?; - let salted_stored_hash = format!( - "md5{:x}", - md5::compute([stored_hash.as_bytes(), salt].concat()) - ); + println!("cplane request: {}", url.as_str()); - let received_hash = std::str::from_utf8(md5_response)?; + let resp = reqwest::blocking::get(url)?; - println!( - "auth: {} rh={} sh={} ssh={} {:?}", - user, received_hash, stored_hash, salted_stored_hash, salt - ); - - if received_hash == salted_stored_hash { - Ok(()) + if resp.status().is_success() { + let conn_info: DatabaseInfo = serde_json::from_str(resp.text()?.as_str())?; + println!("got conn info: #{:?}", conn_info); + Ok(conn_info) } else { bail!("Auth failed") } } - - pub fn get_database_uri(&self, _user: &str, _database: &str) -> Result { - Ok(DatabaseInfo { - host: "127.0.0.1".parse()?, - port: 5432, - dbname: "stas".to_string(), - user: "stas".to_string(), - password: "mypass".to_string(), - }) - } - - // pub fn create_database(&self, _user: &String, _database: &String) -> Result { - // Ok(DatabaseInfo { - // host: "127.0.0.1".parse()?, - // port: 5432, - // dbname: "stas".to_string(), - // user: "stas".to_string(), - // password: "mypass".to_string(), - // }) - // } } diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 200dde9007..bbabab52f7 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -34,7 +34,7 @@ pub struct ProxyConf { pub redirect_uri: String, /// control plane address where we would check auth. - pub cplane_address: SocketAddr, + pub auth_endpoint: String, pub ssl_config: Option>, } @@ -56,8 +56,7 @@ fn configure_ssl(arg_matches: &ArgMatches) -> anyhow::Result anyhow::Result<()> { .help("redirect unauthenticated users to given uri") .default_value("http://localhost:3000/psql_session/"), ) + .arg( + Arg::with_name("auth-endpoint") + .short("a") + .long("auth-endpoint") + .takes_value(true) + .help("redirect unauthenticated users to given uri") + .default_value("http://localhost:3000/authenticate_proxy_request/"), + ) .arg( Arg::with_name("ssl-key") .short("k") @@ -122,7 +129,7 @@ fn main() -> anyhow::Result<()> { proxy_address: arg_matches.value_of("proxy").unwrap().parse()?, mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?, redirect_uri: arg_matches.value_of("uri").unwrap().parse()?, - cplane_address: "127.0.0.1:3000".parse()?, + auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, ssl_config: configure_ssl(&arg_matches)?, }; let state = ProxyState { diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 0d3c114421..2b3259f8ec 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -5,7 +5,7 @@ use std::{ use anyhow::bail; use bytes::Bytes; -use serde::{Deserialize, Serialize}; +use serde::Deserialize; use zenith_utils::{ postgres_backend::{self, query_from_cstring, AuthType, PostgresBackend}, pq_proto::{BeMessage, SINGLE_COL_ROWDESC}, @@ -49,7 +49,7 @@ struct MgmtHandler { // "host": "127.0.0.1", // "port": 5432, // "dbname": "stas", -// "user": "stas" +// "user": "stas", // "password": "mypass" // } // } @@ -60,13 +60,16 @@ struct MgmtHandler { // "Failure": "oops" // } // } -#[derive(Serialize, Deserialize)] +// +// // to test manually by sending a query to mgmt interface: +// psql -h 127.0.0.1 -p 9999 -c '{"session_id":"4f10dde522e14739","result":{"Success":{"host":"127.0.0.1","port":5432,"dbname":"stas","user":"stas","password":"stas"}}}' +#[derive(Deserialize)] pub struct PsqlSessionResponse { session_id: String, result: PsqlSessionResult, } -#[derive(Serialize, Deserialize)] +#[derive(Deserialize)] pub enum PsqlSessionResult { Success(DatabaseInfo), Failure(String), @@ -78,34 +81,47 @@ impl postgres_backend::Handler for MgmtHandler { pgb: &mut PostgresBackend, query_string: Bytes, ) -> anyhow::Result<()> { - let query_string = query_from_cstring(query_string); + let res = try_process_query(self, pgb, query_string); + // intercept and log error message + if res.is_err() { + println!("Mgmt query failed: #{:?}", res); + } + res + } +} - println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?); +fn try_process_query( + mgmt: &mut MgmtHandler, + pgb: &mut PostgresBackend, + query_string: Bytes, +) -> anyhow::Result<()> { + let query_string = query_from_cstring(query_string); - let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?; + println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?); - let waiters = self.state.waiters.lock().unwrap(); + let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?; - let sender = waiters - .get(&resp.session_id) - .ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?; + let waiters = mgmt.state.waiters.lock().unwrap(); - match resp.result { - PsqlSessionResult::Success(db_info) => { - sender.send(Ok(db_info))?; + let sender = waiters + .get(&resp.session_id) + .ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?; - pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? - .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - pgb.flush()?; - Ok(()) - } + match resp.result { + PsqlSessionResult::Success(db_info) => { + sender.send(Ok(db_info))?; - PsqlSessionResult::Failure(message) => { - sender.send(Err(anyhow::Error::msg(message.clone())))?; + pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? + .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.flush()?; + Ok(()) + } - bail!("psql session request failed: {}", message) - } + PsqlSessionResult::Failure(message) => { + sender.send(Err(anyhow::Error::msg(message.clone())))?; + + bail!("psql session request failed: {}", message) } } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 802a0bd305..f246d4470a 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -57,7 +57,7 @@ pub fn proxy_conn_main( ) -> anyhow::Result<()> { let mut conn = ProxyConnection { state, - cplane: CPlaneApi::new(&state.conf.cplane_address), + cplane: CPlaneApi::new(&state.conf.auth_endpoint), user: "".into(), database: "".into(), pgb: PostgresBackend::new( @@ -80,6 +80,8 @@ pub fn proxy_conn_main( conn.handle_new_user()? }; + // XXX: move that inside handle_new_user/handle_existing_user to be able to + // report wrong connection error. proxy_pass(conn.pgb, db_info) } @@ -172,21 +174,31 @@ impl ProxyConnection { .split_last() .ok_or_else(|| anyhow::Error::msg("unexpected password message"))?; - if let Err(e) = self.check_auth_md5(md5_response) { - self.pgb - .write_message(&BeMessage::ErrorResponse(format!("{}", e)))?; - bail!("auth failed: {}", e); - } else { - self.pgb - .write_message_noflush(&BeMessage::AuthenticationOk)?; - self.pgb - .write_message_noflush(&BeMessage::ParameterStatus)?; - self.pgb.write_message(&BeMessage::ReadyForQuery)?; - } - } + match self.cplane.authenticate_proxy_request( + self.user.as_str(), + self.database.as_str(), + md5_response, + &self.md5_salt, + ) { + Err(e) => { + self.pgb + .write_message(&BeMessage::ErrorResponse(format!("{}", e)))?; - // ok, we are authorized - self.cplane.get_database_uri(&self.user, &self.database) + bail!("auth failed: {}", e); + } + Ok(conn_info) => { + self.pgb + .write_message_noflush(&BeMessage::AuthenticationOk)?; + self.pgb + .write_message_noflush(&BeMessage::ParameterStatus)?; + self.pgb.write_message(&BeMessage::ReadyForQuery)?; + + Ok(conn_info) + } + } + } else { + bail!("protocol violation"); + } } fn handle_new_user(&mut self) -> anyhow::Result { @@ -232,17 +244,11 @@ databases without opening the browser. Ok(dbinfo) } - - fn check_auth_md5(&self, md5_response: &[u8]) -> anyhow::Result<()> { - assert!(self.is_existing_user()); - self.cplane - .check_auth(self.user.as_str(), md5_response, &self.md5_salt) - } } /// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result { - let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()).await?; + let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()?).await?; let config = db_info.conn_string().parse::()?; let _ = config.connect_raw(&mut socket, NoTls).await?; Ok(socket) @@ -273,7 +279,9 @@ fn proxy( /// Proxy a client connection to a postgres database fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> { - let runtime = tokio::runtime::Builder::new_current_thread().build()?; + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; let db_stream = runtime.block_on(connect_to_db(db_info))?; let db_stream = db_stream.into_std()?; db_stream.set_nonblocking(false)?; diff --git a/test_runner/Pipfile b/test_runner/Pipfile index f5250e4051..f5ff0d7e2b 100644 --- a/test_runner/Pipfile +++ b/test_runner/Pipfile @@ -10,6 +10,7 @@ typing-extensions = "*" pyjwt = {extras = ["crypto"], version = "*"} requests = "*" pytest-xdist = "*" +asyncpg = "*" [dev-packages] yapf = "*" diff --git a/test_runner/Pipfile.lock b/test_runner/Pipfile.lock index 4219b91098..3c68c0ff3a 100644 --- a/test_runner/Pipfile.lock +++ b/test_runner/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "480afaf71a214984dac55d128a4f67ec2d9749136e570c64df562c79900a9d83" + "sha256": "3cdc048691824d0b93912b6b78a0aa01dc98f278212c1badb0cc2edbd2103c3a" }, "pipfile-spec": 6, "requires": { @@ -16,6 +16,25 @@ ] }, "default": { + "asyncpg": { + "hashes": [ + "sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317", + "sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20", + "sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18", + "sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03", + "sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d", + "sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c", + "sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843", + "sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853", + "sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1", + "sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6", + "sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83", + "sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5", + "sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece" + ], + "index": "pypi", + "version": "==0.24.0" + }, "attrs": { "hashes": [ "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1", @@ -83,11 +102,11 @@ }, "charset-normalizer": { "hashes": [ - "sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b", - "sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3" + "sha256:5d209c0a931f215cee683b6445e2d77677e7e75e159f78def0db09d68fafcaa6", + "sha256:5ec46d183433dcbd0ab716f2d7f29d8dee50505b3fdb40c6b985c7c4f5a3591f" ], "markers": "python_version >= '3'", - "version": "==2.0.4" + "version": "==2.0.6" }, "cryptography": { "hashes": [ @@ -96,8 +115,10 @@ "sha256:21ca464b3a4b8d8e86ba0ee5045e103a1fcfac3b39319727bc0fc58c09c6aff7", "sha256:34dae04a0dce5730d8eb7894eab617d8a70d0c97da76b905de9efb7128ad7085", "sha256:3520667fda779eb788ea00080124875be18f2d8f0848ec00733c0ec3bb8219fc", + "sha256:3c4129fc3fdc0fa8e40861b5ac0c673315b3c902bbdc05fc176764815b43dd1d", "sha256:3fa3a7ccf96e826affdf1a0a9432be74dc73423125c8f96a909e3835a5ef194a", "sha256:5b0fbfae7ff7febdb74b574055c7466da334a5371f253732d7e2e7525d570498", + "sha256:695104a9223a7239d155d7627ad912953b540929ef97ae0c34c7b8bf30857e89", "sha256:8695456444f277af73a4877db9fc979849cd3ee74c198d04fc0776ebc3db52b9", "sha256:94cc5ed4ceaefcbe5bf38c8fba6a21fc1d365bb8fb826ea1688e3370b2e24a1c", "sha256:94fff993ee9bc1b2440d3b7243d488c6a3d9724cc2b09cdb297f6a886d040ef7", @@ -218,11 +239,11 @@ }, "pytest-xdist": { "hashes": [ - "sha256:e8ecde2f85d88fbcadb7d28cb33da0fa29bca5cf7d5967fa89fc0e97e5299ea5", - "sha256:ed3d7da961070fce2a01818b51f6888327fb88df4379edeb6b9d990e789d9c8d" + "sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168", + "sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9" ], "index": "pypi", - "version": "==2.3.0" + "version": "==2.4.0" }, "requests": { "hashes": [ diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 19e5384bba..9189017050 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -33,11 +33,11 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg main_cur.execute(''' INSERT INTO foo SELECT 'long string to consume some space' || g - FROM generate_series(1, 100000) g + FROM generate_series(1, 200000) g ''') main_cur.execute('SELECT pg_current_wal_insert_lsn()') lsn_b = main_cur.fetchone()[0] - print('LSN after 100100 rows: ' + lsn_b) + print('LSN after 200100 rows: ' + lsn_b) # Branch at the point where only 100 rows were inserted zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a]) @@ -46,15 +46,15 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg main_cur.execute(''' INSERT INTO foo SELECT 'long string to consume some space' || g - FROM generate_series(1, 100000) g + FROM generate_series(1, 200000) g ''') main_cur.execute('SELECT pg_current_wal_insert_lsn()') main_cur.execute('SELECT pg_current_wal_insert_lsn()') lsn_c = main_cur.fetchone()[0] - print('LSN after 200100 rows: ' + lsn_c) + print('LSN after 400100 rows: ' + lsn_c) - # Branch at the point where only 200 rows were inserted + # Branch at the point where only 200100 rows were inserted zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b]) pg_hundred = postgres.create_start("test_branch_behind_hundred") @@ -70,11 +70,11 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg more_pg_conn = pg_more.connect() more_cur = more_pg_conn.cursor() more_cur.execute('SELECT count(*) FROM foo') - assert more_cur.fetchone() == (100100, ) + assert more_cur.fetchone() == (200100, ) # All the rows are visible on the main branch main_cur.execute('SELECT count(*) FROM foo') - assert main_cur.fetchone() == (200100, ) + assert main_cur.fetchone() == (400100, ) # Check bad lsn's for branching diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index fafa37fe5a..bb28bdd83f 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -46,7 +46,7 @@ def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: Pos from pg_settings where name = 'shared_buffers' ''') row = cur.fetchone() - print("shared_buffers is {}, table size {}", row[0], row[1]); + print(f'shared_buffers is {row[0]}, table size {row[1]}'); assert int(row[0]) < int(row[1]) cur.execute('VACUUM foo'); diff --git a/test_runner/batch_others/test_restart_compute.py b/test_runner/batch_others/test_restart_compute.py index 60015c03db..193b675e23 100644 --- a/test_runner/batch_others/test_restart_compute.py +++ b/test_runner/batch_others/test_restart_compute.py @@ -9,10 +9,7 @@ pytest_plugins = ("fixtures.zenith_fixtures") # # Test restarting and recreating a postgres instance # -# XXX: with_wal_acceptors=True fails now, would be fixed with -# `postgres --sync-walkeepers` patches. -# -@pytest.mark.parametrize('with_wal_acceptors', [False]) +@pytest.mark.parametrize('with_wal_acceptors', [False, True]) def test_restart_compute( zenith_cli, pageserver: ZenithPageserver, diff --git a/test_runner/batch_others/test_snapfiles_gc.py b/test_runner/batch_others/test_snapfiles_gc.py index 957194d7d5..e01bf7f179 100644 --- a/test_runner/batch_others/test_snapfiles_gc.py +++ b/test_runner/batch_others/test_snapfiles_gc.py @@ -6,8 +6,8 @@ pytest_plugins = ("fixtures.zenith_fixtures") def print_gc_result(row): print("GC duration {elapsed} ms".format_map(row)); - print(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row)) - print(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row)) + print(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row)) + print(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row)) # @@ -58,19 +58,21 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin): layer_relfiles_remain = row['layer_relfiles_total'] - row['layer_relfiles_removed'] assert layer_relfiles_remain > 0 - # Insert a row. + # Insert a row and run GC. Checkpoint should freeze the layer + # so that there is only the most recent image layer left for the rel, + # removing the old image and delta layer. print("Inserting one row and running GC") cur.execute("INSERT INTO foo VALUES (1)") pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") row = pscur.fetchone() print_gc_result(row); - assert row['layer_relfiles_total'] == layer_relfiles_remain + 1 - assert row['layer_relfiles_removed'] == 1 + assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 + assert row['layer_relfiles_removed'] == 2 assert row['layer_relfiles_dropped'] == 0 # Insert two more rows and run GC. - # This should create a new layer file with the new contents, and - # remove the old one. + # This should create new image and delta layer file with the new contents, and + # then remove the old one image and the just-created delta layer. print("Inserting two more rows and running GC") cur.execute("INSERT INTO foo VALUES (2)") cur.execute("INSERT INTO foo VALUES (3)") @@ -78,11 +80,11 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin): pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") row = pscur.fetchone() print_gc_result(row); - assert row['layer_relfiles_total'] == layer_relfiles_remain + 1 - assert row['layer_relfiles_removed'] == 1 + assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 + assert row['layer_relfiles_removed'] == 2 assert row['layer_relfiles_dropped'] == 0 - # Do it again. Should again create a new layer file and remove old one. + # Do it again. Should again create two new layer files and remove old ones. print("Inserting two more rows and running GC") cur.execute("INSERT INTO foo VALUES (2)") cur.execute("INSERT INTO foo VALUES (3)") @@ -90,8 +92,8 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin): pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") row = pscur.fetchone() print_gc_result(row); - assert row['layer_relfiles_total'] == layer_relfiles_remain + 1 - assert row['layer_relfiles_removed'] == 1 + assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 + assert row['layer_relfiles_removed'] == 2 assert row['layer_relfiles_dropped'] == 0 # Run GC again, with no changes in the database. Should not remove anything. @@ -113,12 +115,19 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin): row = pscur.fetchone() print_gc_result(row); + # We still cannot remove the latest layers + # because they serve as tombstones for earlier layers. + assert row['layer_relfiles_dropped'] == 0 # Each relation fork is counted separately, hence 3. - assert row['layer_relfiles_dropped'] == 3 + assert row['layer_relfiles_needed_as_tombstone'] == 3 # The catalog updates also create new layer files of the catalogs, which # are counted as 'removed' assert row['layer_relfiles_removed'] > 0 + # TODO Change the test to check actual CG of dropped layers. + # Each relation fork is counted separately, hence 3. + #assert row['layer_relfiles_dropped'] == 3 + # TODO: perhaps we should count catalog and user relations separately, # to make this kind of testing more robust diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index fb45aa18b9..b5577f28d0 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -1,10 +1,14 @@ import pytest import random import time +import os +import subprocess +import uuid from contextlib import closing from multiprocessing import Process, Value -from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory +from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory, PgBin +from fixtures.utils import lsn_to_hex, mkdir_if_needed pytest_plugins = ("fixtures.zenith_fixtures") @@ -198,3 +202,92 @@ def test_race_conditions(zenith_cli, pageserver: ZenithPageserver, postgres: Pos stop_value.value = 1 proc.join() + +class ProposerPostgres: + """Object for running safekeepers sync with walproposer""" + def __init__(self, pgdata_dir: str, pg_bin: PgBin, timeline_id: str, tenant_id: str): + self.pgdata_dir: str = pgdata_dir + self.pg_bin: PgBin = pg_bin + self.timeline_id: str = timeline_id + self.tenant_id: str = tenant_id + + def pg_data_dir_path(self) -> str: + """ Path to data directory """ + return self.pgdata_dir + + def config_file_path(self) -> str: + """ Path to postgresql.conf """ + return os.path.join(self.pgdata_dir, 'postgresql.conf') + + def create_dir_config(self, wal_acceptors: str): + """ Create dir and config for running --sync-safekeepers """ + + mkdir_if_needed(self.pg_data_dir_path()) + with open(self.config_file_path(), "w") as f: + f.writelines([ + "synchronous_standby_names = 'walproposer'\n", + f"zenith.zenith_timeline = '{self.timeline_id}'\n", + f"zenith.zenith_tenant = '{self.tenant_id}'\n", + f"wal_acceptors = '{wal_acceptors}'\n", + ]) + + def sync_safekeepers(self) -> str: + """ + Run 'postgres --sync-safekeepers'. + Returns execution result, which is commit_lsn after sync. + """ + + command = ["postgres", "--sync-safekeepers"] + env = { + "PGDATA": self.pg_data_dir_path(), + } + + basepath = self.pg_bin.run_capture(command, env) + stdout_filename = basepath + '.stdout' + + with open(stdout_filename, 'r') as stdout_f: + stdout = stdout_f.read() + return stdout.strip("\n ") + + +# insert wal in all safekeepers and run sync on proposer +def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorFactory): + wa_factory.start_n_new(3) + + timeline_id = uuid.uuid4().hex + tenant_id = uuid.uuid4().hex + + # write config for proposer + pgdata_dir = os.path.join(repo_dir, "proposer_pgdata") + pg = ProposerPostgres(pgdata_dir, pg_bin, timeline_id, tenant_id) + pg.create_dir_config(wa_factory.get_connstrs()) + + # valid lsn, which is not in the segment start, nor in zero segment + epoch_start_lsn = 0x16B9188 # 0/16B9188 + begin_lsn = epoch_start_lsn + + # append and commit WAL + lsn_after_append = [] + for i in range(3): + res = wa_factory.instances[i].append_logical_message( + tenant_id, + timeline_id, + { + "lm_prefix": "prefix", + "lm_message": "message", + "set_commit_lsn": True, + "term": 2, + "begin_lsn": begin_lsn, + "epoch_start_lsn": epoch_start_lsn, + "truncate_lsn": epoch_start_lsn, + }, + ) + lsn_hex = lsn_to_hex(res["inserted_wal"]["end_lsn"]) + lsn_after_append.append(lsn_hex) + print(f"safekeeper[{i}] lsn after append: {lsn_hex}") + + # run sync safekeepers + lsn_after_sync = pg.sync_safekeepers() + print(f"lsn after sync = {lsn_after_sync}") + + assert all(lsn_after_sync == lsn for lsn in lsn_after_append) diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py new file mode 100644 index 0000000000..b1647a8544 --- /dev/null +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -0,0 +1,159 @@ +import asyncio +import asyncpg +import random + +from fixtures.zenith_fixtures import WalAcceptor, WalAcceptorFactory, ZenithPageserver, PostgresFactory, Postgres +from typing import List +from fixtures.utils import debug_print + +pytest_plugins = ("fixtures.zenith_fixtures") + + +class BankClient(object): + def __init__(self, conn: asyncpg.Connection, n_accounts, init_amount): + self.conn: asyncpg.Connection = conn + self.n_accounts = n_accounts + self.init_amount = init_amount + + async def initdb(self): + await self.conn.execute('DROP TABLE IF EXISTS bank_accs') + await self.conn.execute('CREATE TABLE bank_accs(uid int primary key, amount int)') + await self.conn.execute(''' + INSERT INTO bank_accs + SELECT *, $1 FROM generate_series(0, $2) + ''', self.init_amount, self.n_accounts - 1) + await self.conn.execute('DROP TABLE IF EXISTS bank_log') + await self.conn.execute('CREATE TABLE bank_log(from_uid int, to_uid int, amount int)') + + # TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed + await self.conn.execute('ALTER TABLE bank_accs SET (autovacuum_enabled = false)') + await self.conn.execute('ALTER TABLE bank_log SET (autovacuum_enabled = false)') + + async def check_invariant(self): + row = await self.conn.fetchrow('SELECT sum(amount) AS sum FROM bank_accs') + assert row['sum'] == self.n_accounts * self.init_amount + +async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount): + # avoid deadlocks by sorting uids + if from_uid > to_uid: + from_uid, to_uid, amount = to_uid, from_uid, -amount + + async with conn.transaction(): + await conn.execute( + 'UPDATE bank_accs SET amount = amount + ($1) WHERE uid = $2', + amount, to_uid, + ) + await conn.execute( + 'UPDATE bank_accs SET amount = amount - ($1) WHERE uid = $2', + amount, from_uid, + ) + await conn.execute('INSERT INTO bank_log VALUES ($1, $2, $3)', + from_uid, to_uid, amount, + ) + +class WorkerStats(object): + def __init__(self, n_workers): + self.counters = [0] * n_workers + self.running = True + + def reset(self): + self.counters = [0] * len(self.counters) + + def inc_progress(self, worker_id): + self.counters[worker_id] += 1 + + def check_progress(self): + debug_print("Workers progress: {}".format(self.counters)) + + # every worker should finish at least one tx + assert all(cnt > 0 for cnt in self.counters) + + progress = sum(self.counters) + print('All workers made {} transactions'.format(progress)) + + +async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accounts, max_transfer): + pg_conn = await pg.connect_async() + debug_print('Started worker {}'.format(worker_id)) + + while stats.running: + from_uid = random.randint(0, n_accounts - 1) + to_uid = (from_uid + random.randint(1, n_accounts - 1)) % n_accounts + amount = random.randint(1, max_transfer) + + await bank_transfer(pg_conn, from_uid, to_uid, amount) + stats.inc_progress(worker_id) + + debug_print('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid)) + + debug_print('Finished worker {}'.format(worker_id)) + + await pg_conn.close() + + +# This test will run several iterations and check progress in each of them. +# On each iteration 1 acceptor is stopped, and 2 others should allow +# background workers execute transactions. In the end, state should remain +# consistent. +async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_workers=10): + n_accounts = 100 + init_amount = 100000 + max_transfer = 100 + period_time = 10 + iterations = 6 + + pg_conn = await pg.connect_async() + bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount) + # create tables and initial balances + await bank.initdb() + + stats = WorkerStats(n_workers) + workers = [] + for worker_id in range(n_workers): + worker = run_random_worker(stats, pg, worker_id, bank.n_accounts, max_transfer) + workers.append(asyncio.create_task(worker)) + + + for it in range(iterations): + victim = acceptors[it % len(acceptors)] + victim.stop() + + # Wait till previous victim recovers so it is ready for the next + # iteration by making any writing xact. + conn = await pg.connect_async() + await conn.execute( + 'UPDATE bank_accs SET amount = amount WHERE uid = 1', + timeout=120 + ) + await conn.close() + + stats.reset() + await asyncio.sleep(period_time) + # assert that at least one transaction has completed in every worker + stats.check_progress() + + victim.start() + + print('Iterations are finished, exiting coroutines...') + stats.running = False + # await all workers + await asyncio.gather(*workers) + # assert balances sum hasn't changed + await bank.check_invariant() + await pg_conn.close() + + +# restart acceptors one by one, while executing and validating bank transactions +def test_restarts_under_load(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, + wa_factory: WalAcceptorFactory): + + wa_factory.start_n_new(3) + + zenith_cli.run(["branch", "test_wal_acceptors_restarts_under_load", "empty"]) + pg = postgres.create_start('test_wal_acceptors_restarts_under_load', + wal_acceptors=wa_factory.get_connstrs()) + + asyncio.run(run_restarts_under_load(pg, wa_factory.instances)) + + # TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed + pg.stop() diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 86ca78d000..328ebcc1f8 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -136,9 +136,34 @@ class ZenithBenchmarker: # The metric should be an integer, as it's a number of bytes. But in general # all prometheus metrics are floats. So to be pedantic, read it as a float # and round to integer. - matches = re.search(r'pageserver_disk_io_bytes{io_operation="write"} (\S+)', all_metrics) + matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$', all_metrics, + re.MULTILINE) return int(round(float(matches.group(1)))) + def get_peak_mem(self, pageserver) -> int: + """ + Fetch the "maxrss" metric from the pageserver + """ + # Fetch all the exposed prometheus metrics from page server + all_metrics = pageserver.http_client().get_metrics() + # See comment in get_io_writes() + matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, + re.MULTILINE) + return int(round(float(matches.group(1)))) + + def get_timeline_size(self, repo_dir: str, tenantid: str, timelineid: str): + """ + Calculate the on-disk size of a timeline + """ + path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid) + + totalbytes = 0 + for root, dirs, files in os.walk(path): + for name in files: + totalbytes += os.path.getsize(os.path.join(root, name)) + + return totalbytes + @contextmanager def record_pageserver_writes(self, pageserver, metric_name): """ diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 9e20ae723b..92bd25ed24 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -21,7 +21,7 @@ def mkdir_if_needed(path: str) -> None: assert os.path.isdir(path) -def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None: +def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: """ Run a process and capture its output Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" @@ -29,6 +29,7 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None: counter. If those files already exist, we will overwrite them. + Returns basepath for files with captured output. """ assert type(cmd) is list base = os.path.basename(cmd[0]) + '_{}'.format(global_counter()) @@ -41,6 +42,8 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None: print('(capturing output to "{}.stdout")'.format(base)) subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) + return basepath + _global_counter = 0 @@ -54,3 +57,15 @@ def global_counter() -> int: global _global_counter _global_counter += 1 return _global_counter + +def debug_print(*args, **kwargs) -> None: + """ Print to the console if TEST_DEBUG_PRINT is set in env. + + All parameters are passed to print(). + """ + if os.environ.get('TEST_DEBUG_PRINT') is not None: + print(*args, **kwargs) + +def lsn_to_hex(num: int) -> str: + """ Convert lsn from int to standard hex notation. """ + return "{:X}/{:X}".format(num >> 32, num & 0xffffffff) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 8492aa6aa4..d29d278cdd 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1,9 +1,11 @@ from dataclasses import dataclass from functools import cached_property +import asyncpg import os import pathlib import uuid import jwt +import json import psycopg2 import pytest import shutil @@ -69,7 +71,9 @@ def pytest_configure(config): # This is bad; we don't want any of those processes polluting the # result of the test. # NOTE this shows as an internal pytest error, there might be a better way - raise Exception('found interfering processes running') + raise Exception( + 'Found interfering processes running. Stop all Zenith pageservers, nodes, WALs, as well as stand-alone Postgres.' + ) def determine_scope(fixture_name: str, config: Any) -> str: @@ -129,6 +133,21 @@ class PgProtocol: conn.autocommit = autocommit return conn + async def connect_async(self, *, dbname: str = 'postgres', username: Optional[str] = None, password: Optional[str] = None) -> asyncpg.Connection: + """ + Connect to the node from async python. + Returns asyncpg's connection object. + """ + + conn = await asyncpg.connect( + host=self.host, + port=self.port, + database=dbname, + user=username or self.username, + password=password, + ) + return conn + def safe_psql(self, query: str, **kwargs: Any) -> List[Any]: """ Execute query against the node and return all rows. @@ -469,17 +488,19 @@ class PgBin: def run_capture(self, command: List[str], env: Optional[Env] = None, - cwd: Optional[str] = None) -> None: + cwd: Optional[str] = None, + **kwargs: Any) -> None: """ Run one of the postgres binaries, with stderr and stdout redirected to a file. - This is just like `run`, but for chatty programs. + This is just like `run`, but for chatty programs. Returns basepath for files + with captured output. """ self._fixpath(command) print('Running command "{}"'.format(' '.join(command))) env = self._build_env(env) - subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True) + return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs) @zenfixture @@ -838,6 +859,27 @@ class WalAcceptor: pass # pidfile might be obsolete return self + def append_logical_message(self, tenant_id: str, timeline_id: str, request: Dict[str, Any]) -> Dict[str, Any]: + """ + Send JSON_CTRL query to append LogicalMessage to WAL and modify + safekeeper state. It will construct LogicalMessage from provided + prefix and message, and then will write it to WAL. + """ + + # "replication=0" hacks psycopg not to send additional queries + # on startup, see https://github.com/psycopg/psycopg2/pull/482 + connstr = f"host=localhost port={self.port} replication=0 options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" + + with closing(psycopg2.connect(connstr)) as conn: + # server doesn't support transactions + conn.autocommit = True + with conn.cursor() as cur: + request_json = json.dumps(request) + print(f"JSON_CTRL request on port {self.port}: {request_json}") + cur.execute("JSON_CTRL " + request_json) + all = cur.fetchall() + print(f"JSON_CTRL response: {all[0][0]}") + return json.loads(all[0][0]) class WalAcceptorFactory: """ An object representing multiple running wal acceptors. """ diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 24e154a5bb..95f1ea5e4a 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -4,19 +4,6 @@ from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") -def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str): - path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid) - - totalbytes = 0 - for root, dirs, files in os.walk(path): - for name in files: - totalbytes += os.path.getsize(os.path.join(root, name)) - - if 'wal' in dirs: - dirs.remove('wal') # don't visit 'wal' subdirectory - - return totalbytes - # # Run bulk INSERT test. # @@ -25,6 +12,7 @@ def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str): # 1. Time to INSERT 5 million rows # 2. Disk writes # 3. Disk space used +# 4. Peak memory usage # def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str): # Create a branch for us @@ -55,6 +43,9 @@ def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg # time and I/O pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") + # Record peak memory usage + zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB') + # Report disk space used by the repository - timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline) + timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline) zenbenchmark.record('size', timeline_size / (1024*1024), 'MB') diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py new file mode 100644 index 0000000000..e1de1dd014 --- /dev/null +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -0,0 +1,58 @@ +import timeit +import pytest + +from fixtures.zenith_fixtures import ( + TenantFactory, + ZenithCli, + PostgresFactory, +) + +pytest_plugins = ("fixtures.benchmark_fixture") + +# Run bulk tenant creation test. +# +# Collects metrics: +# +# 1. Time to create {1,10,50} tenants +# 2. Average creation time per tenant + + +@pytest.mark.parametrize('tenants_count', [1, 5, 10]) +@pytest.mark.parametrize('use_wal_acceptors', ['with_wa', 'without_wa']) +def test_bulk_tenant_create( + zenith_cli: ZenithCli, + tenant_factory: TenantFactory, + postgres: PostgresFactory, + wa_factory, + use_wal_acceptors: str, + tenants_count: int, + zenbenchmark, +): + """Measure tenant creation time (with and without wal acceptors)""" + + time_slices = [] + + for i in range(tenants_count): + start = timeit.default_timer() + + tenant = tenant_factory.create() + zenith_cli.run([ + "branch", f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", "main", + f"--tenantid={tenant}" + ]) + + if use_wal_acceptors == 'with_wa': + wa_factory.start_n_new(3) + + pg_tenant = postgres.create_start( + f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", + tenant, + wal_acceptors=wa_factory.get_connstrs() if use_wal_acceptors == 'with_wa' else None, + ) + + end = timeit.default_timer() + time_slices.append(end - start) + + pg_tenant.stop() + + zenbenchmark.record('tenant_creation_time', sum(time_slices) / len(time_slices), 's') diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 7e0f19bec8..18db78f12a 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -4,19 +4,6 @@ from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") -def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str): - path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid) - - totalbytes = 0 - for root, dirs, files in os.walk(path): - for name in files: - totalbytes += os.path.getsize(os.path.join(root, name)) - - if 'wal' in dirs: - dirs.remove('wal') # don't visit 'wal' subdirectory - - return totalbytes - # # Run a very short pgbench test. # @@ -64,5 +51,5 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") # Report disk space used by the repository - timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline) + timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline) zenbenchmark.record('size', timeline_size / (1024*1024), 'MB') diff --git a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py new file mode 100644 index 0000000000..09310c702b --- /dev/null +++ b/test_runner/performance/test_write_amplification.py @@ -0,0 +1,74 @@ +# Demonstrate Write Amplification with naive oldest-first layer checkpointing +# algorithm. +# +# In each iteration of the test, we create a new table that's slightly under 10 +# MB in size (10 MB is the current "segment size" used by the page server). Then +# we make a tiny update to all the tables already created. This creates a WAL +# pattern where you have a lot of updates on one segment (the newly created +# one), alternating with a small updates on all relations. This is the worst +# case scenario for the naive checkpointing policy where we write out the layers +# in LSN order, writing the oldest layer first. That creates a new 10 MB image +# layer to be created for each of those small updates. This is the Write +# Amplification problem at its finest. +import os +from contextlib import closing +from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver + +pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") + +def test_write_amplification(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str): + # Create a branch for us + zenith_cli.run(["branch", "test_write_amplification", "empty"]) + + pg = postgres.create_start('test_write_amplification') + print("postgres is running on 'test_write_amplification' branch") + + # Open a connection directly to the page server that we'll use to force + # flushing the layers to disk + psconn = pageserver.connect(); + pscur = psconn.cursor() + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + # Get the timeline ID of our branch. We need it for the 'do_gc' command + cur.execute("SHOW zenith.zenith_timeline") + timeline = cur.fetchone()[0] + + with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'): + with zenbenchmark.record_duration('run'): + + # NOTE: Because each iteration updates every table already created, + # the runtime and write amplification is O(n^2), where n is the + # number of iterations. + for i in range(25): + cur.execute(f''' + CREATE TABLE tbl{i} AS + SELECT g as i, 'long string to consume some space' || g as t + FROM generate_series(1, 100000) g + ''') + cur.execute(f"create index on tbl{i} (i);") + for j in range(1, i): + cur.execute(f"delete from tbl{j} where i = {i}") + + # Force checkpointing. As of this writing, we don't have + # a back-pressure mechanism, and the page server cannot + # keep up digesting and checkpointing the WAL at the + # rate that it is generated. If we don't force a + # checkpoint, the WAL will just accumulate in memory + # until you hit OOM error. So in effect, we use much + # more memory to hold the incoming WAL, and write them + # out in larger batches than we'd really want. Using + # more memory hides the write amplification problem this + # test tries to demonstrate. + # + # The write amplification problem is real, and using + # more memory isn't the right solution. We could + # demonstrate the effect also by generating the WAL + # slower, adding some delays in this loop. But forcing + # the the checkpointing and GC makes the test go faster, + # with the same total I/O effect. + pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") + + # Report disk space used by the repository + timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline) + zenbenchmark.record('size', timeline_size / (1024*1024), 'MB') diff --git a/vendor/postgres b/vendor/postgres index 53abd9296e..324a67700b 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 53abd9296e62cc894618bdcf46f55dc044708d9c +Subproject commit 324a67700b92460e52ece9e719d1b57b9e63e3b6 diff --git a/walkeeper/Cargo.toml b/walkeeper/Cargo.toml index b28f392e35..16790ca214 100644 --- a/walkeeper/Cargo.toml +++ b/walkeeper/Cargo.toml @@ -13,6 +13,7 @@ bytes = "1.0.1" byteorder = "1.4.3" fs2 = "0.4.3" lazy_static = "1.4.0" +serde_json = "1" log = "0.4.14" clap = "2.33.0" daemonize = "0.4.1" diff --git a/walkeeper/README b/walkeeper/README index db8deda337..6c5a69e926 100644 --- a/walkeeper/README +++ b/walkeeper/README @@ -76,6 +76,43 @@ safekeepers. See README_PROTO.md for a more detailed desription of the consensus protocol. spec/ contains TLA+ specification of it. +# Q&A + +Q: Why have a separate service instead of connecting Page Server directly to a + primary PostgreSQL node? +A: Page Server is a single server which can be lost. As our primary + fault-tolerant storage is S3, we do not want to wait for it before + committing a transaction. The WAL service acts as a temporary fault-tolerant + storage for recent data before it gets to the Page Server and then finally + to S3. Whenever WALs and pages are committed to S3, WAL's storage can be + trimmed. + +Q: What if the compute node evicts a page, needs it back, but the page is yet + to reach the Page Server? +A: If the compute node has evicted a page, all changes from that page are + already committed, i.e. they are saved on majority of WAL safekeepers. These + WAL records will eventually reach the Page Server. The Page Server notes + that the compute note requests pages with a very recent LSN and will not + respond to the compute node until it a corresponding WAL is received from WAL + safekeepers. + +Q: How long may Page Server wait for? +A: Not too long, hopefully. If a page is evicted, it probably was not used for + a while, so the WAL service have had enough time to push changes to the Page + Server. There may be issues if there is no backpressure and compute node with + WAL service run ahead of Page Server, though. + There is no backpressure right now, so you may even see some spurious + timeouts in tests. + +Q: How do WAL safekeepers communicate with each other? +A: They may only send each other messages via the compute node, they never + communicate directly with each other. + +Q: Why have a consensus algorithm if there is only a single compute node? +A: Actually there may be moments with multiple PostgreSQL nodes running at the + same time. E.g. we are bringing one up and one down. We would like to avoid + simultaneous writes from different nodes, so there should be a consensus on + who is the primary node. # Terminology diff --git a/walkeeper/src/json_ctrl.rs b/walkeeper/src/json_ctrl.rs new file mode 100644 index 0000000000..e37b34fa95 --- /dev/null +++ b/walkeeper/src/json_ctrl.rs @@ -0,0 +1,249 @@ +//! +//! This module implements JSON_CTRL protocol, which allows exchange +//! JSON messages over psql for testing purposes. +//! +//! Currently supports AppendLogicalMessage, which is used for WAL +//! modifications in tests. +//! + +use anyhow::{anyhow, Result}; +use bytes::{BufMut, Bytes, BytesMut}; +use crc32c::crc32c_append; +use log::*; +use serde::{Deserialize, Serialize}; + +use crate::safekeeper::{AcceptorProposerMessage, AppendResponse}; +use crate::safekeeper::{ + AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerGreeting, +}; +use crate::safekeeper::{SafeKeeperState, Term}; +use crate::send_wal::SendWalHandler; +use crate::timeline::TimelineTools; +use postgres_ffi::pg_constants; +use postgres_ffi::xlog_utils; +use postgres_ffi::{uint32, uint64, Oid, XLogRecord}; +use zenith_utils::lsn::Lsn; +use zenith_utils::postgres_backend::PostgresBackend; +use zenith_utils::pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; + +#[derive(Serialize, Deserialize, Debug)] +struct AppendLogicalMessage { + // prefix and message to build LogicalMessage + lm_prefix: String, + lm_message: String, + + // if true, commit_lsn will match flush_lsn after append + set_commit_lsn: bool, + + // fields from AppendRequestHeader + term: Term, + epoch_start_lsn: Lsn, + begin_lsn: Lsn, + truncate_lsn: Lsn, +} + +#[derive(Serialize, Deserialize)] +struct AppendResult { + // safekeeper state after append + state: SafeKeeperState, + // info about new record in the WAL + inserted_wal: InsertedWAL, +} + +/// Handles command to craft logical message WAL record with given +/// content, and then append it with specified term and lsn. This +/// function is used to test safekeepers in different scenarios. +pub fn handle_json_ctrl( + swh: &mut SendWalHandler, + pgb: &mut PostgresBackend, + cmd: &Bytes, +) -> Result<()> { + let cmd = cmd + .strip_prefix(b"JSON_CTRL") + .ok_or_else(|| anyhow!("invalid prefix"))?; + // trim zeroes in the end + let cmd = cmd.strip_suffix(&[0u8]).unwrap_or(cmd); + + let append_request: AppendLogicalMessage = serde_json::from_slice(cmd)?; + info!("JSON_CTRL request: {:?}", append_request); + + // need to init safekeeper state before AppendRequest + prepare_safekeeper(swh)?; + + let inserted_wal = append_logical_message(swh, append_request)?; + let response = AppendResult { + state: swh.timeline.get().get_info(), + inserted_wal, + }; + let response_data = serde_json::to_vec(&response)?; + + pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor { + name: b"json", + typoid: TEXT_OID, + typlen: -1, + ..Default::default() + }]))? + .write_message_noflush(&BeMessage::DataRow(&[Some(&response_data)]))? + .write_message(&BeMessage::CommandComplete(b"JSON_CTRL"))?; + Ok(()) +} + +/// Prepare safekeeper to process append requests without crashes, +/// by sending ProposerGreeting with default server.wal_seg_size. +fn prepare_safekeeper(swh: &mut SendWalHandler) -> Result<()> { + let greeting_request = ProposerAcceptorMessage::Greeting(ProposerGreeting { + protocol_version: 1, // current protocol + pg_version: 0, // unknown + proposer_id: [0u8; 16], + system_id: 0, + ztli: swh.timelineid.unwrap(), + tenant_id: swh.tenantid.unwrap(), + tli: 0, + wal_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32, // 16MB, default for tests + }); + + let response = swh.timeline.get().process_msg(&greeting_request)?; + match response { + AcceptorProposerMessage::Greeting(_) => Ok(()), + _ => anyhow::bail!("not GreetingResponse"), + } +} + +#[derive(Serialize, Deserialize)] +struct InsertedWAL { + begin_lsn: Lsn, + end_lsn: Lsn, + append_response: AppendResponse, +} + +/// Extend local WAL with new LogicalMessage record. To do that, +/// create AppendRequest with new WAL and pass it to safekeeper. +fn append_logical_message( + swh: &mut SendWalHandler, + msg: AppendLogicalMessage, +) -> Result { + let wal_data = encode_logical_message(msg.lm_prefix, msg.lm_message); + let sk_state = swh.timeline.get().get_info(); + + let begin_lsn = msg.begin_lsn; + let end_lsn = begin_lsn + wal_data.len() as u64; + + let commit_lsn = if msg.set_commit_lsn { + end_lsn + } else { + sk_state.commit_lsn + }; + + let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest { + h: AppendRequestHeader { + term: msg.term, + epoch_start_lsn: begin_lsn, + begin_lsn, + end_lsn, + commit_lsn, + truncate_lsn: msg.truncate_lsn, + proposer_uuid: [0u8; 16], + }, + wal_data: Bytes::from(wal_data), + }); + + let response = swh.timeline.get().process_msg(&append_request)?; + + let append_response = match response { + AcceptorProposerMessage::AppendResponse(resp) => resp, + _ => anyhow::bail!("not AppendResponse"), + }; + + Ok(InsertedWAL { + begin_lsn, + end_lsn, + append_response, + }) +} + +#[repr(C)] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +struct XlLogicalMessage { + db_id: Oid, + transactional: uint32, // bool, takes 4 bytes due to alignment in C structures + prefix_size: uint64, + message_size: uint64, +} + +impl XlLogicalMessage { + pub fn encode(&self) -> Bytes { + use zenith_utils::bin_ser::LeSer; + self.ser().unwrap().into() + } +} + +/// Create new WAL record for non-transactional logical message. +/// Used for creating artificial WAL for tests, as LogicalMessage +/// record is basically no-op. +fn encode_logical_message(prefix: String, message: String) -> Vec { + let mut prefix_bytes = BytesMut::with_capacity(prefix.len() + 1); + prefix_bytes.put(prefix.as_bytes()); + prefix_bytes.put_u8(0); + + let message_bytes = message.as_bytes(); + + let logical_message = XlLogicalMessage { + db_id: 0, + transactional: 0, + prefix_size: prefix_bytes.len() as u64, + message_size: message_bytes.len() as u64, + }; + + let mainrdata = logical_message.encode(); + let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len(); + // only short mainrdata is supported for now + assert!(mainrdata_len <= 255); + let mainrdata_len = mainrdata_len as u8; + + let mut data: Vec = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len]; + data.extend_from_slice(&mainrdata); + data.extend_from_slice(&prefix_bytes); + data.extend_from_slice(message_bytes); + + let total_len = xlog_utils::XLOG_SIZE_OF_XLOG_RECORD + data.len(); + + let mut header = XLogRecord { + xl_tot_len: total_len as u32, + xl_xid: 0, + xl_prev: 0, + xl_info: 0, + xl_rmid: 21, + __bindgen_padding_0: [0u8; 2usize], + xl_crc: 0, // crc will be calculated later + }; + + let header_bytes = header.encode(); + let crc = crc32c_append(0, &data); + let crc = crc32c_append(crc, &header_bytes[0..xlog_utils::XLOG_RECORD_CRC_OFFS]); + header.xl_crc = crc; + + let mut wal: Vec = Vec::new(); + wal.extend_from_slice(&header.encode()); + wal.extend_from_slice(&data); + + // WAL start position must be aligned at 8 bytes, + // this will add padding for the next WAL record. + const PADDING: usize = 8; + let padding_rem = wal.len() % PADDING; + if padding_rem != 0 { + wal.resize(wal.len() + PADDING - padding_rem, 0); + } + + wal +} + +#[test] +fn test_encode_logical_message() { + let expected = [ + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102, + 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, + ]; + let actual = encode_logical_message("prefix".to_string(), "message".to_string()); + assert_eq!(expected, actual[..]); +} diff --git a/walkeeper/src/lib.rs b/walkeeper/src/lib.rs index 1f49738950..fb04459c47 100644 --- a/walkeeper/src/lib.rs +++ b/walkeeper/src/lib.rs @@ -2,6 +2,7 @@ use std::path::PathBuf; use std::time::Duration; +pub mod json_ctrl; pub mod receive_wal; pub mod replication; pub mod s3_offload; diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs index 5755294a71..95f0e9e0c2 100644 --- a/walkeeper/src/safekeeper.rs +++ b/walkeeper/src/safekeeper.rs @@ -28,7 +28,7 @@ const SK_PROTOCOL_VERSION: u32 = 1; const UNKNOWN_SERVER_VERSION: u32 = 0; /// Consensus logical timestamp. -type Term = u64; +pub type Term = u64; /// Unique id of proposer. Not needed for correctness, used for monitoring. type PgUuid = [u8; 16]; @@ -111,7 +111,7 @@ impl Default for SafeKeeperState { // protocol messages /// Initial Proposer -> Acceptor message -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Deserialize)] pub struct ProposerGreeting { /// proposer-acceptor protocol version pub protocol_version: u32, @@ -128,19 +128,19 @@ pub struct ProposerGreeting { /// Acceptor -> Proposer initial response: the highest term known to me /// (acceptor voted for). -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize)] pub struct AcceptorGreeting { term: u64, } /// Vote request sent from proposer to safekeepers -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Deserialize)] pub struct VoteRequest { term: Term, } /// Vote itself, sent from safekeeper to proposer -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize)] pub struct VoteResponse { term: Term, // not really needed, just a sanity check vote_given: u64, // fixme u64 due to padding @@ -152,26 +152,26 @@ pub struct VoteResponse { /// Request with WAL message sent from proposer to safekeeper. Along the way it /// announces 1) successful election (with epoch_start_lsn); 2) commit_lsn. -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug)] pub struct AppendRequest { - h: AppendRequestHeader, - wal_data: Bytes, + pub h: AppendRequestHeader, + pub wal_data: Bytes, } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Deserialize)] pub struct AppendRequestHeader { - term: Term, + pub term: Term, // LSN since the proposer appends WAL; determines epoch switch point. - epoch_start_lsn: Lsn, + pub epoch_start_lsn: Lsn, /// start position of message in WAL - begin_lsn: Lsn, + pub begin_lsn: Lsn, /// end position of message in WAL - end_lsn: Lsn, + pub end_lsn: Lsn, /// LSN committed by quorum of safekeepers - commit_lsn: Lsn, + pub commit_lsn: Lsn, /// minimal LSN which may be needed by proposer to perform recovery of some safekeeper - truncate_lsn: Lsn, + pub truncate_lsn: Lsn, // only for logging/debugging - proposer_uuid: PgUuid, + pub proposer_uuid: PgUuid, } /// Report safekeeper state to proposer @@ -277,8 +277,8 @@ impl AcceptorProposerMessage { pub trait Storage { /// Persist safekeeper state on disk, optionally syncing it. fn persist(&mut self, s: &SafeKeeperState, sync: bool) -> Result<()>; - /// Write piece of wal in buf to disk. - fn write_wal(&mut self, s: &SafeKeeperState, startpos: Lsn, buf: &[u8]) -> Result<()>; + /// Write piece of wal in buf to disk and sync it. + fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()>; } /// SafeKeeper which consumes events (messages from compute) and provides @@ -426,7 +426,7 @@ where let mut last_rec_lsn = Lsn(0); if !msg.wal_data.is_empty() { self.storage - .write_wal(&self.s, msg.h.begin_lsn, &msg.wal_data)?; + .write_wal(&self.s.server, msg.h.begin_lsn, &msg.wal_data)?; // figure out last record's end lsn for reporting (if we got the // whole record) @@ -550,7 +550,7 @@ mod tests { Ok(()) } - fn write_wal(&mut self, _s: &SafeKeeperState, _startpos: Lsn, _buf: &[u8]) -> Result<()> { + fn write_wal(&mut self, _server: &ServerInfo, _startpos: Lsn, _buf: &[u8]) -> Result<()> { Ok(()) } } diff --git a/walkeeper/src/send_wal.rs b/walkeeper/src/send_wal.rs index 3a6f770c7a..e81b6c5eac 100644 --- a/walkeeper/src/send_wal.rs +++ b/walkeeper/src/send_wal.rs @@ -2,6 +2,7 @@ //! pageserver/any other consumer. //! +use crate::json_ctrl::handle_json_ctrl; use crate::receive_wal::ReceiveWalConn; use crate::replication::ReplicationConn; use crate::timeline::{Timeline, TimelineTools}; @@ -49,9 +50,11 @@ impl postgres_backend::Handler for SendWalHandler { } fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: Bytes) -> Result<()> { - // START_WAL_PUSH is the only command that initializes the timeline + // START_WAL_PUSH is the only command that initializes the timeline in production. + // There is also JSON_CTRL command, which should initialize the timeline for testing. if self.timeline.is_none() { - if query_string.starts_with(b"START_WAL_PUSH") { + if query_string.starts_with(b"START_WAL_PUSH") || query_string.starts_with(b"JSON_CTRL") + { self.timeline.set( &self.conf, self.tenantid.unwrap(), @@ -76,6 +79,9 @@ impl postgres_backend::Handler for SendWalHandler { } else if query_string.starts_with(b"START_WAL_PUSH") { ReceiveWalConn::new(pgb)?.run(self)?; Ok(()) + } else if query_string.starts_with(b"JSON_CTRL") { + handle_json_ctrl(self, pgb, &query_string)?; + Ok(()) } else { bail!("Unexpected command {:?}", query_string); } diff --git a/walkeeper/src/timeline.rs b/walkeeper/src/timeline.rs index 12d17f5f43..d2a885e1ce 100644 --- a/walkeeper/src/timeline.rs +++ b/walkeeper/src/timeline.rs @@ -18,8 +18,8 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId}; use crate::replication::{HotStandbyFeedback, END_REPLICATION_MARKER}; use crate::safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, Storage, - SK_FORMAT_VERSION, SK_MAGIC, + AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, ServerInfo, + Storage, SK_FORMAT_VERSION, SK_MAGIC, }; use crate::WalAcceptorConf; use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ}; @@ -61,7 +61,7 @@ struct SharedState { sk: SafeKeeper, /// For receiving-sending wal cooperation /// quorum commit LSN we've notified walsenders about - commit_lsn: Lsn, + notified_commit_lsn: Lsn, /// State of replicas replicas: Vec>, } @@ -127,7 +127,7 @@ impl SharedState { }; Ok(Self { - commit_lsn: Lsn(0), + notified_commit_lsn: Lsn(0), sk: SafeKeeper::new(Lsn(flush_lsn), tli, storage, state), replicas: Vec::new(), }) @@ -230,7 +230,7 @@ impl Timeline { pub fn wait_for_lsn(&self, lsn: Lsn) -> Option { let mut shared_state = self.mutex.lock().unwrap(); loop { - let commit_lsn = shared_state.commit_lsn; + let commit_lsn = shared_state.notified_commit_lsn; // This must be `>`, not `>=`. if commit_lsn > lsn { return Some(commit_lsn); @@ -249,8 +249,8 @@ impl Timeline { // Notify caught-up WAL senders about new WAL data received pub fn notify_wal_senders(&self, commit_lsn: Lsn) { let mut shared_state = self.mutex.lock().unwrap(); - if shared_state.commit_lsn < commit_lsn { - shared_state.commit_lsn = commit_lsn; + if shared_state.notified_commit_lsn < commit_lsn { + shared_state.notified_commit_lsn = commit_lsn; self.cond.notify_all(); } } @@ -389,14 +389,14 @@ impl Storage for FileStorage { Ok(()) } - fn write_wal(&mut self, s: &SafeKeeperState, startpos: Lsn, buf: &[u8]) -> Result<()> { + fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()> { let mut bytes_left: usize = buf.len(); let mut bytes_written: usize = 0; let mut partial; let mut start_pos = startpos; const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; - let wal_seg_size = s.server.wal_seg_size as usize; - let ztli = s.server.ztli; + let wal_seg_size = server.wal_seg_size as usize; + let ztli = server.ztli; /* Extract WAL location for this block */ let mut xlogoff = start_pos.segment_offset(wal_seg_size) as usize; @@ -417,7 +417,7 @@ impl Storage for FileStorage { /* Open file */ let segno = start_pos.segment_number(wal_seg_size); // note: we basically don't support changing pg timeline - let wal_file_name = XLogFileName(s.server.tli, segno, wal_seg_size); + let wal_file_name = XLogFileName(server.tli, segno, wal_seg_size); let wal_file_path = self .conf .data_dir diff --git a/zenith_metrics/Cargo.toml b/zenith_metrics/Cargo.toml index 79c59b5774..062ebc1c9c 100644 --- a/zenith_metrics/Cargo.toml +++ b/zenith_metrics/Cargo.toml @@ -7,3 +7,4 @@ edition = "2018" prometheus = {version = "0.12", default_features=false} # removes protobuf dependency libc = "0.2" lazy_static = "1.4" +once_cell = "1.8.0" diff --git a/zenith_metrics/src/lib.rs b/zenith_metrics/src/lib.rs index 0d1a60e978..e3c3c81ee7 100644 --- a/zenith_metrics/src/lib.rs +++ b/zenith_metrics/src/lib.rs @@ -3,6 +3,7 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. use lazy_static::lazy_static; +use once_cell::race::OnceBox; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_histogram, Histogram}; pub use prometheus::{register_histogram_vec, HistogramVec}; @@ -20,17 +21,55 @@ pub use wrappers::{CountedReader, CountedWriter}; /// Metrics gathering is a relatively simple and standalone operation, so /// it might be fine to do it this way to keep things simple. pub fn gather() -> Vec { - update_io_metrics(); + update_rusage_metrics(); prometheus::gather() } +static COMMON_METRICS_PREFIX: OnceBox<&str> = OnceBox::new(); + +/// Sets a prefix which will be used for all common metrics, typically a service +/// name like 'pageserver'. Should be executed exactly once in the beginning of +/// any executable which uses common metrics. +pub fn set_common_metrics_prefix(prefix: &'static str) { + // Not unwrap() because metrics may be initialized after multiple threads have been started. + COMMON_METRICS_PREFIX + .set(prefix.into()) + .unwrap_or_else(|_| { + eprintln!( + "set_common_metrics_prefix() was called second time with '{}', exiting", + prefix + ); + std::process::exit(1); + }); +} + +/// Prepends a prefix to a common metric name so they are distinguished between +/// different services, see https://github.com/zenithdb/zenith/pull/681 +/// A call to set_common_metrics_prefix() is necessary prior to calling this. +pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String { + // Not unwrap() because metrics may be initialized after multiple threads have been started. + format!( + "{}_{}", + COMMON_METRICS_PREFIX.get().unwrap_or_else(|| { + eprintln!("set_common_metrics_prefix() was not called, but metrics are used, exiting"); + std::process::exit(1); + }), + unprefixed_metric_name + ) +} + lazy_static! { static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!( - "pageserver_disk_io_bytes", + new_common_metric_name("disk_io_bytes"), "Bytes written and read from disk, grouped by the operation (read|write)", &["io_operation"] ) .expect("Failed to register disk i/o bytes int gauge vec"); + static ref MAXRSS_KB: IntGauge = register_int_gauge!( + new_common_metric_name("maxrss_kb"), + "Memory usage (Maximum Resident Set Size)" + ) + .expect("Failed to register maxrss_kb int gauge"); } // Records I/O stats in a "cross-platform" way. @@ -42,7 +81,7 @@ lazy_static! { // We know the size of the block, so we can determine the I/O bytes out of it. // The value might be not 100% exact, but should be fine for Prometheus metrics in this case. #[allow(clippy::unnecessary_cast)] -fn update_io_metrics() { +fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); const BYTES_IN_BLOCK: i64 = 512; @@ -52,6 +91,7 @@ fn update_io_metrics() { DISK_IO_BYTES .with_label_values(&["write"]) .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK); + MAXRSS_KB.set(rusage_stats.ru_maxrss); } fn get_rusage_stats() -> libc::rusage { diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml index f315a473af..22c1c9bab6 100644 --- a/zenith_utils/Cargo.toml +++ b/zenith_utils/Cargo.toml @@ -38,3 +38,4 @@ rustls-split = "0.2.1" hex-literal = "0.3" bytes = "1.0" webpki = "0.21" +tempfile = "3.2" diff --git a/zenith_utils/src/accum.rs b/zenith_utils/src/accum.rs new file mode 100644 index 0000000000..d3ad61e514 --- /dev/null +++ b/zenith_utils/src/accum.rs @@ -0,0 +1,33 @@ +/// A helper to "accumulate" a value similar to `Iterator::reduce`, but lets you +/// feed the accumulated values by calling the 'accum' function, instead of having an +/// iterator. +/// +/// For example, to calculate the smallest value among some integers: +/// +/// ``` +/// use zenith_utils::accum::Accum; +/// +/// let values = [1, 2, 3]; +/// +/// let mut min_value: Accum = Accum(None); +/// for new_value in &values { +/// min_value.accum(std::cmp::min, *new_value); +/// } +/// +/// assert_eq!(min_value.0.unwrap(), 1); +/// ``` +pub struct Accum(pub Option); +impl Accum { + pub fn accum(&mut self, func: F, new_value: T) + where + F: FnOnce(T, T) -> T, + { + // If there is no previous value, just store the new value. + // Otherwise call the function to decide which one to keep. + self.0 = Some(if let Some(accum) = self.0 { + func(accum, new_value) + } else { + new_value + }); + } +} diff --git a/zenith_utils/src/bin_ser.rs b/zenith_utils/src/bin_ser.rs index 0b78265dfc..063d69557d 100644 --- a/zenith_utils/src/bin_ser.rs +++ b/zenith_utils/src/bin_ser.rs @@ -94,9 +94,12 @@ pub fn le_coder() -> impl Options { /// Binary serialize/deserialize helper functions (Big Endian) /// -pub trait BeSer: Serialize + DeserializeOwned { +pub trait BeSer { /// Serialize into a byte slice - fn ser_into_slice(&self, mut b: &mut [u8]) -> Result<(), SerializeError> { + fn ser_into_slice(&self, mut b: &mut [u8]) -> Result<(), SerializeError> + where + Self: Serialize, + { // &mut [u8] implements Write, but `ser_into` needs a mutable // reference to that. So we need the slightly awkward "mutable // reference to a mutable reference. @@ -107,19 +110,28 @@ pub trait BeSer: Serialize + DeserializeOwned { /// /// This is useful for most `Write` types except `&mut [u8]`, which /// can more easily use [`ser_into_slice`](Self::ser_into_slice). - fn ser_into(&self, w: &mut W) -> Result<(), SerializeError> { + fn ser_into(&self, w: &mut W) -> Result<(), SerializeError> + where + Self: Serialize, + { be_coder().serialize_into(w, &self).map_err(|e| e.into()) } /// Serialize into a new heap-allocated buffer - fn ser(&self) -> Result, SerializeError> { + fn ser(&self) -> Result, SerializeError> + where + Self: Serialize, + { be_coder().serialize(&self).map_err(|e| e.into()) } /// Deserialize from the full contents of a byte slice /// /// See also: [`BeSer::des_prefix`] - fn des(buf: &[u8]) -> Result { + fn des(buf: &[u8]) -> Result + where + Self: DeserializeOwned, + { be_coder() .deserialize(buf) .or(Err(DeserializeError::BadInput)) @@ -131,7 +143,10 @@ pub trait BeSer: Serialize + DeserializeOwned { /// type, but does not guarantee that the entire slice is used. /// /// See also: [`BeSer::des`] - fn des_prefix(buf: &[u8]) -> Result { + fn des_prefix(buf: &[u8]) -> Result + where + Self: DeserializeOwned, + { be_coder() .allow_trailing_bytes() .deserialize(buf) @@ -139,7 +154,10 @@ pub trait BeSer: Serialize + DeserializeOwned { } /// Deserialize from a reader - fn des_from(r: &mut R) -> Result { + fn des_from(r: &mut R) -> Result + where + Self: DeserializeOwned, + { be_coder().deserialize_from(r).map_err(|e| e.into()) } @@ -147,16 +165,22 @@ pub trait BeSer: Serialize + DeserializeOwned { /// /// Note: it may be faster to serialize to a buffer and then measure the /// buffer length, than to call `serialized_size` and then `ser_into`. - fn serialized_size(&self) -> Result { + fn serialized_size(&self) -> Result + where + Self: Serialize, + { be_coder().serialized_size(self).map_err(|e| e.into()) } } /// Binary serialize/deserialize helper functions (Little Endian) /// -pub trait LeSer: Serialize + DeserializeOwned { +pub trait LeSer { /// Serialize into a byte slice - fn ser_into_slice(&self, mut b: &mut [u8]) -> Result<(), SerializeError> { + fn ser_into_slice(&self, mut b: &mut [u8]) -> Result<(), SerializeError> + where + Self: Serialize, + { // &mut [u8] implements Write, but `ser_into` needs a mutable // reference to that. So we need the slightly awkward "mutable // reference to a mutable reference. @@ -167,19 +191,28 @@ pub trait LeSer: Serialize + DeserializeOwned { /// /// This is useful for most `Write` types except `&mut [u8]`, which /// can more easily use [`ser_into_slice`](Self::ser_into_slice). - fn ser_into(&self, w: &mut W) -> Result<(), SerializeError> { + fn ser_into(&self, w: &mut W) -> Result<(), SerializeError> + where + Self: Serialize, + { le_coder().serialize_into(w, &self).map_err(|e| e.into()) } /// Serialize into a new heap-allocated buffer - fn ser(&self) -> Result, SerializeError> { + fn ser(&self) -> Result, SerializeError> + where + Self: Serialize, + { le_coder().serialize(&self).map_err(|e| e.into()) } /// Deserialize from the full contents of a byte slice /// /// See also: [`LeSer::des_prefix`] - fn des(buf: &[u8]) -> Result { + fn des(buf: &[u8]) -> Result + where + Self: DeserializeOwned, + { le_coder() .deserialize(buf) .or(Err(DeserializeError::BadInput)) @@ -191,7 +224,10 @@ pub trait LeSer: Serialize + DeserializeOwned { /// type, but does not guarantee that the entire slice is used. /// /// See also: [`LeSer::des`] - fn des_prefix(buf: &[u8]) -> Result { + fn des_prefix(buf: &[u8]) -> Result + where + Self: DeserializeOwned, + { le_coder() .allow_trailing_bytes() .deserialize(buf) @@ -199,7 +235,10 @@ pub trait LeSer: Serialize + DeserializeOwned { } /// Deserialize from a reader - fn des_from(r: &mut R) -> Result { + fn des_from(r: &mut R) -> Result + where + Self: DeserializeOwned, + { le_coder().deserialize_from(r).map_err(|e| e.into()) } @@ -207,14 +246,18 @@ pub trait LeSer: Serialize + DeserializeOwned { /// /// Note: it may be faster to serialize to a buffer and then measure the /// buffer length, than to call `serialized_size` and then `ser_into`. - fn serialized_size(&self) -> Result { + fn serialized_size(&self) -> Result + where + Self: Serialize, + { le_coder().serialized_size(self).map_err(|e| e.into()) } } -impl BeSer for T where T: Serialize + DeserializeOwned {} - -impl LeSer for T where T: Serialize + DeserializeOwned {} +// Because usage of `BeSer` or `LeSer` can be done with *either* a Serialize or +// DeserializeOwned implementation, the blanket implementation has to be for every type. +impl BeSer for T {} +impl LeSer for T {} #[cfg(test)] mod tests { diff --git a/zenith_utils/src/crashsafe_dir.rs b/zenith_utils/src/crashsafe_dir.rs new file mode 100644 index 0000000000..fcb4ddcce3 --- /dev/null +++ b/zenith_utils/src/crashsafe_dir.rs @@ -0,0 +1,125 @@ +use std::{ + fs::{self, File}, + io, + path::Path, +}; + +/// Similar to [`std::fs::create_dir`], except we fsync the +/// created directory and its parent. +pub fn create_dir(path: impl AsRef) -> io::Result<()> { + let path = path.as_ref(); + + fs::create_dir(path)?; + File::open(path)?.sync_all()?; + + if let Some(parent) = path.parent() { + File::open(parent)?.sync_all() + } else { + Err(io::Error::new( + io::ErrorKind::InvalidInput, + "can't find parent", + )) + } +} + +/// Similar to [`std::fs::create_dir_all`], except we fsync all +/// newly created directories and the pre-existing parent. +pub fn create_dir_all(path: impl AsRef) -> io::Result<()> { + let mut path = path.as_ref(); + + let mut dirs_to_create = Vec::new(); + + // Figure out which directories we need to create. + loop { + match path.metadata() { + Ok(metadata) if metadata.is_dir() => break, + Ok(_) => { + return Err(io::Error::new( + io::ErrorKind::AlreadyExists, + format!("non-directory found in path: {:?}", path), + )); + } + Err(ref e) if e.kind() == io::ErrorKind::NotFound => {} + Err(e) => return Err(e), + } + + dirs_to_create.push(path); + + match path.parent() { + Some(parent) => path = parent, + None => { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "can't find parent", + )) + } + } + } + + // Create directories from parent to child. + for &path in dirs_to_create.iter().rev() { + fs::create_dir(path)?; + } + + // Fsync the created directories from child to parent. + for &path in dirs_to_create.iter() { + File::open(path)?.sync_all()?; + } + + // If we created any new directories, fsync the parent. + if !dirs_to_create.is_empty() { + File::open(path)?.sync_all()?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use tempfile::tempdir; + + use super::*; + + #[test] + fn test_create_dir_fsyncd() { + let dir = tempdir().unwrap(); + + let existing_dir_path = dir.path(); + let err = create_dir(existing_dir_path).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::AlreadyExists); + + let child_dir = existing_dir_path.join("child"); + create_dir(child_dir).unwrap(); + + let nested_child_dir = existing_dir_path.join("child1").join("child2"); + let err = create_dir(nested_child_dir).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::NotFound); + } + + #[test] + fn test_create_dir_all_fsyncd() { + let dir = tempdir().unwrap(); + + let existing_dir_path = dir.path(); + create_dir_all(existing_dir_path).unwrap(); + + let child_dir = existing_dir_path.join("child"); + assert!(!child_dir.exists()); + create_dir_all(&child_dir).unwrap(); + assert!(child_dir.exists()); + + let nested_child_dir = existing_dir_path.join("child1").join("child2"); + assert!(!nested_child_dir.exists()); + create_dir_all(&nested_child_dir).unwrap(); + assert!(nested_child_dir.exists()); + + let file_path = existing_dir_path.join("file"); + std::fs::write(&file_path, b"").unwrap(); + + let err = create_dir_all(&file_path).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::AlreadyExists); + + let invalid_dir_path = file_path.join("folder"); + create_dir_all(&invalid_dir_path).unwrap_err(); + } +} diff --git a/zenith_utils/src/http/endpoint.rs b/zenith_utils/src/http/endpoint.rs index 03802f61ff..3c5b53b77a 100644 --- a/zenith_utils/src/http/endpoint.rs +++ b/zenith_utils/src/http/endpoint.rs @@ -9,14 +9,14 @@ use routerify::ext::RequestExt; use routerify::RequestInfo; use routerify::{Middleware, Router, RouterBuilder, RouterService}; use std::net::TcpListener; -use zenith_metrics::{register_int_counter, IntCounter}; +use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter}; use zenith_metrics::{Encoder, TextEncoder}; use super::error::ApiError; lazy_static! { static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!( - "pageserver_serve_metrics_count", + new_common_metric_name("serve_metrics_count"), "Number of metric requests made" ) .expect("failed to define a metric"); diff --git a/zenith_utils/src/lib.rs b/zenith_utils/src/lib.rs index 302069494c..ca26be5df2 100644 --- a/zenith_utils/src/lib.rs +++ b/zenith_utils/src/lib.rs @@ -18,6 +18,9 @@ pub mod pq_proto; // dealing with connstring parsing and handy access to it's parts pub mod connstring; +// helper functions for creating and fsyncing directories/trees +pub mod crashsafe_dir; + // common authentication routines pub mod auth; @@ -31,3 +34,6 @@ pub mod sock_split; // common log initialisation routine pub mod logging; + +// Misc +pub mod accum; diff --git a/zenith_utils/src/lsn.rs b/zenith_utils/src/lsn.rs index 1afd8fb962..ffe49e9f38 100644 --- a/zenith_utils/src/lsn.rs +++ b/zenith_utils/src/lsn.rs @@ -192,9 +192,7 @@ impl AtomicLsn { /// This operation will panic on overflow. pub fn fetch_add(&self, val: u64) -> Lsn { let prev = self.inner.fetch_add(val, Ordering::AcqRel); - if prev.checked_add(val).is_none() { - panic!("AtomicLsn overflow"); - } + assert!(prev.checked_add(val).is_some(), "AtomicLsn overflow"); Lsn(prev) } diff --git a/zenith_utils/src/postgres_backend.rs b/zenith_utils/src/postgres_backend.rs index f5fe2319f3..b2e0a1a525 100644 --- a/zenith_utils/src/postgres_backend.rs +++ b/zenith_utils/src/postgres_backend.rs @@ -377,7 +377,11 @@ impl PostgresBackend { // xxx distinguish fatal and recoverable errors? if let Err(e) = handler.process_query(self, m.body.clone()) { let errmsg = format!("{}", e); - warn!("query handler for {:?} failed: {}", m.body, errmsg); + // ":#" uses the alternate formatting style, which makes anyhow display the + // full cause of the error, not just the top-level context. We don't want to + // send that in the ErrorResponse though, because it's not relevant to the + // compute node logs. + warn!("query handler for {:?} failed: {:#}", m.body, e); self.write_message_noflush(&BeMessage::ErrorResponse(errmsg))?; } self.write_message(&BeMessage::ReadyForQuery)?; diff --git a/zenith_utils/src/pq_proto.rs b/zenith_utils/src/pq_proto.rs index c536c806c4..12e08737bf 100644 --- a/zenith_utils/src/pq_proto.rs +++ b/zenith_utils/src/pq_proto.rs @@ -660,7 +660,7 @@ impl<'a> BeMessage<'a> { } BeMessage::EncryptionResponse(should_negotiate) => { - let response = if *should_negotiate { b'Y' } else { b'N' }; + let response = if *should_negotiate { b'S' } else { b'N' }; buf.put_u8(response); } diff --git a/zenith_utils/src/zid.rs b/zenith_utils/src/zid.rs index ee781ea9ef..b8c0715eb4 100644 --- a/zenith_utils/src/zid.rs +++ b/zenith_utils/src/zid.rs @@ -78,6 +78,10 @@ macro_rules! zid_newtype { pub fn generate() -> Self { $t(ZId::generate()) } + + pub const fn from_array(b: [u8; 16]) -> Self { + $t(ZId(b)) + } } impl FromStr for $t { diff --git a/zenith_utils/tests/bin_ser_test.rs b/zenith_utils/tests/bin_ser_test.rs index d920b430f3..ada43a1189 100644 --- a/zenith_utils/tests/bin_ser_test.rs +++ b/zenith_utils/tests/bin_ser_test.rs @@ -1,10 +1,10 @@ use bytes::{Buf, BytesMut}; use hex_literal::hex; -use serde::{Deserialize, Serialize}; +use serde::Deserialize; use std::io::Read; use zenith_utils::bin_ser::LeSer; -#[derive(Debug, PartialEq, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Deserialize)] pub struct HeaderData { magic: u16, info: u16, diff --git a/zenith_utils/tests/ssl_test.rs b/zenith_utils/tests/ssl_test.rs index 3bc5ffb790..ba0f63d6ec 100644 --- a/zenith_utils/tests/ssl_test.rs +++ b/zenith_utils/tests/ssl_test.rs @@ -43,7 +43,7 @@ fn ssl() { client_sock.write_u32::(80877103).unwrap(); let ssl_response = client_sock.read_u8().unwrap(); - assert_eq!(b'Y', ssl_response); + assert_eq!(b'S', ssl_response); let mut cfg = rustls::ClientConfig::new(); cfg.root_store.add(&CERT).unwrap();